In [9]:
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.stem.snowball import SnowballStemmer
from sklearn import metrics
#from sklearn import cross_validation
#from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.utils import resample

%matplotlib inline

In [11]:
# Training set; the first CSV column becomes the frame's index.
data = pd.read_csv("../input/train.csv", index_col=0)

In [17]:
# Widen the column display so question texts are not truncated in the preview.
pd.set_option('max_colwidth', 140)
# Rows 100..199 by position.
data.iloc[100:200]

Unnamed: 0_level_0,question_text,target
qid,Unnamed: 1_level_1,Unnamed: 2_level_1
00043c2c68e74328c456,"What do physicists, mathematicians, computer scientists and philosophers think of David Deutsch's 'Constructor Theory'?",0
00043d911af1cfbdb5f3,Why are old scriptures from eastern cultures appear lost in the current culture?,0
000441059c27001eb255,"Can I know my I.Q, even if I hate numbers?",0
00045f3b9fcb27975e26,How can I really make up my mind and get rid of my bad habits like procrastination?,0
00046512985c0996339e,Was there any relationship between Napoleon and Ali Pasha of Tepelene?,0
000467723d6f04760035,Where are presynaptic neurons found?,0
000477ab08d14b6a047d,What ways will a narcissist mother punish her child for going no contact if child goes back to contact with her?,0
000485e6dd4b149fe051,Can I start freelancing after finishing Udacity's Android basic nanodegree?,0
000488ff2dbaa802b4d9,What is the reason why we really need Bitcoin?,0
0004a41beea5f02d85ef,What are some good songs for a long journey?,0


In [3]:
# Random 100k-row subset drawn without replacement (unseeded, so it differs per run).
data_small = data.sample(n=100000, replace=False)

In [4]:
# English stop-word set, passed to the vectorizers in some experiments.
stop = stop_words.ENGLISH_STOP_WORDS
stemmer = SnowballStemmer('english')

# Stem every space-separated token of every question, then attach the result
# to a copy of `data` so the raw text stays untouched.
stemmed_questions = [
    ' '.join(map(stemmer.stem, question.split(' ')))
    for question in data['question_text']
]
data_stemmed = data.assign(question_text=stemmed_questions)

In [5]:
def data_prep(dataset, training_split, test_split, random_state=None):
    """Split a labelled question frame into train/test text and targets.

    Parameters
    ----------
    dataset : mapping/DataFrame with 'question_text' and 'target' columns.
    training_split, test_split : forwarded to ``train_test_split`` as
        ``train_size`` and ``test_size``.
    random_state : optional seed forwarded to ``train_test_split``. The default
        (None) keeps the original unseeded behaviour; pass an int to make the
        split reproducible across kernel restarts.

    Returns
    -------
    tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    X = dataset['question_text']
    Y = dataset['target']
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y,
        train_size=training_split,
        test_size=test_split,
        random_state=random_state,
    )
    return (X_train, X_test, Y_train, Y_test)


def best_fscore_threshold(precision, recall, thresholds):
    """Return ``(best_f1, threshold)`` for aligned precision/recall/threshold arrays.

    F1 is taken as 0 wherever precision + recall == 0. The previous code left
    those positions as NaN, and ``ndarray.argmax`` returns the position of a
    NaN, so the reported threshold could belong to a different point than the
    ``nanmax`` score. Ties resolve to the first maximum.
    """
    precision = np.asarray(precision, dtype=float)
    recall = np.asarray(recall, dtype=float)
    denominator = precision + recall
    fscores = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator > 0,   # skip the 0/0 points instead of emitting NaN
    )
    best = int(np.argmax(fscores))
    return fscores[best], thresholds[best]


def model_vectorize(data_used, vectorizer_type, binary_type, ngram, stop_word, model_type):
    """Vectorize text, fit a classifier, and find its best F1 decision threshold.

    Parameters
    ----------
    data_used : sequence ``(X_train, X_test, Y_train, Y_test)`` as returned by
        ``data_prep``.
    vectorizer_type : vectorizer class, called with ``binary``, ``stop_words``
        and ``ngram_range`` keyword arguments.
    binary_type, stop_word, ngram : values for those keyword arguments.
    model_type : estimator instance exposing ``fit`` and ``predict_proba``.
        It is fitted in place.

    Returns
    -------
    tuple ``(max_fscore, threshold_max)``: the highest F1 over all candidate
    thresholds on the test split and the threshold that achieves it.
    """
    X_train, X_test, Y_train, Y_test = data_used[0], data_used[1], data_used[2], data_used[3]

    vectorizer = vectorizer_type(binary = binary_type, stop_words = stop_word, ngram_range=ngram)
    # The vocabulary is learnt from the training split only.
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    model = model_type
    model.fit(X_train_vectorized, Y_train)

    # Probability of the positive class (column 1).
    scores = model.predict_proba(X_test_vectorized)[:,1]
    precision, recall, thresholds = precision_recall_curve(Y_test, scores)
    # precision/recall carry one more entry than thresholds; drop the last
    # point so all three arrays line up.
    max_fscore, threshold_max = best_fscore_threshold(precision[:-1], recall[:-1], thresholds)

    return(max_fscore, threshold_max)


def downsample(df):
    """Balance the classes by shrinking the target==0 rows to the target==1 count.

    The majority rows are drawn without replacement using a fixed seed (123).
    The result lists the sampled majority rows first, then all minority rows.
    """
    minority_rows = df[df.target == 1]
    majority_rows = df[df.target == 0]

    majority_sample = resample(
        majority_rows,
        replace=False,                  # each majority row is used at most once
        n_samples=len(minority_rows),   # same size as the minority class
        random_state=123,               # fixed seed
    )

    return pd.concat([majority_sample, minority_rows])

In [6]:
# 90/10 train/test splits of the balanced data, for raw and stemmed text.
downsample_data_regular = data_prep(downsample(data), training_split=.9, test_split=.1)
downsample_data_stemmed = data_prep(downsample(data_stemmed), training_split=.9, test_split=.1)

In [7]:
# Baseline: binary unigram+bigram counts, no stop words, logistic regression.
test = model_vectorize(
    downsample_data_regular,
    vectorizer_type=CountVectorizer,
    binary_type=True,
    ngram=(1,2),
    stop_word=None,
    model_type=LogisticRegression(),
)

In [12]:
# One-row summary of the baseline run: best F1 and the threshold that gives it.
fscores = [test[0]]
thresholds = [test[1]]
metrics = pd.DataFrame({'fscore': fscores, 'threshold': thresholds})
metrics.sort_values(by='fscore', ascending=False)

Unnamed: 0,fscore,threshold
0,0.897366,0.431035


In [13]:
# Same features as the baseline, with Bernoulli naive Bayes as the classifier.
test2 = model_vectorize(
    downsample_data_regular,
    vectorizer_type=CountVectorizer,
    binary_type=True,
    ngram=(1,2),
    stop_word=None,
    model_type=BernoulliNB(),
)

In [14]:
# One-row summary of the naive Bayes run.
fscores = [test2[0]]
thresholds = [test2[1]]
metrics = pd.DataFrame({'fscore': fscores, 'threshold': thresholds})
metrics.sort_values(by='fscore', ascending=False)

Unnamed: 0,fscore,threshold
0,0.872824,0.041563


In [16]:
# Naive Bayes variants, each changing one thing relative to test2.
# test3: stemmed text.
test3 = model_vectorize(downsample_data_stemmed, vectorizer_type=CountVectorizer, binary_type=True,
                        ngram=(1,2), stop_word=None, model_type=BernoulliNB())
# test4: n-grams up to length 3.
test4 = model_vectorize(downsample_data_regular, vectorizer_type=CountVectorizer, binary_type=True,
                        ngram=(1,3), stop_word=None, model_type=BernoulliNB())
# test5: n-grams up to length 3 plus English stop-word removal.
test5 = model_vectorize(downsample_data_regular, vectorizer_type=CountVectorizer, binary_type=True,
                        ngram=(1,3), stop_word=stop, model_type=BernoulliNB())
# test6: TF-IDF weights instead of binary counts.
# NOTE(review): the recorded output shows test6 scoring identically to test2,
# presumably because BernoulliNB binarizes its inputs by default - confirm.
test6 = model_vectorize(downsample_data_regular, vectorizer_type=TfidfVectorizer, binary_type=False,
                        ngram=(1,2), stop_word=None, model_type=BernoulliNB())

In [17]:
# Compare all six experiments; rows keep the run order, then sort by best F1.
runs = [test, test2, test3, test4, test5, test6]
fscores = [run[0] for run in runs]
thresholds = [run[1] for run in runs]
metrics = pd.DataFrame({'fscore': fscores, 'threshold': thresholds})
metrics.sort_values(by='fscore', ascending=False)

Unnamed: 0,fscore,threshold
0,0.897366,0.431035
4,0.878318,0.0045
1,0.872824,0.041563
5,0.872824,0.041563
3,0.872032,0.000451
2,0.871758,0.037112


In [27]:
# Sanity check: data_prep returns a 4-tuple (X_train, X_test, Y_train, Y_test),
# so this is the number of splits, not the number of rows.
len(downsample_data_regular)
#data.target.value_counts()
#len(downsample_data_regular[0]), len(downsample_data_regular[1]), len(downsample_data_regular[2]), len(downsample_data_regular[3])

4

In [None]:
# Grid search over every combination of dataset, vectorizer, binary flag,
# n-gram range, classifier and stop-word setting.
#
# Each option list is paired with the short tag used in the result label, and
# results and labels are produced in a single pass so they cannot drift out of
# step (previously two separate 6-deep loop nests had to be kept in sync by hand).

#create models
dataset = [downsample_data_regular, downsample_data_stemmed]
vectorizer_type = [CountVectorizer, TfidfVectorizer]
binary_type = [True, False]
ngram = [(1,2), (1,3), (1,4)]
stop_word = [None, stop]
model_type = [LogisticRegression(), BernoulliNB()]

#create labels
dataset_label = ['Downsample Full Regular Data', 'Downsample Full Stemmed Data']
vectorizer_type_label = ['CountV', 'TFIDV']
binary_type_label = ['T', 'F']
ngram_label = [(1,2), (1,3), (1,4)]
stop_word_label = ['None', 'english']
model_type_label = ['LR', 'NB']

# Axis order fixes the iteration order: the last axis (stop words) varies fastest.
grid_axes = [
    (dataset, dataset_label),
    (vectorizer_type, vectorizer_type_label),
    (binary_type, binary_type_label),
    (ngram, ngram_label),
    (model_type, model_type_label),
    (stop_word, stop_word_label),
]
# zip() would silently truncate a mismatched pair, so fail loudly instead.
assert all(len(options) == len(tags) for options, tags in grid_axes), \
    "every option list needs exactly one label per option"

model_initialize = []
labels = []
for combo in itertools.product(*(zip(options, tags) for options, tags in grid_axes)):
    (data_split, data_tag), (vec_cls, vec_tag), (is_binary, binary_tag), \
        (ngram_range, ngram_tag), (estimator, model_tag), (stop_set, stop_tag) = combo
    model_initialize.append(
        model_vectorize(data_split, vec_cls, is_binary, ngram_range, stop_set, estimator)
    )
    # Label field order: dataset, vectorizer, binary, n-gram, stop words, model.
    labels.append('%s %s %s %s %s %s' % (data_tag, vec_tag, binary_tag, ngram_tag, stop_tag, model_tag))

fscores = [result[0] for result in model_initialize]
thresholds = [result[1] for result in model_initialize]
metrics = pd.DataFrame({'label': labels,'fscore': fscores, 'threshold': thresholds})
metrics.sort_values(by = ['fscore'], ascending=False)

In [35]:
def downsample2(df):
    """Balance the classes and also return the majority rows that were left out.

    Returns
    -------
    tuple ``(df_downsampled, df_not_used)``:
        df_downsampled - sampled target==0 rows followed by all target==1 rows.
        df_not_used    - the target==0 rows that were not sampled.

    The unused rows are identified by index label.
    NOTE(review): this assumes ``df``'s index uniquely identifies each row
    (the notebook loads the data with the id column as index) - confirm.
    """
    # Separate majority and minority classes
    df_majority = df[df.target==0]
    df_minority = df[df.target==1]

    # Downsample majority class
    df_majority_downsampled = resample(df_majority,
                                     replace=False,    # sample without replacement
                                     n_samples=df_minority.shape[0],     # to match minority class
                                     random_state=123) # reproducible results

    # Combine minority class with downsampled majority class
    df_downsampled = pd.concat([df_majority_downsampled, df_minority])

    # Majority rows that did not make it into the sample. The original cell
    # stopped mid-way through a pd.merge call here, which is a syntax error.
    was_sampled = df_majority.index.isin(df_majority_downsampled.index)
    df_not_used = df_majority.loc[~was_sampled]

    return (df_downsampled, df_not_used)

In [38]:
# Unlabelled test set; the first CSV column becomes the index.
# NOTE(review): this reads from the working directory, while the training data
# is read from ../input/ - confirm the path is intended.
test_data = pd.read_csv('test.csv', index_col=0)

In [70]:
def run_model(train_data, test_data, vectorizer_type, binary_type, ngram, stop_word, model_type,
              max_threshold=.4310353):
    """Fit on labelled data and predict hard 0/1 targets for unlabelled data.

    Parameters
    ----------
    train_data : mapping/DataFrame with 'question_text' and 'target'.
    test_data : mapping/DataFrame with 'question_text'.
    vectorizer_type : vectorizer class, called with ``binary``, ``stop_words``
        and ``ngram_range`` keyword arguments.
    binary_type, stop_word, ngram : values for those keyword arguments.
    model_type : estimator instance exposing ``fit`` and ``predict_proba``.
        It is fitted in place.
    max_threshold : decision threshold. Scores less than or equal to it map to
        0, everything else to 1. The default is the value that used to be
        hard-coded here; it matches the threshold reported for the
        logistic-regression run in this notebook's recorded output (0.431035).

    Returns
    -------
    tuple ``(predictions, targets)``: positive-class probabilities, and a list
    of 0/1 labels derived from them.
    """
    X_train = train_data['question_text']
    Y_train = train_data['target']
    X_test = test_data['question_text']

    vectorizer = vectorizer_type(binary = binary_type, stop_words = stop_word, ngram_range=ngram)
    # The vocabulary is learnt from the training text only.
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    model = model_type
    model.fit(X_train_vectorized, Y_train)

    # Probability of the positive class (column 1).
    predictions = model.predict_proba(X_test_vectorized)[:,1]
    # Vectorised form of the old per-row loop: `<=` maps to 0, anything else to 1.
    targets = np.where(predictions <= max_threshold, 0, 1).tolist()

    return(predictions, targets)

In [69]:
# Final fit on the balanced training data, scored on the unlabelled test set.
# Note: this rebinds `test`, which earlier held the baseline (fscore, threshold) pair.
downsampled = downsample(data)
test = run_model(
    downsampled,
    test_data,
    vectorizer_type=CountVectorizer,
    binary_type=True,
    ngram=(1,2),
    stop_word=None,
    model_type=LogisticRegression(),
)


In [90]:
# One row per test question: its id, the positive-class probability, and the 0/1 label.
test_results = pd.DataFrame({
    'index': test_data.index,
    'prediction_percent': test[0],
    'prediction': test[1],
})

In [94]:
# Keep only the id and the hard 0/1 prediction for the submission file.
# `columns=` replaces the positional axis argument, which newer pandas rejects.
results = test_results.drop(columns='prediction_percent')
# The cell previously wrote `publish`, a name never defined in this notebook;
# `results` is the frame built on the line above.
# NOTE(review): the id column is named 'index' here - confirm the expected header.
results.to_csv('team2_submission.csv', index = False)

In [95]:
# Display the submission frame (id plus 0/1 prediction) for a visual check.
results

Unnamed: 0,index,prediction
0,00014894849d00ba98a9,0
1,000156468431f09b3cae,0
2,000227734433360e1aae,0
3,0005e06fbe3045bd2a92,1
4,00068a0f7f41f50fc399,0
5,000a2d30e3ffd70c070d,1
6,000b67672ec9622ff761,0
7,000b7fb1146d712c1105,0
8,000d665a8ddc426a1907,0
9,000df6fd2229447b2969,0


In [1]:
import string, re

In [20]:
# Demo: stripping every punctuation character merges contractions
# ("isn't" -> "isnt") instead of expanding them.
s = "This isn't right is it?"
text = s.translate(str.maketrans('', '', string.punctuation))
text

'This isnt right is it'

In [None]:
#isn't
#what's
#it's
#don't
#doesn't
#I'm

# Ordered (pattern, replacement) rules applied by clean(). Order matters:
# contractions are expanded while apostrophes are still present, symbols are
# spaced out next, and only at the end is everything that is not an ASCII
# letter replaced by a space.
_CLEAN_SUBSTITUTIONS = [
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # Was r"\0s": in a regex, \0 is an octal escape for the NUL character, so
    # that pattern never matched a literal "0s" (e.g. the "0s" in "1990s").
    (r"0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
    (r"[^a-zA-Z]", " "),
    (r"  +", " "),
]


def clean(text):
    """Normalise a question string to lower-case ASCII letters and single spaces.

    Contractions are expanded ("isn't" -> "is not", "what's" -> "what is",
    "i'm" -> "i am", ...), then every remaining non-letter character becomes a
    space and runs of spaces are collapsed. Leading/trailing spaces are not
    stripped.

    Punctuation used to be removed *before* the contraction rules ran, which
    deleted the apostrophes those rules match on, so "isn't" came out as
    "isnt". The final non-letter rule already removes all punctuation, so the
    up-front strip is simply dropped. Typographic apostrophes (U+2019) are
    mapped to "'" first so they are expanded too.

    Stemming is not applied here.
    """
    # Unify curly apostrophes, then lower-case so the rules can be case-sensitive.
    text = text.replace("\u2019", "'").lower()

    for pattern, replacement in _CLEAN_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    return text