In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
#from sklearn import cross_validation
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.feature_extraction import stop_words
from sklearn.metrics import f1_score
from nltk.stem.snowball import SnowballStemmer
from sklearn.utils import resample
from sklearn.metrics import precision_recall_curve

%matplotlib inline

In [2]:
# Load the training set; the first CSV column becomes the row index.
data = pd.read_csv("train.csv", index_col=0)

In [3]:
# Draw a 100,000-row subset without replacement (unseeded, so it differs per run).
data_small = data.sample(n=100000, replace=False)

In [4]:
# English stop-word list, used later as an optional vectorizer setting.
stop = stop_words.ENGLISH_STOP_WORDS
stemmer = SnowballStemmer('english')

# Build a stemmed variant of the data: each question is split on single
# spaces, every token is stemmed, and the tokens are re-joined with spaces.
# Work on a copy so the unstemmed frame stays available.
data_stemmed = data.copy()
data_stemmed['question_text'] = [
    ' '.join(map(stemmer.stem, text.split(' ')))
    for text in data_stemmed['question_text']
]

In [5]:
def data_prep(dataset, training_split, test_split, random_state=None):
    """Split a question dataset into train/test features and labels.

    Parameters
    ----------
    dataset : DataFrame-like
        Must provide a ``'question_text'`` column (features) and a
        ``'target'`` column (labels).
    training_split : float or int
        Forwarded to ``train_test_split`` as ``train_size``.
    test_split : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        previous behavior (a different shuffle on every call); pass an int
        to make the split reproducible across runs.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    X = dataset['question_text']
    Y = dataset['target']
    X_train, X_test, Y_train, Y_test = train_test_split(
        X,
        Y,
        train_size=training_split,
        test_size=test_split,
        random_state=random_state,
    )

    return (X_train, X_test, Y_train, Y_test)


def model_vectorize(data_used, vectorizer_type, binary_type, ngram, stop_word, model_type):
    """Vectorize text, fit a classifier, and find the F-score-optimal threshold.

    Parameters
    ----------
    data_used : sequence
        ``(X_train, X_test, Y_train, Y_test)`` in that order (the layout
        returned by ``data_prep``).
    vectorizer_type : callable
        Vectorizer class; it is instantiated with ``binary``, ``stop_words``
        and ``ngram_range`` keyword arguments.
    binary_type : bool
        Passed to the vectorizer as ``binary``.
    ngram : tuple
        Passed to the vectorizer as ``ngram_range``.
    stop_word : object
        Passed to the vectorizer as ``stop_words``.
    model_type : estimator instance
        Object exposing ``fit`` and ``predict_proba``. It is fitted in place,
        so the caller's instance is modified.

    Returns
    -------
    tuple
        ``(max_fscore, threshold_max)``: the best F-score over all candidate
        thresholds on the test split, and the threshold that achieves it.
    """
    X_train = data_used[0]
    X_test = data_used[1]
    Y_train = data_used[2]
    Y_test = data_used[3]

    vectorizer = vectorizer_type(binary=binary_type, stop_words=stop_word, ngram_range=ngram)
    # Learn the vocabulary from the training text only, then reuse it for the test text.
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    model = model_type
    model.fit(X_train_vectorized, Y_train)

    # Score of the positive class (column 1) for every test row.
    scores = model.predict_proba(X_test_vectorized)[:, 1]
    precision, recall, thresholds = precision_recall_curve(Y_test, scores)
    # precision/recall carry one trailing element with no matching threshold; drop it
    # so all three arrays line up index-for-index.
    precision, recall = precision[:-1], recall[:-1]

    # F-score = 2PR / (P + R). Where P + R == 0 the plain division yields NaN, and
    # argmax would then return the NaN position instead of the true maximum. Define
    # the F-score as 0 at those points so the maximum and its index always agree.
    denominator = precision + recall
    fscores = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator > 0,
    )

    ind_max = int(np.argmax(fscores))
    max_fscore = fscores[ind_max]
    threshold_max = thresholds[ind_max]

    return (max_fscore, threshold_max)


def downsample(df):
    """Return a class-balanced frame by shrinking the ``target == 0`` group.

    Rows with ``target == 1`` are kept in full; rows with ``target == 0``
    (treated as the majority class) are sampled without replacement down to
    the same count, using a fixed seed of 123. The result lists the sampled
    ``target == 0`` rows first, followed by all ``target == 1`` rows.
    """
    majority_rows = df.loc[df['target'] == 0]
    minority_rows = df.loc[df['target'] == 1]

    # Draw exactly as many majority rows as there are minority rows.
    majority_sample = resample(
        majority_rows,
        replace=False,
        n_samples=len(minority_rows),
        random_state=123,
    )

    return pd.concat([majority_sample, minority_rows])

In [6]:
# Balanced 90/10 train/test splits for the raw and the stemmed text.
downsample_data_regular = data_prep(downsample(data), training_split=0.9, test_split=0.1)
downsample_data_stemmed = data_prep(downsample(data_stemmed), training_split=0.9, test_split=0.1)

In [7]:
# Baseline: binary unigram+bigram counts, no stop words, logistic regression.
test = model_vectorize(
    data_used=downsample_data_regular,
    vectorizer_type=CountVectorizer,
    binary_type=True,
    ngram=(1, 2),
    stop_word=None,
    model_type=LogisticRegression(),
)

In [12]:
# One-row summary of the baseline run.
# NOTE: rebinding `metrics` here shadows the sklearn `metrics` module imported above.
fscores = [test[0]]
thresholds = [test[1]]
metrics = pd.DataFrame({'fscore': fscores, 'threshold': thresholds})
metrics.sort_values(by='fscore', ascending=False)

Unnamed: 0,fscore,threshold
0,0.897366,0.431035


In [13]:
# Same features as the baseline, but with a Bernoulli naive Bayes classifier.
test2 = model_vectorize(
    data_used=downsample_data_regular,
    vectorizer_type=CountVectorizer,
    binary_type=True,
    ngram=(1, 2),
    stop_word=None,
    model_type=BernoulliNB(),
)

In [14]:
# One-row summary of the naive Bayes run.
fscores = [test2[0]]
thresholds = [test2[1]]
metrics = pd.DataFrame({'fscore': fscores, 'threshold': thresholds})
metrics.sort_values(by='fscore', ascending=False)

Unnamed: 0,fscore,threshold
0,0.872824,0.041563


In [16]:
# Naive Bayes variants: stemmed text, wider n-grams, stop-word removal, TF-IDF.
test3 = model_vectorize(
    data_used=downsample_data_stemmed,
    vectorizer_type=CountVectorizer,
    binary_type=True,
    ngram=(1, 2),
    stop_word=None,
    model_type=BernoulliNB(),
)
test4 = model_vectorize(
    data_used=downsample_data_regular,
    vectorizer_type=CountVectorizer,
    binary_type=True,
    ngram=(1, 3),
    stop_word=None,
    model_type=BernoulliNB(),
)
test5 = model_vectorize(
    data_used=downsample_data_regular,
    vectorizer_type=CountVectorizer,
    binary_type=True,
    ngram=(1, 3),
    stop_word=stop,
    model_type=BernoulliNB(),
)
test6 = model_vectorize(
    data_used=downsample_data_regular,
    vectorizer_type=TfidfVectorizer,
    binary_type=False,
    ngram=(1, 2),
    stop_word=None,
    model_type=BernoulliNB(),
)

In [17]:
# Compare all six hand-picked runs, best F-score first.
runs = (test, test2, test3, test4, test5, test6)
fscores = [run[0] for run in runs]
thresholds = [run[1] for run in runs]
metrics = pd.DataFrame({'fscore': fscores, 'threshold': thresholds})
metrics.sort_values(by='fscore', ascending=False)

Unnamed: 0,fscore,threshold
0,0.897366,0.431035
4,0.878318,0.0045
1,0.872824,0.041563
5,0.872824,0.041563
3,0.872032,0.000451
2,0.871758,0.037112


In [27]:
# Sanity check: data_prep returns (X_train, X_test, Y_train, Y_test), so the length is 4.
len(downsample_data_regular)
# Alternative probes kept for reference (class balance and per-split sizes):
#data.target.value_counts()
#len(downsample_data_regular[0]), len(downsample_data_regular[1]), len(downsample_data_regular[2]), len(downsample_data_regular[3])

4

In [18]:
# Grid of settings to evaluate; every combination is fitted once.
dataset = [downsample_data_regular, downsample_data_stemmed]
vectorizer_type = [CountVectorizer, TfidfVectorizer]
binary_type = [True, False]
ngram = [(1,2), (1,3), (1,4)]
stop_word = [None, stop]
model_type = [LogisticRegression(), BernoulliNB()]

# Iteration order (outermost to innermost): dataset, vectorizer, binary flag,
# n-gram range, model, stop words. The label list below must use the same order.
model_initialize = [
    model_vectorize(split, vec_cls, is_binary, ngram_range, stop_list, estimator)
    for split in dataset
    for vec_cls in vectorizer_type
    for is_binary in binary_type
    for ngram_range in ngram
    for estimator in model_type
    for stop_list in stop_word
]

# Human-readable names, one list per grid axis, index-aligned with the lists above.
dataset_label = ['Downsample Full Regular Data', 'Downsample Full Stemmed Data']
vectorizer_type_label = ['CountV', 'TFIDV']
binary_type_label = ['T', 'F']
ngram_label = [(1,2), (1,3), (1,4)]
stop_word_label = ['None', 'english']
model_type_label = ['LR', 'NB']

# Each label reads: dataset, vectorizer, binary flag, n-gram range, stop words, model.
labels = [
    '%s %s %s %s %s %s' % (split_name, vec_name, binary_name, ngram_name, stop_name, model_name)
    for split_name in dataset_label
    for vec_name in vectorizer_type_label
    for binary_name in binary_type_label
    for ngram_name in ngram_label
    for model_name in model_type_label
    for stop_name in stop_word_label
]

# Tabulate every run, best F-score first.
# NOTE: rebinding `metrics` here shadows the sklearn `metrics` module imported above.
fscores = [outcome[0] for outcome in model_initialize]
thresholds = [outcome[1] for outcome in model_initialize]
metrics = pd.DataFrame({'label': labels,'fscore': fscores, 'threshold': thresholds})
metrics.sort_values(by='fscore', ascending=False)



Unnamed: 0,fscore,label,threshold
0,0.897366,"Downsample Full Regular Data CountV T (1, 2) N...",4.310353e-01
16,0.897256,"Downsample Full Regular Data CountV F (1, 3) N...",3.812638e-01
4,0.896673,"Downsample Full Regular Data CountV T (1, 3) N...",3.497340e-01
12,0.896654,"Downsample Full Regular Data CountV F (1, 2) N...",4.275448e-01
20,0.896248,"Downsample Full Regular Data CountV F (1, 4) N...",3.779705e-01
64,0.895887,"Downsample Full Stemmed Data CountV F (1, 3) N...",4.057508e-01
48,0.895531,"Downsample Full Stemmed Data CountV T (1, 2) N...",3.640555e-01
8,0.895427,"Downsample Full Regular Data CountV T (1, 4) N...",3.883771e-01
68,0.895323,"Downsample Full Stemmed Data CountV F (1, 4) N...",3.468663e-01
52,0.895294,"Downsample Full Stemmed Data CountV T (1, 3) N...",3.958413e-01
