In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
#from sklearn import cross_validation
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.feature_extraction import stop_words
from sklearn.metrics import f1_score
from nltk.stem.snowball import SnowballStemmer
from sklearn.utils import resample
from sklearn.metrics import precision_recall_curve
from sklearn.utils import shuffle

%matplotlib inline

['test.csv', 'train.csv']


In [4]:
data = pd.read_csv("../input/train.csv", index_col = 0)
test_data = pd.read_csv("../input/test.csv", index_col = 0)

In [51]:
test = data_split(data, 0.9, 0.1)

In [11]:
def data_split(dataset, training_split, test_split):
    train, test = train_test_split(dataset, train_size = training_split, test_size = test_split)
    return(train, test)

def data_prep(train, test):
    X_train = train['question_text']
    Y_train = train['target']
    
    X_test = test['question_text']
    Y_test = test['target']
    
    return(X_train, X_test, Y_train, Y_test)


def model_vectorize(data_used, vectorizer_type, binary_type, ngram, stop_word, model_type):
    X_train = data_used[0]
    X_test = data_used[1]
    Y_train = data_used[2]
    Y_test = data_used[3]
    
    vectorizer = vectorizer_type(binary = binary_type, stop_words = stop_word, ngram_range=ngram)
    vectorizer.fit(X_train)
    X_train_vectorized = vectorizer.transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)
    model = model_type
    model.fit(X_train_vectorized, Y_train)
    
    scores = model.predict_proba(X_test_vectorized)[:,1]
    precision, recall, thresholds = precision_recall_curve(Y_test, scores)
    precision, recall = precision[:-1], recall[:-1]
    fscores = 2*np.divide(np.multiply(precision, recall), np.add(precision, recall))
    max_fscore = np.nanmax(fscores)
    ind_max = fscores.argmax() #not sure this is working
    threshold_max = thresholds[ind_max]
    
    return(max_fscore, threshold_max)


def downsample(df):
    # Separate majority and minority classes
    df_majority = df[df.target==0]
    df_minority = df[df.target==1]

    # Downsample majority class
    df_majority_downsampled = resample(df_majority, 
                                     replace=False,    # sample without replacement
                                     n_samples=df_minority.shape[0],     # to match minority class
                                     random_state=123) # reproducible results

    # Combine minority class with downsampled majority class
    df_downsampled = pd.concat([df_majority_downsampled, df_minority])
    
    return (df_downsampled)

def run_model(train_data, test_data, vectorizer_type, binary_type, ngram, stop_word, model_type, max_t):
    X_train = train_data['question_text']
    Y_train = train_data['target']
    X_test = test_data['question_text']
    
    
    vectorizer = vectorizer_type(binary = binary_type, stop_words = stop_word, ngram_range=ngram)
    vectorizer.fit(X_train)
    X_train_vectorized = vectorizer.transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)
    model = model_type
    model.fit(X_train_vectorized, Y_train)
    
    predictions = model.predict_proba(X_test_vectorized)[:,1]
    max_threshold = max_t
    targets = []
    for i in predictions:
        if i<= max_threshold:
            targets.append(0)
        else:
            targets.append(1)
    return(predictions, targets)


def model_vectorize(data_used, vectorizer_type, binary_type, ngram, stop_word, model_type):
    X_train = data_used[0]
    X_test = data_used[1]
    Y_train = data_used[2]
    Y_test = data_used[3]
    
    vectorizer = vectorizer_type(binary = binary_type, stop_words = stop_word, ngram_range=ngram)
    vectorizer.fit(X_train)
    X_train_vectorized = vectorizer.transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)
    model = model_type
    model.fit(X_train_vectorized, Y_train)
    
    scores = model.predict_proba(X_test_vectorized)[:,1]
    precision, recall, thresholds = precision_recall_curve(Y_test, scores)
    precision, recall = precision[:-1], recall[:-1]
    fscores = 2*np.divide(np.multiply(precision, recall), np.add(precision, recall))
    max_fscore = np.nanmax(fscores)
    ind_max = fscores.argmax() #not sure this is working
    threshold_max = thresholds[ind_max]
    
    return(max_fscore, threshold_max)

In [7]:
train_test = data_split(data, 0.9, 0.1)
downsample_train = downsample(train_test[0])
data_downsample_regular = data_prep(downsample_train, train_test[1])

stop =  stop_words.ENGLISH_STOP_WORDS
stemmer = SnowballStemmer('english')
data_stemmed = data.copy()
data_stemmed['question_text'] = [' '.join([stemmer.stem(word) for word in text.split(' ')])
          for text in data_stemmed.question_text]
train_test_stemmed = data_split(data_stemmed, 0.9, 0.1)
downsample_train_stemmed = downsample(train_test_stemmed[0])
data_downsample_stemmed = data_prep(downsample_train_stemmed, train_test_stemmed[1])

# test = model_vectorize(X_Y, CountVectorizer, True, (1,2), None, LogisticRegression())
# test

In [8]:
#create models
dataset = [data_downsample_regular, data_downsample_stemmed]
vectorizer_type = [CountVectorizer, TfidfVectorizer]
binary_type = [True, False]
ngram = [(1,2), (1,3), (1,4)]
stop_word = [None, stop]
model_type = [LogisticRegression(), BernoulliNB()]

model_initialize = []
for h in dataset:
    for i in vectorizer_type:
        for j in binary_type:
            for k in ngram:
                for l in model_type:
                    for m in stop_word:
                        model_initialize.append(model_vectorize(h, i, j, k, m, l))
                    
#create labels
dataset_label = ['Downsample Full Regular Data', 'Downsample Full Stemmed Data']
vectorizer_type_label = ['CountV', 'TFIDV']
binary_type_label = ['T', 'F']
ngram_label = [(1,2), (1,3), (1,4)]
stop_word_label = ['None', 'english']
model_type_label = ['LR', 'NB']

labels = []
for h in dataset_label:    
    for i in vectorizer_type_label:
        for j in binary_type_label:
            for k in ngram_label:
                for l in model_type_label:
                    for m in stop_word_label:
                        label = '%s %s %s %s %s %s' %(h, i, j, k, m, l)
                        labels.append(label)
                        
fscores = []
thresholds = []
for i in model_initialize:
    fscores.append(i[0])
    thresholds.append(i[1])
metrics = pd.DataFrame({'label': labels,'fscore': fscores, 'threshold': thresholds})
metrics.sort_values(by = ['fscore'], ascending=False)



Unnamed: 0,fscore,label,threshold
56,0.624874,"Downsample Full Stemmed Data CountV T (1, 4) N...",0.883883
52,0.622921,"Downsample Full Stemmed Data CountV T (1, 3) N...",0.887005
68,0.621310,"Downsample Full Stemmed Data CountV F (1, 4) N...",0.901165
64,0.621086,"Downsample Full Stemmed Data CountV F (1, 3) N...",0.892917
48,0.617998,"Downsample Full Stemmed Data CountV T (1, 2) N...",0.859913
60,0.617468,"Downsample Full Stemmed Data CountV F (1, 2) N...",0.874612
4,0.616083,"Downsample Full Regular Data CountV T (1, 3) N...",0.895167
16,0.615935,"Downsample Full Regular Data CountV F (1, 3) N...",0.896621
8,0.615808,"Downsample Full Regular Data CountV T (1, 4) N...",0.895197
0,0.614927,"Downsample Full Regular Data CountV T (1, 2) N...",0.877812


In [10]:
pd.set_option('max_colwidth', 140)
metrics.sort_values(by = ['fscore'], ascending=False)

Unnamed: 0,fscore,label,threshold
56,0.624874,"Downsample Full Stemmed Data CountV T (1, 4) None LR",0.883883
52,0.622921,"Downsample Full Stemmed Data CountV T (1, 3) None LR",0.887005
68,0.621310,"Downsample Full Stemmed Data CountV F (1, 4) None LR",0.901165
64,0.621086,"Downsample Full Stemmed Data CountV F (1, 3) None LR",0.892917
48,0.617998,"Downsample Full Stemmed Data CountV T (1, 2) None LR",0.859913
60,0.617468,"Downsample Full Stemmed Data CountV F (1, 2) None LR",0.874612
4,0.616083,"Downsample Full Regular Data CountV T (1, 3) None LR",0.895167
16,0.615935,"Downsample Full Regular Data CountV F (1, 3) None LR",0.896621
8,0.615808,"Downsample Full Regular Data CountV T (1, 4) None LR",0.895197
0,0.614927,"Downsample Full Regular Data CountV T (1, 2) None LR",0.877812


In [14]:
stop =  stop_words.ENGLISH_STOP_WORDS
stemmer = SnowballStemmer('english')
data_stemmed = data.copy()

data_stemmed['question_text'] = [' '.join([stemmer.stem(word) for word in text.split(' ')])
          for text in data_stemmed.question_text]
downsample_train_stemmed = downsample(data_stemmed)

test_stemmed = test_data.copy()
test_stemmed['question_text'] = [' '.join([stemmer.stem(word) for word in text.split(' ')])
          for text in test_stemmed.question_text]
test = run_model(downsample_train_stemmed, test_stemmed, CountVectorizer, True, (1,4), None, LogisticRegression(), 0.883883)

In [26]:
test_results = pd.DataFrame()
test_results['qid'] = test_data.index
test_results['prediction_percent'] = test[0]
test_results['prediction'] = test[1]
results = test_results.drop('prediction_percent', 1)
results.to_csv('sample_submission4.csv', index = False)