In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import defaultdict, Counter
import nltk
from nltk.corpus import stopwords, brown
from nltk import word_tokenize
from nltk.util import ngrams
import math
stop_words = set(stopwords.words('english'))
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.
%matplotlib inline

In [None]:
# Load the train and test CSV files from the input directory.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
# Print the (rows, columns) shape of each frame as a quick sanity check.
for label, frame in (("Train shape : ", train_df), ("Test shape : ", test_df)):
    print(label, frame.shape)

#### Baseline Model

In [None]:
#nlp/machine learning libraries
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit,GridSearchCV
from sklearn.metrics import f1_score,classification_report,roc_curve,precision_recall_curve,auc,average_precision_score
from sklearn.feature_selection import chi2, SelectKBest
import re
import pandas, xgboost, numpy, textblob, string
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

### Using only text column (question_text) for building models

In [None]:
# Model input: the raw question text only.
X = train_df['question_text']
# Target label to predict.
Y = train_df['target']
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation.
# - random_state pins the shuffle, so every score and every tuned threshold
#   computed from this split is reproducible after a kernel restart.
# - stratify=Y keeps the label proportions identical in both parts, which
#   matters because the thresholds below are tuned on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42, stratify=Y
)

## Baseline model - Logistic Regression

In [None]:
# Baseline pipeline: TF-IDF over word 1- to 3-grams, then logistic regression.
# `stop_words` is built as a set in the setup cell, but scikit-learn documents
# the `stop_words` argument as 'english', a list, or None (recent releases
# reject any other type), so it is converted to a list here.
baseline_ngram_lr = Pipeline([
                    ('tfidf', TfidfVectorizer(stop_words=list(stop_words),ngram_range=(1,3))),
                    ('classifier', LogisticRegression()),
                    ])

In [None]:
# Fit the baseline pipeline (TF-IDF vectorizer + logistic regression) on the training split.
baseline_ngram_lr.fit(X_train, y_train )

In [None]:
# Predict labels for the held-out split and print the per-class report.
baseline_ngram_lr_preds = baseline_ngram_lr.predict(X_test)
lr_report = classification_report(y_test, baseline_ngram_lr_preds)
print(lr_report)

In [None]:
# Class probabilities for the held-out split; keep only column 1 for the
# threshold search that follows.
lr_holdout_proba = baseline_ngram_lr.predict_proba(X_test)
baseline_ngram_lr_preds_prob = lr_holdout_proba[:, 1]

#### Choosing the optimal threshold (the one with the best F1 score)

In [None]:
# Sweep candidate decision thresholds (np.arange from 0.1 towards 0.6 in steps
# of 0.01) and record the held-out F1 score obtained at each one.
f1_list = []
for threshold in np.arange(0.1, 0.6, 0.01):
    # Round away the floating-point noise that np.arange accumulates.
    threshold = np.round(threshold, 2)
    # Compute the score once and reuse it for both the record and the log line.
    score = f1_score(y_test, (baseline_ngram_lr_preds_prob>threshold).astype(int))
    f1_list.append((score,threshold))
    print("F1 score at threshold {0} is {1}".format(threshold, score))

In [None]:
def sort_tuple(tup):
    """Sort key: return the first element of ``tup``."""
    first = tup[0]
    return first

# Threshold paired with the highest F1 score; when several entries share the
# top score, the one that appears first in f1_list is chosen.
best_threshold = max(f1_list, key=sort_tuple)[1]

In [None]:
##creating a submission file with the optimal threshold with the baseline model
def submission(df, predictions, file_name, threshold=0.20):
    """Binarise predicted probabilities and write them to ``<file_name>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Submission template. It is modified in place: a ``prediction`` column
        is added (or overwritten) before the frame is written out.
    predictions : array-like
        Predicted probabilities, one per row of ``df``. Either a 1-D array of
        positive-class probabilities, or a 2-D ``predict_proba``-style array
        with one column per class, in which case the last column is used.
    file_name : str
        Output path without the ``.csv`` extension.
    threshold : float, default 0.20
        Probabilities strictly greater than this value are labelled 1, all
        others 0.

    Raises
    ------
    ValueError
        If ``predictions`` is neither 1-D nor 2-D, or its length does not
        match the number of rows in ``df``.
    """
    print('Optimal threshold with better F1 score is: ', threshold)
    probs = np.asarray(predictions)
    if probs.ndim == 2:
        # One column per class: keep the last (positive-class) column so that
        # exactly one value per row is assigned to the frame.
        probs = probs[:, -1]
    elif probs.ndim != 1:
        raise ValueError(
            "predictions must be 1-D or 2-D, got {0} dimension(s)".format(probs.ndim)
        )
    if len(probs) != len(df):
        raise ValueError(
            "got {0} predictions for {1} rows".format(len(probs), len(df))
        )
    results = (probs > threshold).astype(int)
    df['prediction'] = results
    file = (file_name + '.csv')
    df.to_csv(file, index=False)

In [None]:
#predicting the positive-class probability on test data
# predict_proba returns one column per class. Keep only column 1, as is done
# for the held-out split, so that exactly one value per test row is passed on
# to submission(); the full 2-D array cannot be stored in a single column.
baseline_ngram_lr_preds_prob = baseline_ngram_lr.predict_proba(test_df['question_text'])[:,1]

In [None]:
# Load the sample submission as a template, fill it with the thresholded
# logistic-regression predictions and write it out as 'submission.csv'.
print('Saving the results in the submission file')
sub_df = pd.read_csv('../input/sample_submission.csv')
submission(
    df=sub_df,
    predictions=baseline_ngram_lr_preds_prob,
    file_name='submission',
    threshold=best_threshold,
)

In [None]:
# Report the threshold selected for the baseline model.
baseline_summary = "At threshold {0}, we are getting better F1 score and we will be choosing this threshold for our submission. This is our baseline and we will try to beat this score".format(best_threshold)
print(baseline_summary)

## Random Forest

In [None]:
#pipeline for creating tf idf and random forest model
# `stop_words` is built as a set in the setup cell, but scikit-learn documents
# the `stop_words` argument as 'english', a list, or None (recent releases
# reject any other type), so it is converted to a list here.
# random_state pins the forest's internal randomness so that its scores and
# the threshold tuned from them are reproducible between runs.
random_forest = Pipeline([
                    ('tfidf', TfidfVectorizer(stop_words=list(stop_words),ngram_range=(1,3))),
                    ('classifier', RandomForestClassifier(random_state=42)),
                    ])

In [None]:
# Fit the random-forest pipeline (TF-IDF vectorizer + random forest) on the training split.
random_forest.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split and print the per-class report.
random_forest_preds = random_forest.predict(X_test)
rf_report = classification_report(y_test, random_forest_preds)
print(rf_report)

In [None]:
# Class probabilities for the held-out split; keep only column 1 for the
# threshold search that follows.
rf_holdout_proba = random_forest.predict_proba(X_test)
random_forest_preds_prob = rf_holdout_proba[:, 1]

#### Choosing the optimal threshold (the one with the best F1 score)


In [None]:
# Sweep candidate decision thresholds (np.arange from 0.1 towards 0.8 in steps
# of 0.01) and record the held-out F1 score obtained at each one.
f1_list = []
for threshold in np.arange(0.1, 0.8, 0.01):
    # Round away the floating-point noise that np.arange accumulates.
    threshold = np.round(threshold, 2)
    # Compute the score once and reuse it for both the record and the log line.
    score = f1_score(y_test, (random_forest_preds_prob>threshold).astype(int))
    f1_list.append((score,threshold))
    print("F1 score at threshold {0} is {1}".format(threshold, score))

In [None]:
# Pick the threshold paired with the highest F1 score; when several entries
# share the top score, the one that appears first in f1_list is chosen.
# The key is written inline so that this cell no longer carries a second,
# copy-pasted definition of sort_tuple (it is already defined in the
# logistic-regression section of this notebook).
best_threshold = max(f1_list, key=lambda pair: pair[0])[1]

In [None]:
#predicting the positive-class probability on test data
# predict_proba returns one column per class. Keep only column 1, as is done
# for the held-out split, so that exactly one value per test row is passed on
# to submission(); the full 2-D array cannot be stored in a single column.
random_forest_preds_prob = random_forest.predict_proba(test_df['question_text'])[:,1]

In [None]:
# Load the sample submission as a template, fill it with the thresholded
# random-forest predictions and write it out.
# NOTE(review): this uses the same 'submission' file name as the
# logistic-regression cell, so that earlier file is overwritten - confirm
# this is intended.
print('Saving the results in the submission file')
sub_df = pd.read_csv('../input/sample_submission.csv')
submission(
    df=sub_df,
    predictions=random_forest_preds_prob,
    file_name='submission',
    threshold=best_threshold,
)

In [None]:
# Report the threshold selected for the random-forest model.
rf_summary = "At threshold {0} we are getting better F1 score and we will be choosing this threshold for our submission.".format(best_threshold)
print(rf_summary)

### Precision Recall Curves

In [None]:
# Recompute the held-out probabilities for both models: the submission cells
# above reassigned these two names to test-set predictions.
lr_holdout_proba = baseline_ngram_lr.predict_proba(X_test)
rf_holdout_proba = random_forest.predict_proba(X_test)
baseline_ngram_lr_preds_prob = lr_holdout_proba[:, 1]
random_forest_preds_prob = rf_holdout_proba[:, 1]


In [None]:
# Precision-recall curves of both models on the held-out split.
classfier_pred_list = [baseline_ngram_lr_preds_prob,random_forest_preds_prob]
classifiers_list = ['Logistic Regression Ngrams','Random Forest Ngrams']
# zip keeps scores, legend label and colour in lock-step, so no manual counter
# is needed to index the label list.
for scores, label, col in zip(classfier_pred_list, classifiers_list, 'gr'):
    # The third return value (the thresholds) is not needed for the plot. It is
    # bound to a throwaway name rather than `_`, which IPython uses for the
    # last cell output.
    p, r, _thresholds = precision_recall_curve(y_test, scores)
    plt.plot(r, p, c=col, label=label)
plt.legend(loc='lower left')
plt.xlabel("Recall")
plt.ylabel("Precision")
# A title lets the figure stand alone when the notebook is skimmed.
plt.title("Precision-Recall curves (held-out split)")
plt.show()