## Pereparing the machine & importing the necessary modules 

In [1]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import warnings
warnings.filterwarnings('ignore')

## Preparing the training and validation datasets 

In [2]:
data=pd.read_csv('train.csv')
data.head(5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [3]:
# Checking to see the number of insincere questions
# We see that only about 6% of the total questions have been labeled insincere
# As expected, this is an imbalanced classification problem

data['target'].value_counts()

0    1225312
1      80810
Name: target, dtype: int64

In [4]:
# Seeing a sample of questions labeled as insincere

data[data.target==1].sample(5)

Unnamed: 0,qid,question_text,target
363125,472cf5fdc5dc8218c6d8,"Why are new vehicles so costly, considering le...",1
1285452,fbed5ed869a01957c90f,"Is Trump's contemplating, suggestive remark ab...",1
636169,7c9a92b811724c828308,Are Iranians busy with producing atomic bombs ...,1
460846,5a3f3f8f96a59560fe5c,What is Trump' s reaction to the fact that the...,1
688968,86f25609f8aa68938fef,Are all the Quora users fucking dumb or is Quo...,1


In [5]:
trainDF = pd.DataFrame()
trainDF['text'] = data['question_text']
trainDF['label'] = data['target']

from sklearn.model_selection import train_test_split

# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'],
                                                                      stratify=trainDF['label'], test_size=0.20)

## Feature Engineering - transforming free form text into structured numerical data

In [6]:
## Count-Vectorizer: Using a simple bag of words model

# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [7]:
## TF-IDF: Using Term Frequency - Inverse Document Frequency to extract features
## Better than a simple Bag-of-Words transformation, as we get higer weights for words that tend to distinguish the text

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

## Building predictive models

In [8]:
# Given the data is imbalanced, prediction accuracy is not going to be a good metric to evaluate model performances
# Furthermore, it is much more important that we don't predict insincere questions as sincere than the other way around
# Thus, Recall is the metric of interest

# To avoid re-typing, here I'm writing a function that takes in the classifier (model), training set, training labels(target)
# & validation set,and trains the classifier on the training set and the training labels, predicts the labels on the 
# validation set, and returns classification reports that show model performance.

def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    return metrics.classification_report(valid_y, predictions) 

### Naive-Bayes Model 

In [9]:
# The main advantae of Naive-Bayes Classifier is that it trains really fast

# Naive Bayes on Count Vectors
print('Classification Report on the model trained on count vectors (Bag-of-Words)')
print()
print(train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count))

print()

# Naive Bayes on Word Level TF IDF Vectors
print('Classification Report on the model trained with TF-IDF vectors')
print()
print(train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf))

Classification Report on the model trained on count vectors (Bag-of-Words)

              precision    recall  f1-score   support

           0       0.98      0.95      0.96    245063
           1       0.46      0.69      0.55     16162

   micro avg       0.93      0.93      0.93    261225
   macro avg       0.72      0.82      0.76    261225
weighted avg       0.95      0.93      0.94    261225


Classification Report on the model trained with TF-IDF vectors

              precision    recall  f1-score   support

           0       0.95      0.99      0.97    245063
           1       0.71      0.23      0.35     16162

   micro avg       0.95      0.95      0.95    261225
   macro avg       0.83      0.61      0.66    261225
weighted avg       0.94      0.95      0.93    261225



### Linear (Logistic Regression) Model 

In [10]:
# Simple linear model with a linear separation between categories, however trains slower than the NB classifier

# Linear Classifier on Count Vectors
print('Classification Report on the model trained on count vectors (Bag-of-Words)')
print()
print(train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count))

print()

# Linear Classifier on Word Level TF IDF Vectors
print('Classification Report on the model trained with TF-IDF vectors')
print()
print(train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf))

Classification Report on the model trained on count vectors (Bag-of-Words)

              precision    recall  f1-score   support

           0       0.96      0.99      0.98    245063
           1       0.70      0.46      0.55     16162

   micro avg       0.95      0.95      0.95    261225
   macro avg       0.83      0.72      0.76    261225
weighted avg       0.95      0.95      0.95    261225


Classification Report on the model trained with TF-IDF vectors

              precision    recall  f1-score   support

           0       0.96      0.99      0.97    245063
           1       0.70      0.40      0.51     16162

   micro avg       0.95      0.95      0.95    261225
   macro avg       0.83      0.70      0.74    261225
weighted avg       0.95      0.95      0.95    261225



#### Based on Recall scores on insincere questions (label=1), Naive-Bayes Model with TF-IDF performs the best (Recall = 0.71).