## Importing the necessary modules, instantiating the spacy pipeline object (large model)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re

import spacy
nlp=spacy.load('en_core_web_lg', disable=['ner', 'parser', 'tagger'])

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, svm, metrics

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read in the training dataset

data = pd.read_csv('train.csv')
data.head(5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [3]:
# Checking the distribution of sincere and insincere questions in the training data
data.target.value_counts()

0    1225312
1      80810
Name: target, dtype: int64

## Oversampling minority class, taking a smaller subset of the overall training data 

In [4]:
# Taking a sample of 10,000 sincere question and 10,000 insincere questions:
# This achieves two goals, oversampling minority class, and faster execution/training of the model

sincere = data[data.target==0].sample(10000)
insincere = data[data.target==1].sample(10000)

In [5]:
df = pd.concat((sincere, insincere), axis=0)
print(df.shape)
print(df.target.value_counts())
print(df.isnull().sum())

df.head()

(20000, 3)
1    10000
0    10000
Name: target, dtype: int64
qid              0
question_text    0
target           0
dtype: int64


Unnamed: 0,qid,question_text,target
566286,6ef4864ef60db4da6244,Can I use a small coreless DC motor for a smal...,0
111513,15d339346370ac116b1c,Are pleaidians and reptilian aliens real?,0
403844,4f2324bdd8a3a166ede6,What is boiler draught and state its necessity?,0
292724,3951fd8f20403cb958eb,"What is a good analysis of the book ""Economic ...",0
403488,4f0f4264d8dce4a073b9,What is the shortlisting criteria of Accenture...,0


In [6]:
# Insincere questions: 
df[df.target==1].sample(5)

Unnamed: 0,qid,question_text,target
789020,9a96f0e83f5f5dcff2f6,Should I strap a board to my asshole so I don'...,1
7704,01806955ffe7eb0a6315,Why would any American want a socialist dictat...,1
1283732,fb950dc188a0b690c9e6,Why is it easier to have a mindless gossips wi...,1
306845,3c1a0a49a46091af7d71,Do psychopaths sexually molest their younger s...,1
935188,b74444829f11b0061efe,Are Bengalees more arrogrant compate to the ot...,1


## Feature Engineering - vector representations of each question
#### Sentence level vectors are obtained by averaging the constituent word vectors. Spacy's pipeline object does this transformation in the background and the vector can be accessed after the text is sent through the pipeline.

In [7]:
vectors=[]

for line in df.question_text:
    vectors.append(nlp(line).vector)

In [8]:
dat = pd.DataFrame(vectors, columns = [i for i in range(1,301)])
dat['target'] = list(df.target)

In [9]:
dat.target.value_counts()

1    10000
0    10000
Name: target, dtype: int64

In [10]:
# Splitting the data into training and validation sets

from sklearn.model_selection import train_test_split

train_x, valid_x, train_y, valid_y = train_test_split(dat.drop('target', axis=1), dat['target'], test_size=0.25)

In [11]:
# Function to fit & predict data, and to return the classification report 
# The metric of interest here is Recall on Insincere Questions (How many of insincere questions are actually being identified?)

def train_model(classifier, train_features, train_target, valid_features):
    classifier.fit(train_features, train_target)
    predictions = classifier.predict(valid_features)
    return metrics.classification_report(valid_y, predictions)    

In [12]:
# Sanity Check

print(train_x.shape)
print(valid_x.shape)
print(train_y.shape)
print(valid_y.shape)
print()
train_y.value_counts()

(15000, 300)
(5000, 300)
(15000,)
(5000,)



1    7511
0    7489
Name: target, dtype: int64

In [13]:
# SVC (Support Vector Classifier) model - takes long time to train

print("SCV model Classification Report: \n", train_model(svm.SVC(), train_x, train_y, valid_x))

SCV model Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.84      0.86      2511
           1       0.85      0.88      0.86      2489

   micro avg       0.86      0.86      0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



In [14]:
# A simple linear model (Logistic Regression), slightly better performance than SVC, trains much faster

print("Linear model Classification Report: \n", train_model(linear_model.LogisticRegression(), train_x, train_y, valid_x))

Linear model Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.87      0.88      2511
           1       0.87      0.88      0.88      2489

   micro avg       0.88      0.88      0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [15]:
# Random Forest Classifier model, poor performance compared to previos models

from sklearn import ensemble

print("Random Forest Model Performance")
print(train_model(ensemble.RandomForestClassifier(), train_x, train_y, valid_x))

Random Forest Model Performance
              precision    recall  f1-score   support

           0       0.79      0.87      0.83      2511
           1       0.85      0.76      0.81      2489

   micro avg       0.82      0.82      0.82      5000
   macro avg       0.82      0.82      0.82      5000
weighted avg       0.82      0.82      0.82      5000



In [16]:
# Reading in the test set from kaggle:
# The test dataset is not labeled as your predictions are scored on the site itself

test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [17]:
# We'll have to transform the text data into sentence level vecor representations for our analysis:

vecs = []

for line in test.question_text:
    vecs.append(nlp(line).vector)

In [18]:
dat_test = pd.DataFrame(vecs, columns=[i for i in range(1,301)])
dat_test.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,-0.180312,0.158483,-0.123888,-0.053095,0.060369,0.086867,0.145916,-0.085159,-0.049321,2.479782,...,-0.330796,0.021171,-0.02129,-0.137582,0.116073,0.222279,-0.011756,-0.070077,0.079446,0.012719
1,0.018303,0.129046,-0.141545,-0.165447,0.054936,-0.045345,0.058864,-0.074702,0.026421,2.047348,...,-0.100581,0.005751,-0.007119,-0.153016,0.078141,-0.048296,-0.090879,0.054621,0.019079,0.13597
2,0.044226,0.155399,-0.20287,0.038087,0.168708,-0.174667,-0.03628,-0.289326,-0.063747,2.461891,...,0.064525,0.040552,-0.01975,-0.076734,0.157764,0.007358,-0.097305,-0.068187,0.159205,0.140456
3,-0.003719,0.181323,-0.100883,-0.31207,0.103688,-0.12833,0.40813,-0.157331,0.188684,2.619975,...,-0.412897,-0.087993,0.0056,-0.153037,0.140078,0.122843,-0.051647,-0.163364,0.180401,-0.111823
4,-0.241352,0.238039,-0.11883,-0.215043,0.084284,-0.032048,0.148538,-0.276516,-0.002604,2.493613,...,-0.23659,-0.084084,-0.13621,-0.119996,0.20375,0.21073,-0.066195,-0.169718,-0.012377,0.209562


In [19]:
# Getting the predictions on the test set
# Given this was a kernel only competition, we can't score the prediction here, bummer!!

clf = linear_model.LogisticRegression()
clf.fit(train_x, train_y)
pred_test = clf.predict(dat_test)
pred_test

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)