### Vectorizing questions after basic pre-processing and dropping stop-words. 
#### Dropping stop words loses some information, but the sentence vectors are no longer "polluted" by words that do not help distinguish one sentence from another (stop words are very common words such as: is, the, are, as, and so on).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re

import spacy
nlp=spacy.load('en_core_web_lg', disable=['ner', 'parser', 'tagger'])

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, svm, metrics

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read in the training dataset: one row per question, with its id (qid),
# the raw question_text and a 0/1 target label.

data = pd.read_csv('train.csv')
data.head(5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [3]:
# Checking the distribution of sincere (target == 0) and insincere (target == 1) questions in the training data.
# The recorded counts are heavily skewed towards sincere questions.
data.target.value_counts()

0    1225312
1      80810
Name: target, dtype: int64

### Taking samples of equal size for sincere and insincere questions for training our model  

In [4]:
# Taking a sample of 5,000 sincere questions and 5,000 insincere questions (10,000 rows in total).
# This achieves two goals: the classes are balanced by downsampling both of them to the same size,
# and the smaller dataset makes vectorizing and model training much faster.
# A fixed random_state makes the drawn sample reproducible across kernel restarts.

sincere = data[data.target==0].sample(5000, random_state=42)
insincere = data[data.target==1].sample(5000, random_state=42)

In [5]:
# Stack the sincere and insincere samples into one frame; the model is trained and tested on this frame.

df = pd.concat([sincere, insincere])

# Shape, class balance and per-column null counts, printed in that order
for summary in (df.shape, df.target.value_counts(), df.isnull().sum()):
    print(summary)

df.head()

(10000, 3)
1    5000
0    5000
Name: target, dtype: int64
qid              0
question_text    0
target           0
dtype: int64


Unnamed: 0,qid,question_text,target
1127099,dce0c533708a17ef8486,How do I invest for 1 lac annual income after ...,0
984570,c0e71d985fb3c5e35b12,What should be the good name of workshop for e...,0
80267,0fb7e03955c88338e183,Is it possible to get marks of non qualified s...,0
639706,7d4cacbb21d03aab7f76,What can be my rank in Neet 2017 if I get 590 ...,0
789619,9ab545ccab2857c1021e,There is a kid in my class that is super extro...,0


In [6]:
# A random handful of insincere questions (target == 1)
df.loc[df.target == 1].sample(n=5)

Unnamed: 0,qid,question_text,target
797362,9c40f092eedf6299bfac,Why is Russia always siding with the evil dict...,1
401797,4eb8acad5f0151797184,Are girls ready to accept anything to get fame?,1
197231,268eae52eeec93c6d305,Do conservatives realise that it's 2017 and no...,1
949732,ba1a53845414408c1a8f,Did Narendra Modi do 9/11?,1
281834,372b221f760132c16423,Why isn’t more being done to save the guinea w...,1


In [7]:
# Stopword removal: the English stop-word list comes from NLTK's stopwords corpus.
# NOTE(review): this presumably needs the corpus to be present locally
# (e.g. via nltk.download('stopwords')) -- confirm for a fresh environment.

import nltk
from nltk.corpus import stopwords

stop = stopwords.words('english')

In [8]:
# Defining a function to preprocess text (lower-casing, dropping punctuations, digits and extra white-spaces)
# While punctuation and digits can carry meaning, when we vectorize whole sentences we want to keep only those words
# that distinguish one sentence from another. In that context, keeping punctuation and digits does not add much.

# Matches any single ASCII punctuation character or digit. Compiled once at
# definition time instead of on every call; re.escape keeps characters such as
# ']', '\', '^' and '-' literal inside the character class.
_PUNCT_OR_DIGIT = re.compile('[%s]' % re.escape(string.punctuation + string.digits))


def preprocess(text):
    """Normalize a question string for vectorization.

    The text is lower-cased, every ASCII punctuation character and digit is
    replaced by a space, and all runs of whitespace (including leading and
    trailing whitespace) are collapsed to single spaces.

    Parameters
    ----------
    text : str
        Raw question text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing but punctuation,
        digits and whitespace was present.
    """
    text = _PUNCT_OR_DIGIT.sub(' ', text.lower())
    # str.split() with no arguments splits on any whitespace run and drops
    # empty pieces, so no separate whitespace regex is needed.
    return ' '.join(text.split())

In [9]:
# Row/column count of the sampled frame before the text is preprocessed
df.shape

(10000, 3)

In [10]:
# Clean every question in the sample; the result is a plain list in row order
txt_pp = list(map(preprocess, df.question_text))

In [11]:
# Preview the first five preprocessed questions
txt_pp[:5]

['how do i invest for lac annual income after years',
 'what should be the good name of workshop for embedded systems',
 'is it possible to get marks of non qualified students for any upsc exam held in',
 'what can be my rank in neet if i get or marks',
 'there is a kid in my class that is super extroverted and every time he takes a whole minutes to answer a question i stare at him amazed is that weird i m am an introvert']

In [12]:
# Drop stop words from every preprocessed question.
# `stop` is a list, so each `in` test scans it linearly; building a set once
# gives constant-time lookups and the same result.
stop_words = set(stop)
sw_rem = [
    " ".join(word for word in question.split() if word not in stop_words)
    for question in txt_pp
]

In [13]:
# Preview the first five questions after stop-word removal
sw_rem[:5]

['invest lac annual income years',
 'good name workshop embedded systems',
 'possible get marks non qualified students upsc exam held',
 'rank neet get marks',
 'kid class super extroverted every time takes whole minutes answer question stare amazed weird introvert']

In [14]:
# Replace the raw question text with the cleaned, stop-word-free text.
# Note: this overwrites the column, so the original wording is no longer available in df.
df['question_text'] = sw_rem

In [15]:
# Confirm the column now holds the cleaned text
df['question_text'].head(5)

1127099                       invest lac annual income years
984570                   good name workshop embedded systems
80267      possible get marks non qualified students upsc...
639706                                   rank neet get marks
789619     kid class super extroverted every time takes w...
Name: question_text, dtype: object

In [16]:
# Turn every cleaned question into one spaCy document vector.
# nlp.pipe streams the texts through the pipeline in batches, which is the
# recommended way to process many texts and avoids the per-call overhead of
# invoking nlp() once per row; the resulting vectors are the same.
vectors = [doc.vector for doc in nlp.pipe(df.question_text)]

In [17]:
# One row per question: 300 embedding columns labelled 1..300, plus the class label
feature_names = list(range(1, 301))
dat = pd.DataFrame(vectors, columns=feature_names)
dat['target'] = df['target'].tolist()

In [18]:
# Confirm the class balance carried over into the feature frame
dat.target.value_counts()

1    5000
0    5000
Name: target, dtype: int64

In [19]:
# Splitting the data into training (75%) and validation (25%) sets.
# random_state pins the shuffle so the same rows land in each set on every run for a given `dat`.

from sklearn.model_selection import train_test_split

train_x, valid_x, train_y, valid_y = train_test_split(
    dat.drop('target', axis=1),
    dat['target'],
    test_size=0.25,
    random_state=42,
)

In [20]:
# Function to fit & predict data, and to return the classification report 
# The metric of interest here is Recall on Insincere Questions (How many of insincere questions are actually being identified?)

def train_model(classifier, train_features, train_target, valid_features, valid_target=None):
    """Fit a classifier and return its classification report on validation data.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict``
        Fitted in place on the training data.
    train_features, train_target
        Training inputs and their labels, passed straight to ``classifier.fit``.
    valid_features
        Validation inputs, passed to ``classifier.predict``.
    valid_target : optional
        True labels for ``valid_features``. When omitted, the notebook-level
        ``valid_y`` is used so that existing four-argument calls keep working.

    Returns
    -------
    The value returned by ``metrics.classification_report`` for the
    validation labels versus the predictions.
    """
    if valid_target is None:
        # Backward-compatible fallback: the original implementation always
        # scored against the global validation labels.
        valid_target = valid_y
    classifier.fit(train_features, train_target)
    predictions = classifier.predict(valid_features)
    return metrics.classification_report(valid_target, predictions)

In [21]:
# Sanity check: shapes of the four splits, then the class balance of the training labels

for split in (train_x, valid_x, train_y, valid_y):
    print(split.shape)
print()
train_y.value_counts()

(7500, 300)
(2500, 300)
(7500,)
(2500,)



1    3776
0    3724
Name: target, dtype: int64

In [22]:
# A simple linear model (Logistic Regression): scores marginally below the SVC in the recorded run, but trains much faster

print("Linear model Classification Report: \n", train_model(linear_model.LogisticRegression(), train_x, train_y, valid_x))

Linear model Classification Report: 
               precision    recall  f1-score   support

           0       0.84      0.81      0.82      1276
           1       0.81      0.85      0.83      1224

   micro avg       0.83      0.83      0.83      2500
   macro avg       0.83      0.83      0.83      2500
weighted avg       0.83      0.83      0.83      2500



In [23]:
# SVC (Support Vector Classifier) model - takes long time to train
# We see the SVC model is doing better than Linear model 
# The models here are doing better than models trained on unprocessed vectorized texts

# Label corrected from "SCV" to "SVC" (Support Vector Classifier)
print("SVC model Classification Report: \n", train_model(svm.SVC(), train_x, train_y, valid_x))

SCV model Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.80      0.84      1276
           1       0.81      0.88      0.85      1224

   micro avg       0.84      0.84      0.84      2500
   macro avg       0.84      0.84      0.84      2500
weighted avg       0.85      0.84      0.84      2500

