## Insincere question classification using word vectors
### Transforming questions into sentence-level vectors (the average of the constituent word vectors) using spaCy

### Import the necessary modules 

In [8]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
# Load the 'en_core_web_lg' pipeline with the 'ner', 'parser' and 'tagger'
# components disabled: this notebook only reads the averaged `.vector` of each
# processed question, so those components are not needed.
nlp=spacy.load('en_core_web_lg', disable=['ner', 'parser', 'tagger'])

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it will also hide any warnings
# raised by pandas/scikit-learn further down — consider narrowing it.
warnings.filterwarnings('ignore')

In [9]:
# Load the training CSV into a DataFrame and preview its first five rows

data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [10]:
# Count missing values per column (the output shows there are none)
data.isna().sum()

qid              0
question_text    0
target           0
dtype: int64

In [11]:
# Class balance: how many questions are sincere (0) vs. insincere (1)
data['target'].value_counts()

0    1225312
1      80810
Name: target, dtype: int64

In [12]:
# Show five randomly chosen questions that are labeled insincere (target == 1)

data.loc[data['target'] == 1].sample(n=5)

Unnamed: 0,qid,question_text,target
1143093,dffd8a0fbddb0b9403a7,Will the Vietnamese hate American citizens?,1
1000687,c41c84637b1727e2bbe4,Has the USA killed the most people in human hi...,1
20688,040dadb3849ece875fe9,Why do Democrats think Obama didn't trade cash...,1
416414,519c10802ef44316bfe7,Why do Americans ask ridiculous questions abou...,1
368223,482f3c447657b8394f4d,Why does north Indian girls like only fair ski...,1


In [13]:
# Run spaCy's nlp pipeline over every question and collect one vector per question.
# `Doc.vector` is the average of the pre-trained word vectors of the tokens in the
# question; the resulting vectors have 300 dimensions (pre-trained GloVe vectors,
# per the original author's note).
#
# `nlp.pipe` streams the texts through the pipeline in batches, which is much
# faster than calling `nlp(text)` once per row on a corpus of this size and
# yields the same `Doc` objects in the same order.

vectors = [doc.vector for doc in nlp.pipe(data['question_text'], batch_size=1000)]

In [22]:
# Build a DataFrame with one row per question: feature columns named 1..300
# hold the sentence vector, and 'label' carries the target from the raw data

dat = pd.DataFrame(vectors, columns=list(range(1, 301))).assign(label=data['target'])

In [30]:
# See what these vectors look like-
dat.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,292,293,294,295,296,297,298,299,300,label
0,0.041162,0.018629,0.014809,-0.051975,0.242775,-0.093926,0.001205,0.021293,-0.029795,2.321309,...,-0.001045,0.115095,-0.110303,0.104766,0.003165,-0.165728,0.005915,0.072043,0.056429,0
1,-0.028734,0.16903,-0.255849,-0.071592,0.05249,-0.021423,0.037019,-0.100158,-0.058501,2.492289,...,0.049579,-0.031219,0.033171,0.036523,0.029245,-0.02535,-0.048717,0.185062,0.199679,0
2,0.030072,0.455726,-0.286497,-0.07848,-0.171664,0.097254,0.0743,0.009398,0.042198,1.754545,...,-0.176183,0.047768,-0.038054,0.104313,-0.000564,-0.168625,0.026291,0.07629,0.192756,0
3,0.001983,0.147829,-0.081579,-0.010717,-0.122963,0.099978,-0.05739,0.064468,-0.046313,1.008592,...,0.154498,0.04146,-0.068187,0.10385,-0.061797,0.09365,-0.152707,0.01244,0.130766,0
4,0.250811,0.183827,-0.272104,-0.163372,0.044434,0.091608,0.028622,-0.08919,-0.29206,1.30661,...,-0.097426,0.069995,-0.114054,0.121243,-0.080533,0.063596,0.141857,0.018182,0.090745,0


In [31]:
from sklearn.model_selection import train_test_split

# Split the dataset into training (75%) and validation (25%) sets.
# `stratify` keeps the sincere/insincere ratio the same in both parts, and a
# fixed `random_state` makes the split (and every metric computed from it)
# reproducible across Restart & Run All.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(dat.drop('label', axis=1), dat['label'],
                                                                      stratify=dat['label'], test_size=0.25,
                                                                      random_state=42)

In [36]:
# Sanity check on the split: print the shape of each part, peek at the
# training labels, then display the first rows of the training features

for subset in (train_x, valid_x, train_y, valid_y):
    print(subset.shape)
print()
print(train_y.head())
print()
train_x.head()

(979591, 300)
(326531, 300)
(979591,)
(326531,)

961357     0
348428     0
107895     0
1006521    0
495256     0
Name: label, dtype: int64



Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
961357,-0.012526,0.151282,-0.019442,0.072343,0.152731,0.010203,-0.093743,-0.172946,0.010127,2.111773,...,0.014493,0.044571,0.068212,-0.028344,0.05467,-0.003572,-0.089725,-0.083538,-0.030796,0.051912
348428,0.052377,0.031026,-0.221313,-0.169778,0.140475,0.006992,0.239875,-0.045288,0.013833,2.060694,...,-0.219939,-0.137189,-0.286427,-0.014479,0.199957,-0.059859,-0.177608,-0.112644,0.069006,0.152825
107895,0.052031,0.055297,-0.181203,0.076736,-0.05248,-0.025386,0.024256,-0.218329,-0.081806,1.588319,...,-0.087271,-0.052311,0.217408,-0.035886,0.226142,0.016084,-0.010349,-0.11934,-0.012489,0.294948
1006521,0.042653,0.282021,0.031716,-0.097453,0.088628,-0.069155,0.127764,-0.152819,-0.136006,0.564162,...,-0.004947,-0.31696,0.170021,-0.06353,0.189072,-0.063569,0.018466,-0.049215,0.02457,0.222316
495256,-0.073219,0.136933,-0.251136,-0.084212,0.116837,-0.100619,-0.021798,-0.076366,-0.042972,2.434653,...,-0.321898,0.017909,-0.069324,-0.122946,0.070828,-0.079165,-0.082852,-0.197573,0.173644,0.072423


In [37]:
# Function to train a model and report its predictions on the validation set

def train_model(classifier, feature_vector_train, label, feature_vector_valid, label_valid=None):
    """Fit ``classifier`` on the training data and score it on the validation data.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train
        Training features passed to ``classifier.fit``.
    label
        Training labels passed to ``classifier.fit``.
    feature_vector_valid
        Validation features passed to ``classifier.predict``.
    label_valid
        True labels for ``feature_vector_valid``. Optional for backward
        compatibility: when omitted, the notebook-level ``valid_y`` is used,
        which is what this function previously always read.

    Returns
    -------
    The result of ``metrics.classification_report(label_valid, predictions)``.
    """
    # Earlier versions always scored against the global `valid_y`, which is
    # wrong whenever a different validation set is supplied; keep that only as
    # the fallback so existing four-argument calls behave as before.
    if label_valid is None:
        label_valid = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    return metrics.classification_report(label_valid, predictions)

In [40]:
# Baseline: a plain logistic-regression classifier on the sentence vectors.
# Training is slow on the full dataset, so expect to wait.
# Author's observation: this scores noticeably worse than a simple
# bag-of-words feature extraction did on the same questions.

log_reg = linear_model.LogisticRegression()
report = train_model(log_reg, train_x, train_y, valid_x)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.99      0.97    306328
           1       0.65      0.34      0.45     20203

   micro avg       0.95      0.95      0.95    326531
   macro avg       0.80      0.67      0.71    326531
weighted avg       0.94      0.95      0.94    326531

