## IMPORT LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from nltk.tokenize import word_tokenize
import csv
import os

## IMPORT DATA

In [2]:
data = pd.read_csv('data/train.csv')

In [3]:
data.shape

(1306122, 3)

In [4]:
data.head(10)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
5,00004f9a462a357c33be,"Is Gaza slowly becoming Auschwitz, Dachau or T...",0
6,00005059a06ee19e11ad,Why does Quora automatically ban conservative ...,0
7,0000559f875832745e2e,Is it crazy if I wash or wipe my groceries off...,0
8,00005bd3426b2d0c8305,"Is there such a thing as dressing moderately, ...",0
9,00006e6928c5df60eacb,Is it just me or have you ever been in this ph...,0


In [5]:
data.columns

Index(['qid', 'question_text', 'target'], dtype='object')

### Divide data into train data and validation data 

In [6]:
# Dividing training dataset into train data (70%) and validation data (30%)
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_val, y_train, y_val = train_test_split(data['question_text'], data['target'], test_size=0.3, random_state=1)

##### Train Data

In [8]:
X_train.head()

635414    What will be the in-hand salary after 3rd PRC ...
906079                      Should one take YOLO seriously?
99492                   What are the best clubs in Nairobi?
973656    You wake up as Donald Trump with a clock ticki...
397612                        What made Elon Musk so smart?
Name: question_text, dtype: object

In [10]:
X_train.shape

(914285,)

In [11]:
y_train.head()

635414    0
906079    0
99492     0
973656    0
397612    0
Name: target, dtype: int64

##### Validation Data

In [12]:
X_val.head()

414550    How likely is gay marriage being legalized in ...
925862             Is politics the most corrupt profession?
829278         How do I get a walk up campsite at Yosemite?
604431                   How was Banner Health established?
701714    Do you feel proud or ashamed telling that you ...
Name: question_text, dtype: object

In [16]:
X_val.shape

(391837,)

In [17]:
y_val.head()

414550    0
925862    0
829278    0
604431    0
701714    0
Name: target, dtype: int64

#### Test Data

In [18]:
test_data = pd.read_csv('data/test.csv')

In [19]:
test_data.shape

(375806, 2)

In [24]:
test_data.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [25]:
test_data.columns

Index(['qid', 'question_text'], dtype='object')

In [26]:
X_test = test_data['question_text'].values

In [27]:
X_test[9]

'How much does a tutor earn in Bangalore?'

## TEXT PREPROCESSING

In [28]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# REGEXs to remove unwanted patterns from the text
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

In [30]:
'''
TEXT_PREPROCESSING for removing unwanted stopwords, symbols and modify text 
    >> text to be processed
    << modified text
'''
def text_preprocessing(text_):

    # convert text to lower case
    text_ = text_.lower()

    # replace symbols with a space in the text
    text_ = re.sub(REPLACE_BY_SPACE_RE, " ", text_)

    # truncate unwanted symbols from text
    ext_ = re.sub(BAD_SYMBOLS_RE,"",text_)

    # delete stopwords from text
    text_ = " ".join(word for word in text_.split() if word not in STOPWORDS)

    return text_

In [31]:
# Process training data text
X_train_processed = [text_preprocessing(text) for text in X_train]
X_train_processed[9]

'daenerys last known dragons last known dragonriders?'

In [32]:
# Process validation data text
X_val_processed = [text_preprocessing(text) for text in X_val]
X_val_processed[9]

'william john "bill" nixon\'s major accomplishments footballer?'

In [33]:
# Process testing data text
X_test_processed = [text_preprocessing(text) for text in X_test]
X_test_processed[9]

'much tutor earn bangalore?'

## CALCULATE FREQUENCY OF TOKENS

In [41]:
'''
WORDS_FREQ_COUNTS used to calculate frequncy of words in the text corpus
    >> text to be processed
    << words_freq dictionary of words and their associated frequency
'''
def words_freq_counts(text):
    words_freq = {}

    for line in text:
        for word in line.split():
            if word not in words_freq:
                words_freq[word] = 1
            else:
                words_freq[word] +=1

    return words_freq


In [35]:
# Training data words frequency count
words_count = words_freq_counts(X_train_processed)

In [42]:
# total couunt of words (tokens)
len(words_count)

293792

In [43]:
# most common terms in text corpora
sorted(words_count.items(), key = lambda x: x[1], reverse = True)[:10]

[('get', 43538),
 ('best', 43263),
 ('would', 43012),
 ('people', 34985),
 ('like', 31169),
 ('good', 25569),
 ('one', 21792),
 ('make', 19519),
 ('think', 17852),
 ('many', 17074)]

# BAG OF WORDS APPROACH

In [44]:
DICT_SIZE = 20000
WORDS_TO_INDEX = {b[0]:a for a, b in enumerate(sorted(words_count.items(), key=lambda x:x[1], reverse=True)[:DICT_SIZE])}
INDEX_TO_WORDS = {b: a for a,b in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()


In [45]:
'''
BAG_OF_WORDS_PROCESSING creates a vector of tokens from strings
    >> text to be processed into vector of tokens
    >> words_to_index to refer for the token generation
    >> dict_size default size of all the vectors
    << vector of tokens corrosponding to the given text string
'''
def bag_of_words_processing(text, words_to_index,dict_size):
    # Create a zero vector equaling the size of words list
    tokenized_vector = np.zeros([dict_size])

    for word in set(text.split()):
        if word in words_to_index:
            tokenized_vector[words_to_index[word]] = 1

    return tokenized_vector

In [46]:
# Apply BOW appraoch to train anf test dataset
from scipy import sparse as sp_sparse

##### Train Data

In [47]:
%%time
X_train_bow = sp_sparse.vstack([sp_sparse.csr_matrix(bag_of_words_processing(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
print('X-train bow shape:', X_train_bow.shape)

X-train bow shape: (914285, 20000)
Wall time: 4min 42s


##### Validation Data

In [48]:
%%time
X_val_bow = sp_sparse.vstack([sp_sparse.csr_matrix(bag_of_words_processing(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
print('X-Val bow shape:', X_val_bow.shape)

X-Val bow shape: (391837, 20000)
Wall time: 2min 4s


##### Test Data

In [49]:
%%time
X_test_bow = sp_sparse.vstack([sp_sparse.csr_matrix(bag_of_words_processing(text,WORDS_TO_INDEX,DICT_SIZE)) for text in X_test])
print('X-test bow shape:', X_test_bow.shape)

X-test bow shape: (375806, 20000)
Wall time: 1min 56s


## PREDICTION

In [50]:
# We will be using Stochaistic stochastic gradient descent classifier
from sklearn.linear_model import SGDClassifier

### Bag Of Words Approach

In [51]:
classifier_bow = SGDClassifier(loss='hinge', penalty='l1',n_iter=20)
classifier_bow.fit(X_train_bow, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=20, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [64]:
# predict
y_val_pred_bow = classifier_bow.predict(X_val_bow)
y_test_pred_bow = classifier_bow.predict(X_test_bow)

## EVALUATION

##### Accuracy Scores

In [60]:
from sklearn.metrics import accuracy_score

'''
CALCULATE_ACCURACY_SCORE to calculate accuracy score of our model
    >> targets against which we will be calculating our accuracy score
    >> prediction values rounded off
    << accuracy score in percentage
'''
def calculate_accuracy_score(target, prediction):
    return accuracy_score(target, prediction)*100

###### Bag of Words Approach

In [63]:
print('Bag of Words (BOG) Accuracy Score: ', calculate_accuracy_score(y_val, y_val_pred_bow))

Bag of Words (BOG) Accuracy Score:  93.9885717785712
