## IMPORT LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from nltk.tokenize import word_tokenize
import csv
import os

## IMPORT DATA

In [2]:
# Read the train and test splits from the local data/ directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
train_data.shape, test_data.shape

((1306122, 3), (375806, 2))

In [4]:
train_data.head(10)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
5,00004f9a462a357c33be,"Is Gaza slowly becoming Auschwitz, Dachau or T...",0
6,00005059a06ee19e11ad,Why does Quora automatically ban conservative ...,0
7,0000559f875832745e2e,Is it crazy if I wash or wipe my groceries off...,0
8,00005bd3426b2d0c8305,"Is there such a thing as dressing moderately, ...",0
9,00006e6928c5df60eacb,Is it just me or have you ever been in this ph...,0


In [5]:
train_data.columns

Index(['qid', 'question_text', 'target'], dtype='object')

In [6]:
test_data.head(10)

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?
5,000101884c19f3515c1a,How do you train a pigeon to send messages?
6,00010f62537781f44a47,What is the currency in Langkawi?
7,00012afbd27452239059,"What is the future for Pandora, can the busine..."
8,00014894849d00ba98a9,My voice range is A2-C5. My chest voice goes u...
9,000156468431f09b3cae,How much does a tutor earn in Bangalore?


In [7]:
test_data.columns

Index(['qid', 'question_text'], dtype='object')

### Split the training data into features and target

In [8]:
# Features (question id + raw text) and the label column are kept separately.
X_train = train_data[['qid', 'question_text']]
y_train = train_data['target']

##### Train Data

In [9]:
X_train[:5]

Unnamed: 0,qid,question_text
0,00002165364db923c7e6,How did Quebec nationalists see their province...
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco..."
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...


In [10]:
y_train[:5]

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

##### Test Data

In [11]:
X_test = test_data[['qid','question_text']]

In [12]:
X_test[:5]

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


## TEXT PREPROCESSING

In [13]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Regexes used to strip unwanted characters from the question text.
# Raw strings pass the backslashes to the regex engine untouched and avoid
# Python's "invalid escape sequence" warning for '\[', '\]' and '\|';
# the compiled patterns are identical to the previous non-raw versions.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # separator symbols
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')          # anything outside the allowed set
# NLTK's English stopword list, held as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

In [15]:
def remove_stopwords(text, stop_words=None):
    """
    REMOVE_STOPWORDS drops stopwords from a string and returns the modified string
        INPUT:
            >> text: string to filter (split on whitespace)
            >> stop_words: optional collection of words to drop; when omitted
               the module-level STOPWORDS set is used
        OUTPUT:
            >> the remaining words, in their original order, joined by single spaces
    """
    if stop_words is None:
        stop_words = STOPWORDS
    return ' '.join(word for word in text.split() if word not in stop_words)

In [16]:
def text_prepare(text):
    """
        Normalise a raw question string.

        text: a string

        return: the lower-cased string with separator symbols turned into
                spaces, other unwanted symbols deleted and stopwords removed
    """
    lowered = text.lower()

    # Separator symbols become a space so the words around them stay apart.
    spaced = REPLACE_BY_SPACE_RE.sub(" ", lowered)

    # Every character matched by BAD_SYMBOLS_RE is deleted outright.
    # NOTE(review): this also deletes apostrophes before the stopword filter
    # runs, so contractions may no longer match the stopword list - confirm
    # whether that is intended.
    cleaned = BAD_SYMBOLS_RE.sub("", spaced)

    # Stopwords are filtered last, on the cleaned text.
    return remove_stopwords(cleaned)

In [17]:
# Clean every training question with text_prepare, then peek at the first five.
X_train_processed = list(map(text_prepare, X_train['question_text']))
X_train_processed[:5]

['quebec nationalists see province nation 1960s',
 'adopted dog would encourage people adopt shop',
 'velocity affect time velocity affect space geometry',
 'otto von guericke used magdeburg hemispheres',
 'convert montra helicon mountain bike changing tyres']

In [18]:
# Clean the test questions the same way and spot-check one of them.
X_test_processed = list(map(text_prepare, X_test['question_text']))
X_test_processed[9]

'much tutor earn bangalore'

## CALCULATE FREQUENCY OF TOKENS

In [None]:
def words_freq_counts(text):
    '''
    WORDS_FREQ_COUNTS calculates the frequency of words in a text corpus
        >> text: iterable of strings (one document per item), split on whitespace
        << words_freq: dict mapping each word to its number of occurrences,
           keyed in order of first appearance
    '''
    from collections import Counter  # local import keeps this cell self-contained

    words_freq = Counter()
    for line in text:
        words_freq.update(line.split())

    # Hand back a plain dict so callers get the same type as before.
    return dict(words_freq)


In [None]:
# Training data words frequency count
words_count = words_freq_counts(X_train_processed)

In [None]:
# total count of distinct words (tokens) in the training vocabulary
len(words_count)

226515

In [None]:
# most common terms in text corpora
sorted(words_count.items(), key = lambda x: x[1], reverse = True)[:10]

[('get', 62728),
 ('best', 62432),
 ('would', 61374),
 ('people', 55616),
 ('like', 49160),
 ('good', 38389),
 ('one', 33338),
 ('india', 31990),
 ('make', 28243),
 ('think', 26058)]

## BAG OF WORDS APPROACH

In [None]:
# Vocabulary = the DICT_SIZE most frequent training tokens.
# A token's rank (0 = most frequent) is its column in the bag-of-words vector.
DICT_SIZE = 20000
WORDS_TO_INDEX = {
    word: rank
    for rank, (word, _count) in enumerate(
        sorted(words_count.items(), key=lambda item: item[1], reverse=True)[:DICT_SIZE]
    )
}
# Reverse lookup: column index -> token.
INDEX_TO_WORDS = {rank: word for word, rank in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()


In [None]:
def bag_of_words_processing(text, words_to_index, dict_size):
    '''
    BAG_OF_WORDS_PROCESSING creates a binary bag-of-words vector from a string
        >> text: string to vectorise (split on whitespace)
        >> words_to_index: dict mapping each vocabulary word to its position in the vector
        >> dict_size: length of the returned vector
        << numpy array of length dict_size holding 1 at the position of every
           vocabulary word present in text and 0 elsewhere
    '''
    # Start from an all-zero vector with one slot per vocabulary entry.
    tokenized_vector = np.zeros([dict_size])

    # Presence, not count: each distinct word is looked up once and
    # words outside the vocabulary are ignored.
    for word in set(text.split()):
        if word in words_to_index:
            tokenized_vector[words_to_index[word]] = 1

    return tokenized_vector

In [None]:
# Apply the BoW approach to the train and test datasets
from scipy import sparse as sp_sparse

##### Train Data

In [None]:
%%time
X_train_bow = sp_sparse.vstack([sp_sparse.csr_matrix(bag_of_words_processing(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train['question_text']])
print('X-train bow shape:', X_train_bow.shape)

X-train bow shape: (1306122, 20000)
Wall time: 7min 27s


##### Test Data

In [None]:
%%time
X_test_bow = sp_sparse.vstack([sp_sparse.csr_matrix(bag_of_words_processing(text,WORDS_TO_INDEX,DICT_SIZE)) for text in X_test['question_text']])
print('X-test bow shape:', X_test_bow.shape)

X-test bow shape: (375806, 20000)
Wall time: 2min 14s


### ANN model

### Model 1

In [None]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Activation, Dropout, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers


Using TensorFlow backend.


In [None]:
# The Embedding + BiLSTM stack below consumes sequences of integer token ids.
# Feeding it the 0/1 bag-of-words matrix would present every question as a
# DICT_SIZE-long "sequence" made only of ids 0 and 1, so the questions are
# tokenised into padded id sequences instead.

# Vocabulary size of the Embedding layer; the Tokenizer only emits ids for the
# most frequent words, all of them below max_features.
max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
# Fit on the cleaned question strings (iterating the X_train DataFrame would
# yield its column names, not the questions).
tokenizer.fit_on_texts(X_train_processed)
list_tokenized_train = tokenizer.texts_to_sequences(X_train_processed)

# Every question is padded / truncated to max_len token ids.
# NOTE(review): 500 is probably far longer than a typical question - consider
# lowering it after checking the length distribution.
max_len = 500
X_t = pad_sequences(list_tokenized_train, maxlen=max_len)
y = y_train

embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
# Collapse the per-timestep LSTM outputs into one vector per question.
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation='relu'))
model.add(Dropout(0.05))
# Single sigmoid unit: probability of target == 1.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

batch_size = 100
epochs = 10
# The last 20% of the training rows are held out for validation.
history = model.fit(X_t,y, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1044897 samples, validate on 261225 samples
Epoch 1/10


In [None]:
# Keras < 2.3 stores accuracy under 'acc' / 'val_acc'; newer releases use
# 'accuracy' / 'val_accuracy'. Accept either so the cell survives an upgrade.
metrics = history.history
acc_key = 'acc' if 'acc' in metrics else 'accuracy'
acc = metrics[acc_key]
val_acc = metrics['val_' + acc_key]
loss = metrics['loss']
val_loss = metrics['val_loss']

# Separate name on purpose: reusing `epochs` would overwrite the training
# hyper-parameter of the same name.
epoch_range = range(1, len(acc) + 1)

plt.plot(epoch_range, acc, 'bo', label='Training acc')
plt.plot(epoch_range, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.figure()
plt.plot(epoch_range, loss, 'bo', label='Training loss')
plt.plot(epoch_range, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()