## LSTM Modeling from Scratch

### Import Libraries & Dataset

In [1]:
import numpy as np
import pandas as pd
import warnings
from random import shuffle
import os

warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('C:/Users/aksha/Downloads/7610 Final/train.csv')
print (train.shape)
train.head()

(404290, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
test = pd.read_csv('C:/Users/aksha/Downloads/7610 Final/test.csv')
print (test.shape)
test.head()

(2345796, 3)


Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


### Preprocessing, Stemming and Removing Stopwords

In [4]:
train1 = train.drop(['id', 'qid1', 'qid2'], 1)
test1 = test.drop(['test_id'], 1)

In [5]:
train1 = train1.fillna('')
test1 = test1.fillna('')

In [6]:
import pickle
import nltk
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation
from nltk.stem import SnowballStemmer

stop_words = set(stopwords.words('english'))

In [7]:
import re

def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    # Clean the text, with the option to remove stop_words and to stem words.

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text) 
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)  
    text = re.sub(r"demonitization", "demonetization", text) 
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text) 
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text) 
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text) 
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text) 
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)
    
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [8]:
train1['question1_modified'] = train1.apply(lambda x: text_to_wordlist(x['question1']), axis = 1)
train1['question2_modified'] = train1.apply(lambda x: text_to_wordlist(x['question2']), axis = 1)
test1['question1_modified'] = test1.apply(lambda x: text_to_wordlist(x['question1']), axis = 1)
test1['question2_modified'] = test1.apply(lambda x: text_to_wordlist(x['question2']), axis = 1)

In [9]:
import pickle

pickle.dump(train1['question1_modified'], open('pickle_train_question1_modified', 'wb'))
pickle.dump(train1['question2_modified'], open('pickle_train_question2_modified', 'wb'))

pickle.dump(test1['question1_modified'], open('pickle_test_question1_modified', 'wb'))
pickle.dump(test1['question2_modified'], open('pickle_test_question2_modified', 'wb'))

### Tokenization of Text

In [10]:
from keras.preprocessing.text import Tokenizer

train1_text = np.hstack([train1.question1_modified, train1.question2_modified])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train1_text)
train1['tokenizer_1'] = tokenizer.texts_to_sequences(train1.question1_modified)
train1['tokenizer_2'] = tokenizer.texts_to_sequences(train1.question2_modified)

In [11]:
test1_text = np.hstack([test1.question1_modified, test1.question2_modified])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(test1_text)
test1['tokenizer_1'] = tokenizer.texts_to_sequences(test1.question1_modified)
test1['tokenizer_2'] = tokenizer.texts_to_sequences(test1.question2_modified)

In [12]:
train1['tokenized'] = train1['tokenizer_1'] + train1['tokenizer_2']
test1['tokenized'] = test1['tokenizer_1'] + test1['tokenizer_2']

In [13]:
print (train1['tokenizer_1'][0])
print (train1['tokenizer_2'][0])
print (train1['tokenized'][0])

[1, 1139, 1139, 2494, 496, 673, 302, 9]
[1, 1139, 1139, 2494, 496, 673, 302]
[1, 1139, 1139, 2494, 496, 673, 302, 9, 1, 1139, 1139, 2494, 496, 673, 302]


In [15]:
print (test1['tokenizer_1'][0])
print (test1['tokenizer_2'][0])
print (test1['tokenizer'][0])

[3, 1934, 767, 112, 378, 1566, 767]
[4, 617, 328, 811, 18922, 811, 4184, 175, 1934, 767, 112]
[3, 1934, 767, 112, 378, 1566, 767, 4, 617, 328, 811, 18922, 811, 4184, 175, 1934, 767, 112]


In [17]:
max_length = 50
max_token = np.max([np.max(train1.tokenized.max()),np.max(test1.tokenizer.max())])
print (max_length, max_token)

50 90580


In [18]:
ytr = train1[['is_duplicate']]
Xtr = train1[['tokenized']]
Xte = test1[['tokenizer']]

### Padding & Splitting Dataset

In [20]:
from keras_preprocessing.sequence import pad_sequences

Xtr = pad_sequences(Xtr.tokenized, maxlen = max_length)
Xte = pad_sequences(Xte.tokenizer, maxlen = max_length)

In [25]:
#create mask for train-test distribution
mask = np.random.rand(len(Xtr)) < 0.82
Xtr1 = Xtr[mask]
Xval = Xtr[~mask]

ytr1=ytr[mask]
yval=ytr[~mask]

### Modeling Architecture

In [32]:
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Dense, Dropout, LSTM

model_1 = Sequential()
model_1.add(Embedding(max_token+1000, 32))
model_1.add(Dropout(0.3))

model_1.add(LSTM(32))

model_1.add(Dropout(0.3))
model_1.add(Dense(1, activation = 'sigmoid'))
model_1.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [33]:
model_1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          2930560   
                                                                 
 dropout_2 (Dropout)         (None, None, 32)          0         
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,938,913
Trainable params: 2,938,913
Non-trainable params: 0
_________________________________________________________________


In [37]:
hist = model_1.fit([Xtr1], ytr1, validation_data = ([Xval], yval), epochs = 16, batch_size=128)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [38]:
preds = model_1.predict(Xte, batch_size=512)
preds += model_1.predict(Xte, batch_size=512)
preds /= 2

results = pd.DataFrame({'test_id':test.test_id, 'is_duplicate':preds.ravel()})



In [39]:
results.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.221046
1,1,0.327794
2,2,0.859311
3,3,0.069816
4,4,0.36761
