In [24]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize

import string

import re
from string import punctuation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split 
from gensim.models import KeyedVectors
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential
from keras.layers import Dense, Input, Flatten, merge, LSTM, Lambda, Dropout
from keras.models import Model
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import concatenate


from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation

In [4]:
# Load dataframe
df = pd.read_csv('input/train.csv')
# Drop columns
df = df.drop(['id','qid1','qid2'],axis=1)

In [5]:
# check for null values
df['question1'].isnull().sum()
# Question 1 doesnt have any null values


df['question2'].isnull().sum()
# Question 2 has null values

df[df['question2'].isnull()==True]
# Show question 2 null values

# Fill question 2 null values with space
df['question2'] = df['question2'].fillna(' ')

In [6]:
# Clean string
punctuation = set(string.punctuation)

def clean_text (string, clean_SW=False):
    
    string = string.replace('-',' ') ## break words with "-"
    
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', string) ## removes web/html notation
    
    cleantext = cleantext.replace('\n',' ') ## removes skip lines
    
    cleantext = cleantext.replace('  ',' ').replace('  ',' ').replace('  ',' ') ## removes extra spaces between words
       
    cleantext = cleantext.strip() ## removes extra spaces in the end/beggining of words
       
    cleantext = ''.join(ch for ch in cleantext if ch not in punctuation) ## removes punctuation
    
    
    
    return (cleantext)

In [7]:
df['question1_modified'] = df['question1'].map(lambda x: clean_text(x)) ## takes a while
df['question2_modified'] = df['question2'].map(lambda x: clean_text(x)) ## takes a while

In [8]:
# Stop words removal
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

df['question1_modified'] = df['question1'].apply(lambda x: ' '.join([item for item in x.lower().split() if item not in stop]))
df['question2_modified'] = df['question2'].apply(lambda x: ' '.join([item for item in x.lower().split() if item not in stop]))
print df['question1_modified'].head(2)
print df['question1_modified'].head(2)

0    step step guide invest share market india?
1          story kohinoor (koh-i-noor) diamond?
Name: question1_modified, dtype: object
0    step step guide invest share market india?
1          story kohinoor (koh-i-noor) diamond?
Name: question1_modified, dtype: object


In [9]:
# Stem words
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
df["question1_modified"] = df["question1_modified"].apply(lambda x: ''.join([stemmer.stem(y) for y in x]))
df["question2_modified"] = df["question2_modified"].apply(lambda x: ''.join([stemmer.stem(y) for y in x]))

In [10]:
# Remove punctuation
df["question1_modified"] = df["question1_modified"].apply(lambda x: ''.join([y for y in x if y not in punctuation]))
df["question2_modified"] = df["question2_modified"].apply(lambda x: ''.join([y for y in x if y not in punctuation]))

In [11]:
# Split into X and y
X = df.drop(['is_duplicate'],axis=1)
y = df.is_duplicate

labels = np.array(y)

In [12]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [13]:
# Question1 and Question2
train_question1 = list(X_train['question1_modified'])
train_question2 = list(X_train['question2_modified'])
test_question1 = list(X_test['question1_modified'])
test_question2 = list(X_test['question2_modified'])

In [14]:
# Tokenizer
all_questions = train_question1 + train_question2 + test_question1 + test_question2
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_questions)
print("Fitting is complete.")

# Tokenize train data
train_question1_word_sequences = tokenizer.texts_to_sequences(train_question1)
print("train_question1 is complete.")
train_question2_word_sequences = tokenizer.texts_to_sequences(train_question2)
print("train_question2 is complete")

# Tokenize test data
test_question1_word_sequences = tokenizer.texts_to_sequences(test_question1)
print("test_question1 is complete.")
test_question2_word_sequences = tokenizer.texts_to_sequences(test_question2)
print("test_question2 is complete.")

Fitting is complete.
train_question1 is complete.
train_question2 is complete
test_question1 is complete.
test_question2 is complete.


In [15]:
# Word Index
word_index = tokenizer.word_index

In [16]:
#print train_question1_word_sequences[0]

# See value for a token
#print tokenizer.word_index.keys()[tokenizer.word_index.values().index(904)]

In [17]:
# Pad the sentences to a max length
max_input_length = 30
train_q1 = pad_sequences(train_question1_word_sequences,maxlen = max_input_length)
train_q2 = pad_sequences(train_question2_word_sequences,maxlen = max_input_length)
test_q1 = pad_sequences(test_question1_word_sequences, maxlen = max_input_length)
test_q2 = pad_sequences(test_question2_word_sequences, maxlen = max_input_length)#,padding = 'post',truncating = 'post')

In [18]:
# Word2Vec
embedding_dim = 300
max_nb_words = 200000
nb_words = min(max_nb_words, len(word_index)) + 1
embedding_file = 'input/GoogleNews-vectors-negative300.bin'

# Create empty embedding matrix
embedding_matrix = np.zeros((nb_words, embedding_dim))


# Load word2vec model from Google news bin file
word2vec = KeyedVectors.load_word2vec_format(embedding_file,binary=True)

# Fill embedding matrix
for word, i in word_index.items():
    if word in word2vec.vocab:
         embedding_matrix[i] = word2vec.word_vec(word)

In [19]:
# Check Word2Vec similarity feature
word2vec.most_similar('trump')

[(u'trumps', 0.7198435068130493),
 (u'trumping', 0.580585241317749),
 (u'supersede', 0.5600422620773315),
 (u'trumped', 0.5497317910194397),
 (u'supercede', 0.5309919118881226),
 (u'prevail', 0.48776334524154663),
 (u'outweigh', 0.4785327613353729),
 (u'trample', 0.4714253544807434),
 (u'overshadow', 0.4701153039932251),
 (u'dictate', 0.46754562854766846)]

In [20]:
# Embedding layer
embedding_layer = Embedding(nb_words,embedding_dim,weights=[embedding_matrix],input_length=max_input_length,trainable=False)

In [20]:
# CNN Architechture
# Model architechture
validation_split = 0.01

sequence_1_input = Input(shape=(max_input_length,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = Conv1D(128, 3, activation='relu')(embedded_sequences_1)

x1 = MaxPooling1D(10)(x1)

x1 = Flatten()(x1)

x1 = Dense(64, activation='relu')(x1)

x1 = Dropout(0.2)(x1)


sequence_2_input = Input(shape=(max_input_length,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = Conv1D(128, 3, activation='relu')(embedded_sequences_2)
y1 = MaxPooling1D(10)(y1)
y1 = Flatten()(y1)
y1 = Dense(64, activation='relu')(y1)
y1 = Dropout(0.2)(y1)

merged = concatenate([x1, y1])
merged = BatchNormalization()(merged)
merged = Dense(64, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)
preds = Dense(1, activation='sigmoid')(merged)
model = Model(input=[sequence_1_input,sequence_2_input], output=preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

# Run model
model.fit([train_q1,train_q2], y_train, epochs=1,validation_split=validation_split, batch_size=1024, shuffle=True,verbose=0)

# Print summary
model.summary()

# check accuracy on test dataset
loss, accuracy = model.evaluate([test_q1,test_q2], y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))



In [None]:
# Another model architechture with LSTM
num_lstm = np.random.randint(175, 275)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)
num_dense = np.random.randint(100, 150)
rate_drop_dense = 0.15 + np.random.rand() * 0.25

sequence_1_input = Input(shape=(max_input_length,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(max_input_length,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

merged = concatenate([x1, y1])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation='relu')(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)


model = Model(inputs=[sequence_1_input, sequence_2_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])


hist = model.fit([train_q1,train_q2], y_train,
        epochs=200, batch_size=2048, shuffle=True, \
        )

Epoch 1/200