In [26]:
import pandas as pd
import numpy as np
import re
from gensim.models import KeyedVectors
from sklearn.model_selection import StratifiedShuffleSplit
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from keras.utils.np_utils import to_categorical

In [91]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Flatten, Dropout
from keras.layers import Bidirectional
from keras.optimizers import RMSprop, Adam, SGD, Adagrad
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [3]:
training_dataset = pd.read_csv('dataset/train.csv')
testing_dataset = pd.read_csv('dataset/test.csv')

print('train dataset shape: ', training_dataset.shape)
print('train dataset shape: ', testing_dataset.shape)
print('train columns: ', training_dataset.columns)
print('test columns: ', testing_dataset.columns)

train dataset shape:  (9349, 3)
train dataset shape:  (493, 2)
train columns:  Index(['gold_label', 'sentence1', 'sentence2'], dtype='object')
test columns:  Index(['sentence1', 'sentence2'], dtype='object')


In [4]:
def clean_sents(sentence):
    return re.sub('[^A-Za-z\-]+', ' ', str(sentence)).replace("'", '').lower()

In [5]:
train_df = training_dataset.copy()
train_df.sentence1 = train_df.sentence1.apply(clean_sents)
train_df.sentence2 = train_df.sentence2.apply(clean_sents)

In [7]:
X = train_df.drop(['gold_label'], axis=1)
y = train_df[['gold_label']]

In [15]:
y_encoded, y_categories = y['gold_label'].factorize()
y_encoded[:10]

array([0, 1, 2, 2, 1, 0, 2, 1, 0, 1], dtype=int64)

In [16]:
y_categories

Index(['contradiction', 'entailment', 'neutral'], dtype='object')

In [29]:
y = to_categorical(y_encoded, num_classes=3)

In [65]:
type(y)

numpy.ndarray

In [30]:
max_vocab_size = 100000

w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors/GoogleNews-vectors-negative300.bin', 
                                              binary=True, limit=max_vocab_size)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [31]:
embeddings = w2v_model.vectors[:max_vocab_size, :]
embeddings = np.concatenate((np.zeros((1,300)), embeddings))
embeddings.shape

(100001, 300)

In [32]:
word2index = {word: i+1 for i, word in enumerate(w2v_model.index2word) if i < max_vocab_size}

In [33]:
# print('word index: {}'.format(word2index['man']))
# print('word vector: ', embeddings[word2index['man']])

In [34]:
def words_to_embeddings(sentence, word2index):
    return np.array([word2index[wrd] if wrd in word2index else 0 for wrd in sentence.split(' ')])

X['x1'] = X.sentence1.apply(lambda x: words_to_embeddings(x, word2index))
X['x2'] = X.sentence2.apply(lambda x: words_to_embeddings(x, word2index))

In [66]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=29)

for train_index, test_index in sss.split(X.drop(['sentence1','sentence2'],axis=1), y):
    X_train = X.drop(['sentence1','sentence2'],axis=1).loc[train_index]
    X_test = X.drop(['sentence1','sentence2'],axis=1).loc[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

In [67]:
X_train.sample(2)

Unnamed: 0,x1,x2
5857,"[0, 534, 252, 2362, 0, 988, 5872, 0, 988, 7946...","[73, 20, 308, 3052, 421, 4, 252, 2, 988, 0]"
9147,"[0, 5268, 391, 380, 70, 0, 6562, 0]","[12, 642, 5, 5268, 0]"


In [70]:
y_train[:2]

array([[0., 1., 0.],
       [1., 0., 0.]], dtype=float32)

In [71]:
max(X_train.x1.map(len)), max(X_train.x2.map(len))

(51, 49)

In [72]:
max_seq_len = max( max(X_train.x1.map(len)), max(X_train.x2.map(len)) )

x1_padded = pad_sequences(X_train.x1, maxlen=max_seq_len)
x2_padded = pad_sequences(X_train.x2, maxlen=max_seq_len)

x1_test_padded = pad_sequences(X_test.x1, maxlen=max_seq_len)
x2_test_padded = pad_sequences(X_test.x2, maxlen=max_seq_len)

In [73]:
train_set = np.c_[x1_padded, x2_padded]
test_set = np.c_[x1_test_padded, x2_test_padded]
train_set

array([[    0,     0,     0, ...,  1214,   435,     0],
       [    0,     0,     0, ...,     2,  3127,     0],
       [    0,     0,     0, ...,    13,     0, 13674],
       ...,
       [    0,     0,     0, ...,     5,     0,     0],
       [    0,     0,     0, ...,    28, 36566,     0],
       [    0,     0,     0, ...,     6,    12,  1393]])

In [74]:
test_set

array([[    0,     0,     0, ...,     5,  2265,     0],
       [    0,     0,     0, ..., 11240,  2458,     0],
       [    0,     0,     0, ...,    20,   587,     0],
       ...,
       [    0,     0,     0, ...,   581,  1002,     0],
       [    0,     0,     0, ...,   224,    67,     0],
       [    0,     0,     0, ...,   252,     5,  2026]])

In [78]:
print(X.shape)
print(train_set.shape)
print(test_set.shape)
print()
print(y.shape)
print(y_train.shape)
print(y_test.shape)

(9349, 4)
(7479, 102)
(1870, 102)

(9349, 3)
(7479, 3)
(1870, 3)


In [80]:
model = Sequential()
model.add(Embedding(embeddings.shape[0], embeddings.shape[1], input_length=max_seq_len*2))
model.add(GRU(units=32, dropout=(0.2), recurrent_dropout=(0.2)))
# model.add(Dense(3, activation='tanh')) #TODO: rem this, too much complexity
model.add(Dense(3, activation='softmax'))

optimizer = Adam(lr=0.001, epsilon=1e-08)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.fit(train_set, y_train, batch_size=32, epochs=10, validation_data=(test_set, y_test), verbose=2)

Train on 7479 samples, validate on 1870 samples
Epoch 1/10
 - 70s - loss: 0.5960 - acc: 0.6879 - val_loss: 0.5563 - val_acc: 0.7087
Epoch 2/10
 - 69s - loss: 0.4847 - acc: 0.7689 - val_loss: 0.5595 - val_acc: 0.7191
Epoch 3/10
 - 73s - loss: 0.4042 - acc: 0.8155 - val_loss: 0.5990 - val_acc: 0.7157
Epoch 4/10
 - 74s - loss: 0.3511 - acc: 0.8432 - val_loss: 0.6562 - val_acc: 0.7100
Epoch 5/10
 - 74s - loss: 0.3108 - acc: 0.8634 - val_loss: 0.6898 - val_acc: 0.7080
Epoch 6/10
 - 73s - loss: 0.2805 - acc: 0.8765 - val_loss: 0.7463 - val_acc: 0.7064
Epoch 7/10
 - 72s - loss: 0.2563 - acc: 0.8906 - val_loss: 0.7904 - val_acc: 0.7005
Epoch 8/10
 - 70s - loss: 0.2403 - acc: 0.8975 - val_loss: 0.8328 - val_acc: 0.7018
Epoch 9/10
 - 68s - loss: 0.2213 - acc: 0.9044 - val_loss: 0.8869 - val_acc: 0.7018
Epoch 10/10
 - 73s - loss: 0.2067 - acc: 0.9125 - val_loss: 0.9390 - val_acc: 0.6959


<keras.callbacks.History at 0x1357f1a50f0>

In [81]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 102, 300)          30000300  
_________________________________________________________________
gru_3 (GRU)                  (None, 32)                31968     
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 99        
Total params: 30,032,367
Trainable params: 30,032,367
Non-trainable params: 0
_________________________________________________________________


In [86]:
model2 = Sequential()
model2.add(Embedding(embeddings.shape[0], embeddings.shape[1], input_length=max_seq_len*2))
model2.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(rate=0.2))
model2.add(Flatten())
model2.add(Dense(100, activation='relu'))
model2.add(Dropout(0.5))
model2.add(Dense(3, activation='softmax'))

print(model2.summary())

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print("Training...")
model2.fit(train_set, y_train, batch_size=32, epochs=5, validation_data=(test_set, y_test), verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 102, 300)          30000300  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 98, 128)           192128    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 49, 128)           0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 49, 128)           0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 6272)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 100)               627300    
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
__________

<keras.callbacks.History at 0x13652c92828>

In [88]:
model3 = Sequential()
model3.add(Embedding(embeddings.shape[0], embeddings.shape[1], input_length=max_seq_len*2))
model3.add(GRU(units=32, dropout=(0.2), recurrent_dropout=(0.2)))
model3.add(Dropout(rate=0.4))
model3.add(Dense(3, activation='softmax'))
print(model3.summary())

optimizer = Adam(lr=0.001, epsilon=1e-08)
model3.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model3.fit(train_set, y_train, batch_size=32, epochs=6, validation_data=(test_set, y_test), verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 102, 300)          30000300  
_________________________________________________________________
gru_5 (GRU)                  (None, 32)                31968     
_________________________________________________________________
dropout_6 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 3)                 99        
Total params: 30,032,367
Trainable params: 30,032,367
Non-trainable params: 0
_________________________________________________________________
None
Train on 7479 samples, validate on 1870 samples
Epoch 1/6
 - 71s - loss: 0.6051 - acc: 0.6832 - val_loss: 0.5587 - val_acc: 0.7178
Epoch 2/6
 - 68s - loss: 0.5006 - acc: 0.7601 - val_loss: 0.5630 - val_acc: 0.7212
Epoch 3/6
 - 70s - loss: 0.4197 - acc: 0.80

<keras.callbacks.History at 0x1376065b748>

In [90]:
model4 = Sequential()
model4.add(Embedding(embeddings.shape[0], embeddings.shape[1], input_length=max_seq_len*2))
model4.add(GRU(units=32, dropout=(0.4), recurrent_dropout=(0.4)))
model4.add(Dropout(rate=0.4))
model4.add(Dense(3, activation='softmax'))
print(model4.summary())

optimizer = RMSprop(lr=0.001, epsilon=1e-08) #Adam(lr=0.001, epsilon=1e-08)
model4.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model4.fit(train_set, y_train, batch_size=32, epochs=5, validation_data=(test_set, y_test), verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 102, 300)          30000300  
_________________________________________________________________
gru_7 (GRU)                  (None, 32)                31968     
_________________________________________________________________
dropout_8 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 3)                 99        
Total params: 30,032,367
Trainable params: 30,032,367
Non-trainable params: 0
_________________________________________________________________
None
Train on 7479 samples, validate on 1870 samples
Epoch 1/5
 - 64s - loss: 0.6182 - acc: 0.6730 - val_loss: 0.5849 - val_acc: 0.6982
Epoch 2/5
 - 60s - loss: 0.5563 - acc: 0.7194 - val_loss: 0.5586 - val_acc: 0.7201
Epoch 3/5
 - 56s - loss: 0.5117 - acc: 0.75

<keras.callbacks.History at 0x137668c67f0>

In [93]:
# best so far
model4 = Sequential()
model4.add(Embedding(embeddings.shape[0], embeddings.shape[1], input_length=max_seq_len*2))
model4.add(GRU(units=32, dropout=(0.4), recurrent_dropout=(0.4)))
model4.add(Dropout(rate=0.4))
model4.add(Dense(3, activation='softmax'))
print(model4.summary())

optimizer = RMSprop(lr=0.001, epsilon=1e-08) #Adam(lr=0.001, epsilon=1e-08)
model4.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

filepath = "model-epoch-{epoch:02d}-val_acc-{val_acc:.4f}.hdf5"
chk_point = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, save_weights_only=False)

callbacks_list = [chk_point]

model4.fit(train_set, y_train, batch_size=32, epochs=10, validation_data=(test_set, y_test), callbacks=callbacks_list)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 102, 300)          30000300  
_________________________________________________________________
gru_9 (GRU)                  (None, 32)                31968     
_________________________________________________________________
dropout_10 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 3)                 99        
Total params: 30,032,367
Trainable params: 30,032,367
Non-trainable params: 0
_________________________________________________________________
None
Train on 7479 samples, validate on 1870 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.69661, saving model to model-epoch-01-val_acc-0.6966.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.69661 to 0.71889, saving model to model-epoch

<keras.callbacks.History at 0x1376c95c978>

In [97]:
model5 = Sequential()
model5.add(Embedding(embeddings.shape[0], embeddings.shape[1], input_length=max_seq_len*2))
model5.add(Flatten())
model5.add(Dense(500, activation='tanh'))
model5.add(Dropout(rate=0.4))
model5.add(Dense(100, activation='tanh'))
model5.add(Dropout(rate=0.25))
model5.add(Dense(3, activation='softmax'))
print(model5.summary())

optimizer = RMSprop(lr=0.001, epsilon=1e-08) #Adam(lr=0.001, epsilon=1e-08)
model5.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

filepath = "model-5-epoch-{epoch:02d}-val_acc-{val_acc:.4f}.hdf5"
chk_point = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, save_weights_only=False)
callbacks_list = [chk_point]

model5.fit(train_set, y_train, batch_size=32, epochs=5, 
           validation_data=(test_set, y_test), 
           callbacks=callbacks_list
          )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 102, 300)          30000300  
_________________________________________________________________
flatten_7 (Flatten)          (None, 30600)             0         
_________________________________________________________________
dense_24 (Dense)             (None, 500)               15300500  
_________________________________________________________________
dropout_11 (Dropout)         (None, 500)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 100)               50100     
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 3)                 303       
Total para

<keras.callbacks.History at 0x13773f40fd0>

In [99]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=29)

rf_clf.fit(train_set, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=29, verbose=0, warm_start=False)

In [100]:
y_pred_val = rf_clf.predict(test_set)

In [102]:
from sklearn.metrics import precision_score, recall_score, f1_score

print("precision: ", precision_score(y_pred_val, y_test, average='macro'))
print("recall: ", recall_score(y_pred_val, y_test, average='macro'))
print("f1: ", f1_score(y_pred_val, y_test, average='macro'))

precision:  0.12858869762398603
recall:  0.33580839402516727
f1:  0.18527246920315146


In [103]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [104]:
rf_random = RandomizedSearchCV(rf_clf, param_distributions = random_grid, 
                               n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [105]:
rf_random.fit(train_set, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 27.1min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=29, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=50, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)