In [24]:
import pandas as pd
import numpy as np
import re
from gensim.models import KeyedVectors
from sklearn.model_selection import StratifiedShuffleSplit
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [25]:
training_dataset = pd.read_csv('dataset/train.csv')
testing_dataset = pd.read_csv('dataset/test.csv')

print('train dataset shape: ', training_dataset.shape)
print('train dataset shape: ', testing_dataset.shape)
print('train columns: ', training_dataset.columns)
print('test columns: ', testing_dataset.columns)

train dataset shape:  (9349, 3)
train dataset shape:  (493, 2)
train columns:  Index(['gold_label', 'sentence1', 'sentence2'], dtype='object')
test columns:  Index(['sentence1', 'sentence2'], dtype='object')


In [26]:
# any cleaning needed?
training_dataset.sentence1.str.contains(r'[^A-Za-z\-]').any()

True

In [27]:
def clean_sents(sentence):
    return re.sub('[^A-Za-z\-]+', ' ', str(sentence)).replace("'", '').lower()

In [28]:
train_df = training_dataset.copy()

In [29]:
train_df.sentence1 = train_df.sentence1.apply(clean_sents)
train_df.sentence2 = train_df.sentence2.apply(clean_sents)

In [30]:
training_dataset.iloc[29]

gold_label                              entailment
sentence1     A woman is cleaning an outdoor pool.
sentence2                 There is a pool outside.
Name: 29, dtype: object

In [31]:
train_df.iloc[29]

gold_label                              entailment
sentence1     a woman is cleaning an outdoor pool 
sentence2                 there is a pool outside 
Name: 29, dtype: object

In [32]:
train_df.gold_label.unique()

array(['contradiction', 'entailment', 'neutral'], dtype=object)

In [34]:
gold_label_encoded, gold_label_categories = train_df.gold_label.factorize()
gold_label_encoded[:10]

array([0, 1, 2, 2, 1, 0, 2, 1, 0, 1], dtype=int64)

In [35]:
# convert y labels to one hot encodings
# from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import OneHotEncoder

en = OneHotEncoder()
train_df['y'] = en.fit_transform(gold_label_encoded.reshape(1,-1)).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


ValueError: Length of values does not match length of index

In [None]:
# # few labels, so label encoding directly
# train_df.gold_label = train_df.gold_label.map({'contradiction':0, 'entailment':1, 'neutral':2})

In [None]:
max_vocab_size = 100000

w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors/GoogleNews-vectors-negative300.bin', 
                                              binary=True, limit=max_vocab_size)

In [None]:
# w2v_model['man']

In [None]:
embeddings = w2v_model.vectors[:max_vocab_size, :]
embeddings = np.concatenate((np.zeros((1,300)), embeddings))
embeddings.shape

In [None]:
word2index = {word: i+1 for i, word in enumerate(w2v_model.index2word) if i < max_vocab_size}

In [None]:
print('word index: {}'.format(word2index['man']))
print('word vector: ', embeddings[word2index['man']])

In [None]:
# data preparation
def words_to_embeddings(sentence, word2index):
    return np.array([word2index[wrd] if wrd in word2index else 0 for wrd in sentence.split(' ')])

In [None]:
train_df['x1'] = train_df.sentence1.apply(lambda x: words_to_embeddings(x, word2index))
train_df['x2'] = train_df.sentence2.apply(lambda x: words_to_embeddings(x, word2index))

In [None]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=29)

for train_index, val_index in sss.split(train_df.drop(['sentence1','sentence2'],axis=1), train_df['gold_label']):
    train_set = train_df.drop(['sentence1','sentence2'],axis=1).loc[train_index]
    val_set = train_df.drop(['sentence1','sentence2'],axis=1).loc[val_index]

In [None]:
train_set.rename(columns={'gold_label':'y'}, inplace=True)

In [None]:
max(train_set.x1.map(len)), max(train_set.x2.map(len))

In [None]:
max_seq_len = max( max(train_set.x1.map(len)), max(train_set.x2.map(len)) )

x1_padded = pad_sequences(train_set.x1, maxlen=max_seq_len)
x2_padded = pad_sequences(train_set.x2, maxlen=max_seq_len)

In [None]:
train_set.iloc[0]

In [None]:
x1_padded[0]

In [None]:
x2_padded[0]

In [None]:
# looking at the actual sentences by the row index in train_set
train_df.iloc[3796]

In [None]:
y = train_set.y.values
y.shape

In [None]:
print(x1_padded.shape)
print(x2_padded.shape)

**attempt 1**

In [None]:
# from sklearn.linear_model import SGDClassifier

# sgd_clf = SGDClassifier(random_state=29)
# sgd_clf.fit(np.c_[x1_padded, x2_padded], y)

In [None]:
val_x1_padded = pad_sequences(val_set.x1, maxlen=max_seq_len)
val_x2_padded = pad_sequences(val_set.x2, maxlen=max_seq_len)
val_y = val_set.gold_label.values
print(val_y.shape)

# y_pred_val = sgd_clf.predict(np.c_[val_x1_padded, val_x2_padded])

In [None]:
# confusion_matrix(val_y, y_pred_val)

In [None]:
# precision_score(val_y, y_pred_val, average='macro')

In [None]:
# recall_score(val_y, y_pred_val, average='macro')

In [None]:
# f1_score(val_y, y_pred_val, average='macro')

**attempt 2**

In [None]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Embedding, LSTM, GRU

In [None]:
model = Sequential()
model.add(Embedding(embeddings.shape[0], embeddings.shape[1], input_length=max_seq_len*2)) # vocab_size = embedding.shape[0]; embedding_dim = embeddings.shape[1]
model.add(GRU(units=32, dropout=(0.2), recurrent_dropout=(0.2)))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# try other optimizers as well
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
X_train = np.c_[x1_padded, x2_padded]
X_val = np.c_[val_x1_padded, val_x2_padded]
y_train = y
y_val = val_y

In [None]:
model.fit(X_train, y_train, batch_size=51, epochs=15, validation_data=(X_val, y_val), verbose=2)

**attempt 3**

In [None]:
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Flatten

In [None]:
model = Sequential()
model.add(Embedding(embeddings.shape[0], embeddings.shape[1], input_length=max_seq_len*2))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=51, epochs=15, validation_data=(X_val, y_val), verbose=2)

**attempt 4**

In [None]:
from keras.layers import Bidirectional
from keras.optimizers import RMSprop, Adam, SGD, Adagrad

In [None]:
model = Sequential()
model.add(Embedding(embeddings.shape[0], embeddings.shape[1], input_length=max_seq_len*2))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(GRU(units=128, dropout=(0.2), recurrent_dropout=(0.2)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax'))
print(model.summary())

optimizer = Adam(lr=0.001, epsilon=1e-08)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=32, epochs=15, validation_data=(X_val, y_val), verbose=2)