# ma_lstm_baseline.py
import datetime
import itertools
import os
import re
from time import time

import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import KeyedVectors
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Embedding, LSTM, Merge
from keras.models import Model
from keras.optimizers import Adadelta
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
# Runtime configuration and file paths.
# Expose GPUs 0-6 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3,4,5,6"

DATA = './corpus/pairs2.csv'
CPP_EMBEDDING_FILE = './baselines/cpp2vec'
JAVA_EMBEDDING_FILE = './baselines/java2vec'
# Relative to the working directory, like the other paths; a leading '/'
# would resolve from the filesystem root instead.
MODEL_SAVING_DIR = './baselines/logs/ma_lstm/'

# Load the pair corpus and hold out 20% of its rows as the test set.
df = pd.read_csv(DATA)
# Copy both splits: their text columns are overwritten in place further down,
# and writing into a slice of `df` triggers pandas' SettingWithCopy warning.
train_df, test_df = (part.copy() for part in train_test_split(df, test_size=0.2))
# Prepare embedding
vocabulary = dict()
# '<unk>' is never looked up; it only occupies index 0 so that id 0 stays
# reserved for the all-zero padding embedding.
inverse_vocabulary = ['<unk>']
cpp_word2vec = KeyedVectors.load(CPP_EMBEDDING_FILE)
java_word2vec = KeyedVectors.load(JAVA_EMBEDDING_FILE)
questions_cols = ['question1', 'question2']


def _keyed_vectors(model):
    """Return the word vectors behind *model*.

    A full Word2Vec model exposes them as ``.wv``; a bare KeyedVectors object
    is returned unchanged.
    """
    return getattr(model, 'wv', model)


def _tokens(cell):
    """Turn one raw cell into a list of tokens.

    Strings are split on whitespace (iterating a ``str`` directly would yield
    single characters), missing values give no tokens, and anything else is
    treated as an already tokenised sequence.
    """
    # NOTE(review): assumes tokens are whitespace-separated in the CSV --
    # confirm this matches how cpp2vec / java2vec were trained.
    if isinstance(cell, str):
        return cell.split()
    if cell is None or (isinstance(cell, float) and cell != cell):  # None / NaN
        return []
    return list(cell)


def _encode_tokens(tokens, word_vectors):
    """Map *tokens* to vocabulary ids, registering tokens seen for the first time.

    Tokens that have no vector in *word_vectors* are dropped. The shared
    ``vocabulary`` / ``inverse_vocabulary`` tables are updated in place.
    """
    ids = []
    for token in tokens:
        if token not in word_vectors:
            continue
        token_id = vocabulary.get(token)
        if token_id is None:
            token_id = len(inverse_vocabulary)
            vocabulary[token] = token_id
            inverse_vocabulary.append(token)
        ids.append(token_id)
    return ids


# The first column is filtered against the C++ vectors, the second against
# the Java vectors.
_column_vectors = {
    questions_cols[0]: _keyed_vectors(cpp_word2vec),
    questions_cols[1]: _keyed_vectors(java_word2vec),
}

# Replace the text of both columns, in the training and the test set, by
# lists of vocabulary ids.
for dataset in [train_df, test_df]:
    encoded = {question: [] for question in questions_cols}
    for _, row in dataset.iterrows():
        for question in questions_cols:
            encoded[question].append(
                _encode_tokens(_tokens(row[question]), _column_vectors[question]))
    for question in questions_cols:
        # dtype=object keeps every cell a Python list, whatever its length.
        dataset[question] = pd.Series(encoded[question], index=dataset.index, dtype=object)
cpp_embedding_dim = 100
java_embedding_dim = 100

# Work on the KeyedVectors themselves: a full model exposes them as `.wv`,
# a bare KeyedVectors object is used as is. Membership tests (`in`) and
# indexing (`[]`) on KeyedVectors work without touching the `.vocab` dict.
cpp_vectors = getattr(cpp_word2vec, 'wv', cpp_word2vec)
java_vectors = getattr(java_word2vec, 'wv', java_word2vec)

# One row per vocabulary id plus row 0. Rows start out random, so a word
# missing from one language's vectors keeps a random embedding there.
cpp_embeddings = np.random.randn(len(vocabulary) + 1, cpp_embedding_dim)
cpp_embeddings[0] = 0  # So that the padding will be ignored
java_embeddings = np.random.randn(len(vocabulary) + 1, java_embedding_dim)
java_embeddings[0] = 0  # So that the padding will be ignored

# Build the embedding matrices: copy the pre-trained vector of every known word.
for word, index in vocabulary.items():
    if word in cpp_vectors:
        cpp_embeddings[index] = cpp_vectors[word]
    if word in java_vectors:
        java_embeddings[index] = java_vectors[word]

# Longest id sequence over both columns of both splits; used as padding length.
max_seq_length = max(
    frame[column].map(len).max()
    for frame in (train_df, test_df)
    for column in ('question1', 'question2')
)
# Carve a fixed-size validation set out of the training rows
validation_size = 300
training_size = len(train_df) - validation_size

X = train_df[questions_cols]
Y = train_df['is_duplicate']
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size)


def _as_sides(frame):
    """Return the two text columns of *frame* keyed by the model side they feed."""
    return {'left': frame.question1, 'right': frame.question2}


# One dict per split, holding the left and right input of the network
X_train = _as_sides(X_train)
X_validation = _as_sides(X_validation)
X_test = _as_sides(test_df)

# Labels as plain numpy arrays
Y_train = Y_train.values
Y_validation = Y_validation.values

# Zero-pad the training and validation sequences to a common length
for split in (X_train, X_validation):
    for side in ('left', 'right'):
        split[side] = pad_sequences(split[side], maxlen=max_seq_length)

# Sanity checks: both sides line up, and there is one label per pair
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

# Model hyper-parameters
n_hidden = 50
gradient_clipping_norm = 1.25
batch_size = 32
n_epoch = 50
def exponent_neg_manhattan_distance(left, right):
    """Similarity estimate for the two LSTM outputs.

    Sums the absolute element-wise difference of `left` and `right` along
    axis 1 (keeping that axis) and returns exp(-sum).
    """
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)
# Input layers: one sequence of int32 token ids per side of the pair
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

# Frozen embedding lookups initialised from the prepared matrices
cpp_embedding_layer = Embedding(
    len(cpp_embeddings),
    cpp_embedding_dim,
    weights=[cpp_embeddings],
    input_length=max_seq_length,
    trainable=False,
)
java_embedding_layer = Embedding(
    len(java_embeddings),
    java_embedding_dim,
    weights=[java_embeddings],
    input_length=max_seq_length,
    trainable=False,
)

# Left input goes through the C++ embedding, right input through the Java one
encoded_left = cpp_embedding_layer(left_input)
encoded_right = java_embedding_layer(right_input)

# Siamese part: a single LSTM instance encodes both sides
shared_lstm = LSTM(n_hidden)
left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# MaLSTM similarity of the two encodings, one value per pair
malstm_distance = Merge(
    mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
    output_shape=lambda x: (x[0][0], 1),
)([left_output, right_output])

# Wrap inputs and similarity output into a model
malstm = Model([left_input, right_input], [malstm_distance])

# Adadelta with gradient clipping by norm, mean squared error loss
optimizer = Adadelta(clipnorm=gradient_clipping_norm)
malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# Train, timing the whole fit
training_start_time = time()
train_inputs = [X_train['left'], X_train['right']]
validation_inputs = [X_validation['left'], X_validation['right']]
malstm_trained = malstm.fit(
    train_inputs,
    Y_train,
    batch_size=batch_size,
    nb_epoch=n_epoch,
    validation_data=(validation_inputs, Y_validation),
)
elapsed = datetime.timedelta(seconds=time() - training_start_time)
print("Training time finished.\n{} epochs in {}".format(n_epoch, elapsed))
def _plot_history(train_key, validation_key, title, ylabel, legend_loc):
    """Show the training and validation curves of one recorded metric.

    Reads both series from ``malstm_trained.history`` and displays the
    figure with ``plt.show()``.
    """
    plt.plot(malstm_trained.history[train_key])
    plt.plot(malstm_trained.history[validation_key])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc=legend_loc)
    plt.show()


# Accuracy curves first, then loss curves
_plot_history('acc', 'val_acc', 'Model Accuracy', 'Accuracy', 'upper left')
_plot_history('loss', 'val_loss', 'Model Loss', 'Loss', 'upper right')