In [1]:
from gensim.models import Word2Vec, KeyedVectors
# Load the pretrained GoogleNews word2vec vectors from their gzipped binary
# file. `limit=500000` is passed to the loader; presumably it caps how many
# vectors are read (see the gensim documentation).
vecmodel = KeyedVectors.load_word2vec_format(
    '../../../GoogleNews-vectors-negative300.bin.gz',
    binary=True,
    limit=500000,
)


In [22]:

from __future__ import print_function

import os
import sys
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Conv2D, MaxPooling2D
from keras.models import Model
from keras.layers import Embedding
from keras import backend as K
from scipy.stats.stats import pearsonr   
import tensorflow as tf
from nltk.tokenize import RegexpTokenizer


In [3]:
import pandas as pd
# --- Configuration constants ------------------------------------------------
BASE_DIR = ''
TEXT_DATA_DIR = os.path.join('../data/SemEval2014_dataset/')
MAX_SEQUENCE_LENGTH = 30   # used as `maxlen` when padding and as the model input length
MAX_NB_WORDS = 200000      # vocabulary cap handed to the Keras Tokenizer
EMBEDDING_DIM = 300        # width of each row of the embedding matrix
# VALIDATION_SPLIT = 0.2

# NOTE(review): this is a *string* holding a list literal, not a list, and it
# is not referenced anywhere in the visible notebook.
columns = "['pair_ID', 'sentence_A', 'sentence_B', 'relatedness_score','entailment_judgment']"
texts = []

# --- Tab-separated data files -----------------------------------------------
train_df = pd.read_csv("../data/enhance_traindata.csv", sep='\t')
trial_df = pd.read_csv("../data/SemEval2014_dataset/SICK_trial.txt", sep='\t')
test_df = pd.read_csv("../data/SemEval2014_dataset/SICK_test_annotated.txt", sep='\t')

# Show the first rows of the training frame as the cell output.
train_df.head()

Unnamed: 0,entailment_judgment,pair_ID,relatedness_score,sentence_A,sentence_B
0,NEUTRAL,1.0,4.5,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...
1,NEUTRAL,2.0,3.2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...
2,ENTAILMENT,3.0,4.7,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...
3,NEUTRAL,5.0,3.4,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...
4,NEUTRAL,9.0,3.7,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...


In [4]:
# create word_id
# Collect every sentence from all three frames so the tokenizer vocabulary
# covers them all. Order: train A, train B, trial A, trial B, test A, test B.
merged = list(
    sentence
    for frame in (train_df, trial_df, test_df)
    for column in ('sentence_A', 'sentence_B')
    for sentence in frame[column].tolist()
)

# Fit the word -> integer id mapping on the pooled sentences.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(merged)
sequences = tokenizer.texts_to_sequences(merged)


In [5]:
embeddings_index = {}
word_index = tokenizer.word_index
# prepare embedding matrix
# The Keras Tokenizer assigns word ids starting at 1 (0 is reserved, and is
# what pad_sequences fills with), so `len(word_index) + 1` rows are needed to
# address every word. The previous `max(MAX_NB_WORDS, len(word_index))`
# always allocated at least MAX_NB_WORDS rows and was one row short whenever
# the vocabulary reached MAX_NB_WORDS.
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= num_words:
        # Id beyond the vocabulary cap: no row in the matrix for it.
        continue
    # Membership test on the KeyedVectors object itself, rather than on its
    # `.vocab` attribute.
    if word in vecmodel:
        # words not found in the pretrained vectors stay all-zeros.
        embedding_matrix[i] = vecmodel[word]

In [6]:
def get_tokenized_padded_data(data):
    """Convert sentences into a padded 2-D array of word ids.

    Each sentence is split into ``\\w+`` tokens, every token is mapped to its
    id by the module-level ``tokenizer`` (tokens it does not map contribute
    nothing), and the resulting id lists are padded/truncated by
    ``pad_sequences`` to ``MAX_SEQUENCE_LENGTH``.

    Args:
        data: iterable of sentence strings.

    Returns:
        The array produced by ``pad_sequences`` with one row per sentence.
    """
    # Build the regexp tokenizer once; its pattern is the same for every sentence.
    word_splitter = RegexpTokenizer(r'\w+')

    id_sequences = []
    for sentence in data:
        words = word_splitter.tokenize(sentence)
        # texts_to_sequences treats each word as its own text and returns one
        # id list per word; flatten them into one sequence per sentence.
        per_word_ids = tokenizer.texts_to_sequences(words)
        id_sequences.append([idx for ids in per_word_ids for idx in ids])

    return pad_sequences(id_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# get_tokenized_padded_data(['The young "boys" @ are playing outdoors and the man is smiling nearby','Nobody is riding the bicycle on one wheel'])

def get_target_category(data, num_classes=6):
    """One-hot encode scores by rounding each one to the nearest integer class.

    Args:
        data: sequence of numeric scores.
        num_classes: number of columns in the result; valid rounded scores
            are ``0 .. num_classes - 1``. Defaults to 6.

    Returns:
        ``numpy`` array of shape ``(len(data), num_classes)`` holding a single
        1 per row at the column of the rounded score.

    Raises:
        IndexError: if a rounded score falls outside ``0 .. num_classes - 1``.
            Negative values used to wrap around silently through negative
            indexing and mark the wrong class.

    Note:
        Rounding uses the built-in ``round``, so the treatment of exact
        ``.5`` values follows the running Python version.
    """
    target = np.zeros([len(data), num_classes])
    for row, score in enumerate(data):
        value = int(round(score))
        if not 0 <= value < num_classes:
            raise IndexError(
                'rounded score %d is outside the class range 0..%d'
                % (value, num_classes - 1))
        target[row][value] = 1
    return target

def pear_coef(y_true, y_pred):
    """Streaming Pearson correlation between predictions and targets.

    Wraps ``tf.contrib.metrics.streaming_pearson_correlation``. That call
    returns the metric tensor together with an ``update_op``; the metric only
    changes when ``update_op`` runs, and it is backed by local variables that
    must be initialised. Previously ``update_op`` was discarded and nothing
    initialised those variables.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values.

    Returns:
        Tensor holding the accumulated correlation; evaluating it also runs
        the update step for the current batch.

    NOTE(review): the statistics are never reset here, so the value
    presumably keeps accumulating over every batch the metric sees —
    confirm this is the desired behaviour before relying on it.
    """
    # Remember which local variables exist so the ones created by the metric
    # can be identified afterwards.
    known_locals = set(v.name for v in tf.local_variables())
    pearson_r, update_op = tf.contrib.metrics.streaming_pearson_correlation(y_pred, y_true)
    metric_vars = [v for v in tf.local_variables() if v.name not in known_locals]

    # Initialise only the variables this metric just created.
    K.get_session().run(tf.variables_initializer(metric_vars))

    # Tie the returned value to update_op so each evaluation updates the stats.
    with tf.control_dependencies([update_op]):
        pearson_r = tf.identity(pearson_r)
    return pearson_r

def matthews_correlation(y_true, y_pred):
    '''Matthews correlation coefficient for binary classification.

    Both tensors are clipped to [0, 1] and rounded to obtain hard labels;
    the confusion-matrix counts are summed over every element, and the
    coefficient is returned with ``K.epsilon()`` added to the denominator.
    '''
    # Hard predicted labels and their complement.
    pred_is_pos = K.round(K.clip(y_pred, 0, 1))
    pred_is_neg = 1 - pred_is_pos

    # Hard true labels and their complement.
    true_is_pos = K.round(K.clip(y_true, 0, 1))
    true_is_neg = 1 - true_is_pos

    # Confusion-matrix entries.
    true_pos = K.sum(true_is_pos * pred_is_pos)
    true_neg = K.sum(true_is_neg * pred_is_neg)
    false_pos = K.sum(true_is_neg * pred_is_pos)
    false_neg = K.sum(true_is_pos * pred_is_neg)

    top = (true_pos * true_neg - false_pos * false_neg)
    bottom = K.sqrt((true_pos + false_pos) * (true_pos + false_neg) * (true_neg + false_pos) * (true_neg + false_neg))

    return top / (bottom + K.epsilon())

def correlation_coefficient(y_true, y_pred):
    """Pearson correlation between targets and predictions.

    The means and sums are taken over every element of the two tensors
    (no axis is given), so the result is a single scalar.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values, same shape as ``y_true``.

    Returns:
        Scalar tensor with the correlation. When either input has zero
        variance the denominator is floored at ``K.epsilon()`` so the result
        is 0 rather than NaN.
    """
    x = y_true
    y = y_pred
    mx = K.mean(x)
    my = K.mean(y)
    xm, ym = (x - mx), (y - my)
    r_num = K.sum(xm * ym)
    r_den = K.sqrt(K.sum(K.square(xm))) * K.sqrt(K.sum(K.square(ym)))
    # Floor the denominator: a constant input gives r_den == 0, which
    # previously produced NaN.
    r = r_num / K.maximum(r_den, K.epsilon())
    return r

In [7]:

# Training split (train_df): padded id sequences for both sentences + one-hot targets.
xA_train = get_tokenized_padded_data(train_df['sentence_A'].tolist())
xB_train = get_tokenized_padded_data(train_df['sentence_B'].tolist())
y_train = get_target_category(train_df['relatedness_score'].tolist())

# Validation split (trial_df).
xA_val = get_tokenized_padded_data(trial_df['sentence_A'].tolist())
xB_val = get_tokenized_padded_data(trial_df['sentence_B'].tolist())
y_val = get_target_category(trial_df['relatedness_score'].tolist())

# Test split (test_df).
xA_test = get_tokenized_padded_data(test_df['sentence_A'].tolist())
xB_test = get_tokenized_padded_data(test_df['sentence_B'].tolist())
y_test = get_target_category(test_df['relatedness_score'].tolist())

# Show the first encoded training sentence as a quick sanity check.
print(xA_train[0])


[  0   0   0   0   0   0   0   0   0   0   0   0   1  87   6 219   3  16
   4   1 616   5  25 285  10   3  42   4   2 402]


In [8]:
# Sanity check: print the shape of the padded sentence-A training inputs and
# display the shape of the one-hot training targets as the cell output.
print(xA_train.shape)
y_train.shape

(15065, 30)


(15065, 6)

In [9]:
# inp = Input(shape=(MAX_SEQUENCE_LENGTH,))

In [10]:
# embedding_layer = Embedding(num_words + 1,
#                             EMBEDDING_DIM,
#                             weights=[embedding_matrix],
#                             input_length=MAX_SEQUENCE_LENGTH,
#                             trainable=False)

In [11]:
# seqA_input = Input(shape=(MAX_SEQUENCE_LENGTH,))

# embedding_layer = Embedding(num_words,
#                             EMBEDDING_DIM,
#                             weights=[embedding_matrix],
#                             input_length=MAX_SEQUENCE_LENGTH,
#                             trainable=False)

# print (seqA_input.shape)
# x_A = embedding_layer(seqA_input)
# print (x_A.shape)
# x_A = Convolution2D(filters=300,
#                          kernel_size=[ 1 , 1 ],
#                          padding="valid",
#                          activation="relu",
#                          strides=1)(x_A)

# print (x_A.shape)
# x_A = MaxPooling2D(pool_size=[MAX_SEQUENCE_LENGTH,1])(x_A)

# # x_B = embedding_layer(sequence_input)
# # x_B = Convolution2D(filters=300,
# #                          kernel_size=[ 1 , 1 ],
# #                          padding="valid",
# #                          activation="relu",
# #                          strides=1)(x_A)

# # x_B = MaxPooling2D(pool_size=[MAX_SEQUENCE_LENGTH,1])(x_B)


# x_A = Dense(300, activation='relu')(x_A)
# preds = Dense(5, activation='softmax')(x_A)
# model = Model(seqA_input, preds)
# model.compile(loss='categorical_crossentropy',
#               optimizer='rmsprop',
#               metrics=['acc'])
# model.summary()
# # model.fit(xA_train, y_train,
# #           batch_size=128,
# #           epochs=10,
# #           validation_data=(xA_val, y_val))

In [23]:
# One input of MAX_SEQUENCE_LENGTH word ids per sentence of the pair.
seqA_input = Input(shape=(MAX_SEQUENCE_LENGTH,),)
seqB_input = Input(shape=(MAX_SEQUENCE_LENGTH,))

# A single frozen embedding layer, initialised from embedding_matrix and
# applied to both inputs.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print (seqA_input.shape)
# Sentence A branch: embedding -> width-1 convolution -> max-pool with a
# window of MAX_SEQUENCE_LENGTH.
x_A = embedding_layer(seqA_input)
x_A = Conv1D(filters=300,kernel_size=1,padding="valid",activation="relu",strides=1,kernel_initializer='he_uniform')(x_A)
x_A = MaxPooling1D(pool_size=[MAX_SEQUENCE_LENGTH])(x_A)

# Sentence B branch: same structure, but with its own Conv1D layer (the
# convolution weights are not shared with branch A).
x_B = embedding_layer(seqB_input)
x_B = Conv1D(filters=300,kernel_size=1,padding="valid",activation="relu",strides=1,kernel_initializer='he_uniform')(x_B)
x_B = MaxPooling1D(pool_size=[MAX_SEQUENCE_LENGTH])(x_B)

# Compare the two sentence representations element-wise.
diff = keras.layers.Subtract()([x_A, x_B])
prod = keras.layers.Multiply()([x_A, x_B])


x = keras.layers.concatenate([diff, prod])


x = Dense(300, activation='tanh')(x)
x = GlobalMaxPooling1D()(x)
# Six softmax outputs, matching the six columns of the one-hot targets.
preds = Dense(6, activation='softmax')(x)
model = Model(inputs=[seqA_input,seqB_input], outputs=preds)


# Keep the configured optimizer and hand it to compile(); previously the Adam
# instance was created and discarded while compile() received the string
# 'adam', so these settings were never applied.
optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='mean_squared_error',
              optimizer=optimizer,
              metrics=[correlation_coefficient])


(?, 30)


In [None]:
# Train on the sentence pairs for 50 epochs with batches of 339 samples,
# evaluating on the validation arrays after each epoch.
# NOTE(review): the batch size 339 is unexplained in the notebook — confirm
# whether it is deliberate.
model.fit([xA_train,xB_train], y_train,
          batch_size=339,
          epochs=50,validation_data=([xA_val,xB_val], y_val))

Train on 15065 samples, validate on 500 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50

In [None]:
# Evaluate the trained model on the test arrays; the returned values (loss
# and the compiled metric) are displayed as the cell output.
model.evaluate([xA_test,xB_test], y_test, batch_size=339)

In [None]:
# Ad-hoc prediction for a single hand-written sentence pair.
# Alternative example pair, kept for reference:
# sent_A = get_tokenized_padded_data(np.array(['There is no boy playing outdoors and there is no man smiling']))
# sent_B = get_tokenized_padded_data(np.array(['A group of kids is playing in a yard and an old man is standing in the background']))

# Encode each sentence as a one-row batch of padded word ids.
sent_A = get_tokenized_padded_data(np.array(['The cat sits on the mat']))
sent_B = get_tokenized_padded_data(np.array(['This models predicts semantic similarity']))

# sent_A = np.reshape(sent_A, [1])
# sent_B = np.reshape(sent_B, [-1,1])
# Run the model on the pair and display the raw output as the cell result.
pred = model.predict([sent_A, sent_B])
pred

In [None]:

import matplotlib.pyplot as plt;
import numpy as np
import matplotlib.pyplot as plt

# Close the current pyplot figure (if any) before drawing.
plt.close()

# Bar chart of the first row of `pred`: one bar per output class.
y = pred[0]
N = len(y)
# One bar position per class, derived from the prediction length instead of
# a hard-coded list.
x = list(range(N))
width = 1

fig, ax = plt.subplots()
ax.bar(x, y, width, color="blue")

# plt.xticks(x, y)
ax.set_xticks(x)
ax.set_xlabel('Similarity score')
ax.set_ylabel('Probability distribution of similarity score')
# The title was previously cut off mid-word ('Prediction of simila').
ax.set_title('Prediction of similarity')

plt.show()
