In [1]:
from gensim.models import Word2Vec, KeyedVectors
vecmodel = KeyedVectors.load_word2vec_format('../../../GoogleNews-vectors-negative300.bin.gz', binary=True, limit=500000)

In [2]:
from __future__ import print_function

import os
import sys
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Lambda
from keras.layers import Conv1D, MaxPooling1D, Embedding, Conv2D, MaxPooling2D,Convolution2D
from keras.models import Model
from keras.layers import Embedding
from keras import backend as K
from scipy.stats.stats import pearsonr   
import tensorflow as tf
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from keras.layers.core import Reshape, Flatten
import re
import string

Using TensorFlow backend.


In [3]:
import pandas as pd
columns = "['pair_ID', 'sentence_A', 'sentence_B', 'relatedness_score','entailment_judgment']"
train_df = pd.read_csv("../data/enhance_traindata.csv", sep='\t')
trial_df = pd.read_csv("../data/SemEval2014_dataset/SICK_trial.txt", sep='\t')
test_df = pd.read_csv("../data/SemEval2014_dataset/SICK_test_annotated.txt", sep='\t')

texts = []
BASE_DIR = ''
TEXT_DATA_DIR = os.path.join('../data/SemEval2014_dataset/')
MAX_SEQUENCE_LENGTH = 75
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
# VALIDATION_SPLIT = 0.2
train_df.head()

Unnamed: 0,entailment_judgment,pair_ID,relatedness_score,sentence_A,sentence_B
0,NEUTRAL,1.0,4.5,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...
1,NEUTRAL,2.0,3.2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...
2,ENTAILMENT,3.0,4.7,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...
3,NEUTRAL,5.0,3.4,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...
4,NEUTRAL,9.0,3.7,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...


In [4]:
# create word_id
merged = train_df['sentence_A'].tolist()
merged.extend(train_df['sentence_B'].tolist())
merged.extend(trial_df['sentence_A'].tolist())
merged.extend(trial_df['sentence_B'].tolist())
merged.extend(test_df['sentence_A'].tolist())
merged.extend(test_df['sentence_B'].tolist())

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(merged)
sequences = tokenizer.texts_to_sequences(merged)

In [5]:
embeddings_index = {}
word_index = tokenizer.word_index
# prepare embedding matrix
num_words = max(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if word in vecmodel.vocab:
        embedding_vector = vecmodel[word]
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

In [6]:
def get_tokenized_padded_data(data):
    
#     t = Tokenizer(num_words=MAX1_NB_WORDS)
#     t.fit_on_texts(data)
#     r = RegexpTokenizer(r'\w+')
#     r.tokenize(data)
    padded_data = []
    for d in data:
        r = RegexpTokenizer(r'\w+')
        c = r.tokenize(d)
        seq = tokenizer.texts_to_sequences(c)
        seq = [item for sublist in seq for item in sublist]
        padded_data.append(seq)    
    padded_data = pad_sequences(padded_data, maxlen=MAX_SEQUENCE_LENGTH) 
    return padded_data

from sklearn.preprocessing import LabelEncoder
# get_tokenized_padded_data(['The young "boys" @ are playing outdoors and the man is smiling nearby','Nobody is riding the bicycle on one wheel'])
def get_target_value(data):
    
    target= []
    for i in range(len(data)):
        target.append(int(round(data[i])));
    return target


def get_target_category(data):
    data = get_target_value(data)
    encoder = LabelEncoder()
    class_val = [0,1,2,3,4,5] 
    encoder.fit(class_val)
    encoded_Y = encoder.transform(data)
    # convert integers to dummy variables (i.e. one hot encoded)
    target = keras.utils.to_categorical(encoded_Y)
    return target

def pear_coef(y_true, y_pred):
    pearson_r, update_op = tf.contrib.metrics.streaming_pearson_correlation(y_pred, y_true)
    return pearson_r

def matthews_correlation(y_true, y_pred):
    '''Calculates the Matthews correlation coefficient measure for quality
    of binary classification problems.
    '''
   
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)

    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return numerator / (denominator + K.epsilon())

def correlation_coefficient(y_true, y_pred):
    x = y_true
    y = y_pred
    mx = K.mean(x)
    my = K.mean(y)
    xm, ym = (x - mx), (y-my)
    r_num = K.sum(xm * ym)
    r_den = K.sqrt(K.sum(K.square(xm))) * K.sqrt(K.sum(K.square(ym)))
    r = r_num / r_den
    return r

In [7]:

xA_train = get_tokenized_padded_data(train_df['sentence_A'].tolist())
print(xA_train[0])
xA_val = get_tokenized_padded_data(trial_df['sentence_A'].tolist())
xA_test = get_tokenized_padded_data(test_df['sentence_A'].tolist())

xB_train = get_tokenized_padded_data(train_df['sentence_B'].tolist())
xB_val = get_tokenized_padded_data(trial_df['sentence_B'].tolist())
xB_test = get_tokenized_padded_data(test_df['sentence_B'].tolist())

y_train = get_target_category(train_df['relatedness_score'].tolist())
y_val = get_target_category(trial_df['relatedness_score'].tolist())
y_test = get_target_category(test_df['relatedness_score'].tolist())

# y_train = get_target_value(train_df['relatedness_score'].tolist())
# y_val = get_target_value(trial_df['relatedness_score'].tolist())
# y_test = get_target_value(test_df['relatedness_score'].tolist())


[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   1  87   6 219   3  16   4   1 616   5  25 285  10   3  42
   4   2 402]


In [8]:
print(train_df['sentence_A'][0])
print(train_df['sentence_B'][0])
print(y_train[0])
print(y_train.shape)
print(y_val.shape)

A group of kids is playing in a yard and an old man is standing in the background
A group of boys in a yard is playing and a man is standing in the background
[ 0.  0.  0.  0.  0.  1.]
(15065, 6)
(500, 6)


In [9]:
seqA_input = Input(shape=(MAX_SEQUENCE_LENGTH,),)
seqB_input = Input(shape=(MAX_SEQUENCE_LENGTH,),)

embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

x_A = embedding_layer(seqA_input)
x_B = embedding_layer(seqB_input)
x = keras.layers.Concatenate()([x_A, x_B])
reshape = Reshape((2,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM))(x)
x = Convolution2D(filters=EMBEDDING_DIM,kernel_size=(1,1),activation="relu",kernel_initializer='he_uniform')(reshape)
x = MaxPooling2D(pool_size=(1,MAX_SEQUENCE_LENGTH),strides=(1,1))(x)

x_A = Lambda(lambda x: x[:, 0])(x)
x_B = Lambda(lambda x: x[:, 1])(x)

diff = keras.layers.Subtract()([x_A, x_B])
prod = keras.layers.Multiply()([x_A, x_B])


nn = keras.layers.Concatenate()([diff, prod])



nn = Dense(300, activation='tanh',kernel_initializer='he_uniform')(nn)
nn = GlobalMaxPooling1D()(nn)
preds = Dense(6, activation='softmax')(nn)
model = Model(inputs=[seqA_input,seqB_input], outputs=preds)

opt = keras.optimizers.Adam(lr=0.001)
model.compile(loss='mean_squared_error',
              optimizer=opt,
              metrics=[correlation_coefficient,'accuracy'])

In [13]:
# model.summary()

In [None]:
model.fit([xA_train,xB_train], y_train,
          batch_size=100,
          epochs=10,validation_data=([xA_val,xB_val], y_val))

Train on 15065 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

In [12]:
model.evaluate([xA_test,xB_test], y_test, batch_size=339)



[0.11708173517350229, 0.43258842115763213, 0.45747919972537615]