In [10]:
import os, glob


import random
import sys
import pickle

import pandas as pd

from gensim.corpora.dictionary import Dictionary

# CNN + LSTM for sequence classification (applied below to the CWI 2018 data, not IMDB)
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import GRU, LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint

from nltk import word_tokenize

In [2]:
# Load every training TSV into a single DataFrame. Each row is tagged with the
# file it came from and its language (the name of the sub-directory).
# NOTE(review): read_csv uses its default quote handling here; if sentences can
# contain literal double quotes, quoting=csv.QUOTE_NONE may be needed -- confirm
# against the data.
CWI_COLUMNS = {0: 'HIT_id', 1: 'sentence', 2: 'start', 3: 'end',
               4: 'target_word', 5: 'native_all', 6: 'non_native_all',
               7: 'native_score', 8: 'non_native_score',
               9: 'binary_label', 10: 'prob_label'}

train_dfs = []
traindir = 'CWI 2018 Training Set/'
# os.listdir / glob return entries in arbitrary order; sorting keeps the row
# order (and therefore the training batches) identical from run to run.
for lang in sorted(os.listdir(traindir)):
    lang_dir = os.path.join(traindir, lang)
    if not os.path.isdir(lang_dir):
        # Stray files next to the language folders contain no TSVs to load.
        continue
    for filename in sorted(glob.glob(os.path.join(lang_dir, '*.tsv'))):
        df = pd.read_csv(filename, header=None, sep='\t')
        df = df.rename(columns=CWI_COLUMNS)
        df['filename'] = filename
        df['lang'] = lang
        train_dfs.append(df)

if not train_dfs:
    raise ValueError('No .tsv files found under ' + repr(traindir))

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers.
train_df = pd.concat(train_dfs, ignore_index=True)

In [3]:
# Lower-case the target word and the sentence; the sentence is additionally
# run through word_tokenize and re-joined with single spaces.
target_lower = train_df['target_word'].astype(str).str.lower()
sentence_tokens = train_df['sentence'].astype(str).str.lower().map(word_tokenize)

train_df['target_word_lower'] = target_lower
train_df['sentence_lower'] = sentence_tokens.map(' '.join)

# Document layout: "<lang> <l> <target word> <s> <tokenized sentence>".
train_df['doc'] = train_df['target_word_lower'] + ' <s> ' + train_df['sentence_lower']
train_df['lang_doc'] = train_df['lang'] + ' <l> ' + train_df['doc']

# One whitespace-split token list per row.
train_doc = train_df['lang_doc'].map(str.split)

In [4]:
# Load the test TSVs into one DataFrame per language (keyed by the name of the
# language sub-directory). A language folder may hold several TSV files; they
# are all kept and concatenated, rather than each file replacing the previous
# one under the same key.
# NOTE(review): read_csv uses its default quote handling here; if sentences can
# contain literal double quotes, quoting=csv.QUOTE_NONE may be needed -- confirm
# against the data.
CWI_COLUMNS = {0: 'HIT_id', 1: 'sentence', 2: 'start', 3: 'end',
               4: 'target_word', 5: 'native_all', 6: 'non_native_all',
               7: 'native_score', 8: 'non_native_score',
               9: 'binary_label', 10: 'prob_label'}

test_frames = {}
testdir = 'CWI 2018 Test Set/'
# Sorted so the row order within each language is the same on every run.
for lang in sorted(os.listdir(testdir)):
    lang_dir = os.path.join(testdir, lang)
    if not os.path.isdir(lang_dir):
        continue
    for filename in sorted(glob.glob(os.path.join(lang_dir, '*.tsv'))):
        df = pd.read_csv(filename, header=None, sep='\t')
        df = df.rename(columns=CWI_COLUMNS)
        df['filename'] = filename
        df['lang'] = lang
        test_frames.setdefault(lang, []).append(df)

# Languages without any TSV file get no entry, as before.
test_dfs = {lang: pd.concat(frames, ignore_index=True)
            for lang, frames in test_frames.items()}

In [5]:
# Apply the same text preparation used for the training data to every test
# frame (the new columns are added to the frames held in test_dfs), and keep
# one Series of token lists per language.
test_docs = {}
for lang, frame in test_dfs.items():
    frame['target_word_lower'] = frame['target_word'].astype(str).str.lower()
    tokens = frame['sentence'].astype(str).str.lower().map(word_tokenize)
    frame['sentence_lower'] = tokens.map(' '.join)
    # Document layout: "<lang> <l> <target word> <s> <tokenized sentence>".
    frame['doc'] = frame['target_word_lower'] + ' <s> ' + frame['sentence_lower']
    frame['lang_doc'] = frame['lang'] + ' <l> ' + frame['doc']
    test_docs[lang] = frame['lang_doc'].map(str.split)

In [6]:
# The vocabulary is built from the training documents only, so test documents
# can contain tokens it does not know.
vocab = Dictionary(train_doc)

# Every sequence is padded/truncated to this many token ids.
MAX_LEN = 150


def vectorize_sent(sent):
    """Convert a list of tokens into a list of vocabulary ids.

    ``Dictionary.doc2idx`` marks tokens that are not in the vocabulary with -1
    by default. -1 is not a valid row of the embedding matrix, so those tokens
    are dropped here. Training documents are unaffected, because every
    training token is in ``vocab``.
    """
    return [idx for idx in vocab.doc2idx(sent) if idx >= 0]


def _pad_docs(docs):
    """Vectorize a Series of token lists and pad it to a (n_docs, MAX_LEN) array."""
    return sequence.pad_sequences(docs.apply(vectorize_sent).tolist(), maxlen=MAX_LEN)


# NOTE(review): pad_sequences pads with 0 by default, and 0 is also a regular
# vocabulary id; reserving a dedicated padding id would require enlarging the
# Embedding layer's input_dim as well.
X_train = _pad_docs(train_doc)
y_train = train_df['binary_label']

X_test_en = _pad_docs(test_docs['english'])
X_test_es = _pad_docs(test_docs['spanish'])
X_test_de = _pad_docs(test_docs['german'])


In [8]:
# Size of each learned token embedding.
embedding_vector_length = 32
# The original (misspelled) name is kept as an alias in case other cells use it.
embedding_vecor_length = embedding_vector_length

# Embedding -> 1D convolution -> max pooling -> LSTM -> sigmoid output for the
# binary label.
model = Sequential()
# input_length is taken from the padded training matrix so the model always
# matches the sequence length actually produced by the vectorisation step.
model.add(Embedding(len(vocab), embedding_vector_length, input_length=X_train.shape[1]))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line to the output).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 150, 32)           657984    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 150, 32)           3104      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 75, 32)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 714,389
Trainable params: 714,389
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
# Checkpoint file name pattern: "<epoch>-<accuracy>.hdf5".
filepath = "{epoch:02d}-{acc:.4f}.hdf5"

# Write a checkpoint only when the monitored 'acc' metric reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]
# The model is fitted in the next cell.

In [None]:
# Train on the full training set; the checkpoint callback saves the model each
# time the monitored metric improves.
model.fit(
    X_train,
    y_train,
    batch_size=3000,
    epochs=3000,
    callbacks=callbacks_list,
)

Epoch 1/3000

Epoch 00001: acc improved from -inf to 0.59118, saving model to 01-0.5912.hdf5
Epoch 2/3000

Epoch 00002: acc improved from 0.59118 to 0.60085, saving model to 02-0.6008.hdf5
Epoch 3/3000

Epoch 00003: acc improved from 0.60085 to 0.63387, saving model to 03-0.6339.hdf5
Epoch 4/3000

Epoch 00004: acc improved from 0.63387 to 0.64002, saving model to 04-0.6400.hdf5
Epoch 5/3000

Epoch 00005: acc improved from 0.64002 to 0.64233, saving model to 05-0.6423.hdf5
Epoch 6/3000

Epoch 00006: acc improved from 0.64233 to 0.65819, saving model to 06-0.6582.hdf5
Epoch 7/3000

Epoch 00007: acc improved from 0.65819 to 0.68318, saving model to 07-0.6832.hdf5
Epoch 8/3000

Epoch 00008: acc improved from 0.68318 to 0.73552, saving model to 08-0.7355.hdf5
Epoch 9/3000

Epoch 00009: acc improved from 0.73552 to 0.81281, saving model to 09-0.8128.hdf5
Epoch 10/3000

Epoch 00010: acc improved from 0.81281 to 0.86358, saving model to 10-0.8636.hdf5
Epoch 11/3000

Epoch 00011: acc improved f


Epoch 00046: acc did not improve
Epoch 47/3000

Epoch 00047: acc did not improve
Epoch 48/3000

Epoch 00048: acc improved from 0.97616 to 0.97677, saving model to 48-0.9768.hdf5
Epoch 49/3000

Epoch 00049: acc improved from 0.97677 to 0.97722, saving model to 49-0.9772.hdf5
Epoch 50/3000

Epoch 00050: acc did not improve
Epoch 51/3000

Epoch 00051: acc did not improve
Epoch 52/3000

Epoch 00052: acc did not improve
Epoch 53/3000

Epoch 00053: acc did not improve
Epoch 54/3000

Epoch 00054: acc did not improve
Epoch 55/3000

Epoch 00055: acc improved from 0.97722 to 0.97767, saving model to 55-0.9777.hdf5
Epoch 56/3000

Epoch 00056: acc improved from 0.97767 to 0.97896, saving model to 56-0.9790.hdf5
Epoch 57/3000

Epoch 00057: acc improved from 0.97896 to 0.98045, saving model to 57-0.9805.hdf5
Epoch 58/3000

Epoch 00058: acc did not improve
Epoch 59/3000

Epoch 00059: acc did not improve
Epoch 60/3000

Epoch 00060: acc improved from 0.98045 to 0.98051, saving model to 60-0.9805.hdf5



Epoch 00099: acc improved from 0.98532 to 0.98555, saving model to 99-0.9856.hdf5
Epoch 100/3000

Epoch 00100: acc did not improve
Epoch 101/3000

Epoch 00101: acc did not improve
Epoch 102/3000

Epoch 00102: acc did not improve
Epoch 103/3000

Epoch 00103: acc did not improve
Epoch 104/3000

Epoch 00104: acc did not improve
Epoch 105/3000

Epoch 00105: acc did not improve
Epoch 106/3000

Epoch 00106: acc improved from 0.98555 to 0.98604, saving model to 106-0.9860.hdf5
Epoch 107/3000

Epoch 00107: acc did not improve
Epoch 108/3000

Epoch 00108: acc did not improve
Epoch 109/3000

Epoch 00109: acc did not improve
Epoch 110/3000

Epoch 00110: acc did not improve
Epoch 111/3000

Epoch 00111: acc did not improve
Epoch 112/3000

Epoch 00112: acc did not improve
Epoch 113/3000

Epoch 00113: acc did not improve
Epoch 114/3000

Epoch 00114: acc improved from 0.98604 to 0.98610, saving model to 114-0.9861.hdf5
Epoch 115/3000

Epoch 00115: acc did not improve
Epoch 116/3000

Epoch 00116: acc 


Epoch 00154: acc did not improve
Epoch 155/3000

Epoch 00155: acc did not improve
Epoch 156/3000

Epoch 00156: acc did not improve
Epoch 157/3000

Epoch 00157: acc did not improve
Epoch 158/3000

Epoch 00158: acc did not improve
Epoch 159/3000

Epoch 00159: acc did not improve
Epoch 160/3000

Epoch 00160: acc improved from 0.98865 to 0.98867, saving model to 160-0.9887.hdf5
Epoch 161/3000

Epoch 00161: acc improved from 0.98867 to 0.98891, saving model to 161-0.9889.hdf5
Epoch 162/3000

Epoch 00162: acc did not improve
Epoch 163/3000

Epoch 00163: acc improved from 0.98891 to 0.98897, saving model to 163-0.9890.hdf5
Epoch 164/3000

Epoch 00164: acc improved from 0.98897 to 0.98916, saving model to 164-0.9892.hdf5
Epoch 165/3000

Epoch 00165: acc improved from 0.98916 to 0.98931, saving model to 165-0.9893.hdf5
Epoch 166/3000

Epoch 00166: acc did not improve
Epoch 167/3000

Epoch 00167: acc improved from 0.98931 to 0.98933, saving model to 167-0.9893.hdf5
Epoch 168/3000

Epoch 00168: 


Epoch 00208: acc did not improve
Epoch 209/3000

Epoch 00209: acc did not improve
Epoch 210/3000

Epoch 00210: acc did not improve
Epoch 211/3000

Epoch 00211: acc did not improve
Epoch 212/3000

Epoch 00212: acc did not improve
Epoch 213/3000

Epoch 00213: acc did not improve
Epoch 214/3000

Epoch 00214: acc did not improve
Epoch 215/3000

Epoch 00215: acc did not improve
Epoch 216/3000

Epoch 00216: acc did not improve
Epoch 217/3000

Epoch 00217: acc did not improve
Epoch 218/3000

Epoch 00218: acc did not improve
Epoch 219/3000

Epoch 00219: acc did not improve
Epoch 220/3000

Epoch 00220: acc did not improve
Epoch 221/3000

Epoch 00221: acc did not improve
Epoch 222/3000

Epoch 00222: acc did not improve
Epoch 223/3000

Epoch 00223: acc improved from 0.99048 to 0.99067, saving model to 223-0.9907.hdf5
Epoch 224/3000

Epoch 00224: acc did not improve
Epoch 225/3000

Epoch 00225: acc did not improve
Epoch 226/3000

Epoch 00226: acc did not improve
Epoch 227/3000

Epoch 00227: acc d

