In [1]:
import os, glob


import random
import sys
import pickle

import pandas as pd

from gensim.corpora.dictionary import Dictionary

# Keras CNN + LSTM for sequence classification (applied below to the CWI 2018 data, not IMDB)
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import GRU, LSTM, Bidirectional
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint
from keras.metrics import mae, categorical_accuracy
from keras.models import load_model

from nltk import word_tokenize


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load every "*_Train.tsv" file found one level below the training folder
# and stack them into a single frame.
traindir = 'CWI 2018 Training Set/'

# The TSV files are read without a header row, so columns arrive numbered
# 0..10; this maps each position to a readable name.
train_columns = dict(enumerate([
    'HIT_id', 'sentence', 'start', 'end', 'target_word',
    'native_all', 'non_native_all',
    'native_score', 'non_native_score',
    'binary_label', 'prob_label',
]))

train_dfs = []
for lang in os.listdir(traindir):
    # Each entry of the training folder is treated as a language folder.
    # recursive=True only matters for patterns containing '**', which this
    # pattern does not have.
    train_pattern = '{}{}/*_Train.tsv'.format(traindir, lang)
    for tsv_path in glob.iglob(train_pattern, recursive=True):
        frame = pd.read_csv(tsv_path, header=None, sep='\t').rename(columns=train_columns)
        # Remember where each row came from.
        frame['filename'] = tsv_path
        frame['lang'] = lang
        train_dfs.append(frame)

# Plain concat: the per-file row labels are kept, so index values repeat.
train_df = pd.concat(train_dfs)

In [3]:
# Build the model input text for every training row:
#   "<lang> <l> <target word> <s> <tokenized sentence>", all lower-cased.
# NOTE(review): astype(str) turns missing cells into the literal 'nan';
# confirm that pandas' default NA parsing is not swallowing real words.
train_target = train_df['target_word'].astype(str).str.lower()
train_sentence = train_df['sentence'].astype(str).str.lower().apply(
    lambda text: ' '.join(word_tokenize(text)))

train_df['target_word_lower'] = train_target
train_df['sentence_lower'] = train_sentence

# ' <s> ' separates the target word from its sentence, ' <l> ' the language tag.
train_df['doc'] = train_target + ' <s> ' + train_sentence
train_df['lang_doc'] = train_df['lang'] + ' <l> ' + train_df['doc']

# Whitespace-split token lists, one per row.
train_doc = train_df['lang_doc'].str.split()

In [4]:
# Load every "*_Dev.tsv" file found one level below the training folder
# (the dev files sit next to the train files) and stack them into one frame.
traindir = 'CWI 2018 Training Set/'

# The TSV files are read without a header row, so columns arrive numbered
# 0..10; this maps each position to a readable name.
dev_columns = dict(enumerate([
    'HIT_id', 'sentence', 'start', 'end', 'target_word',
    'native_all', 'non_native_all',
    'native_score', 'non_native_score',
    'binary_label', 'prob_label',
]))

dev_dfs = []
for lang in os.listdir(traindir):
    # Each entry of the training folder is treated as a language folder.
    # recursive=True only matters for patterns containing '**', which this
    # pattern does not have.
    dev_pattern = '{}{}/*_Dev.tsv'.format(traindir, lang)
    for tsv_path in glob.iglob(dev_pattern, recursive=True):
        frame = pd.read_csv(tsv_path, header=None, sep='\t').rename(columns=dev_columns)
        # Remember where each row came from.
        frame['filename'] = tsv_path
        frame['lang'] = lang
        dev_dfs.append(frame)

# Plain concat: the per-file row labels are kept, so index values repeat.
dev_df = pd.concat(dev_dfs)

In [5]:
# Build the model input text for every dev row, mirroring the training rows:
#   "<lang> <l> <target word> <s> <tokenized sentence>", all lower-cased.
# NOTE(review): astype(str) turns missing cells into the literal 'nan';
# confirm that pandas' default NA parsing is not swallowing real words.
dev_target = dev_df['target_word'].astype(str).str.lower()
dev_sentence = dev_df['sentence'].astype(str).str.lower().apply(
    lambda text: ' '.join(word_tokenize(text)))

dev_df['target_word_lower'] = dev_target
dev_df['sentence_lower'] = dev_sentence

# ' <s> ' separates the target word from its sentence, ' <l> ' the language tag.
dev_df['doc'] = dev_target + ' <s> ' + dev_sentence
dev_df['lang_doc'] = dev_df['lang'] + ' <l> ' + dev_df['doc']

# Whitespace-split token lists, one per row.
dev_doc = dev_df['lang_doc'].str.split()

In [6]:
# Load every TSV file found one level below the test folder into a dict of
# frames keyed by '<language folder>_<file-name prefix>'.
testdir = 'CWI 2018 Test Set/'

# The TSV files are read without a header row, so columns arrive numbered;
# this maps each position to a readable name.
test_columns = {0: 'HIT_id', 1: 'sentence', 2: 'start', 3: 'end',
                4: 'target_word', 5: 'native_all', 6: 'non_native_all',
                7: 'native_score', 8: 'non_native_score',
                9: 'binary_label', 10: 'prob_label'}

test_dfs = {}
for lang in os.listdir(testdir):
    # Each entry of the test folder is treated as a language folder.
    # (recursive=True was dropped: it has no effect without '**' in the pattern.)
    for filename in glob.iglob(testdir + lang + '/*.tsv'):
        df = pd.read_csv(filename, header=None, sep='\t').rename(columns=test_columns)
        # Derive the test-set name from the file name only: the text before
        # the first underscore of the extension-less base name. Splitting the
        # whole path on '_' (as before) raised ValueError unless the path
        # contained exactly one underscore, and splitting on '/' was not
        # portable across path separators.
        stem = os.path.splitext(os.path.basename(filename))[0]
        testset = lang + '_' + stem.split('_', 1)[0]
        df['filename'] = filename
        df['lang'] = lang
        test_dfs[testset] = df

In [7]:
# Show which test sets were loaded; each key is '<language folder>_<file-name prefix>'.
test_dfs.keys()

dict_keys(['german_German', 'spanish_Spanish', 'english_WikiNews', 'french_French', 'english_Wikipedia', 'english_News'])

In [8]:
# Apply the same text preparation as for train/dev to every test frame and
# collect the resulting token lists per test set.
test_docs = {}
for testset, frame in test_dfs.items():
    # `frame` is the very object stored in test_dfs, so the new columns are
    # added to the stored frames in place.
    target = frame['target_word'].astype(str).str.lower()
    sentence = frame['sentence'].astype(str).str.lower().apply(
        lambda text: ' '.join(word_tokenize(text)))

    frame['target_word_lower'] = target
    frame['sentence_lower'] = sentence
    # ' <s> ' separates the target word from its sentence, ' <l> ' the language tag.
    frame['doc'] = target + ' <s> ' + sentence
    frame['lang_doc'] = frame['lang'] + ' <l> ' + frame['doc']

    # Whitespace-split token lists, one per row.
    test_docs[testset] = frame['lang_doc'].str.split()

In [9]:
# Vocabulary built from the training token lists only.
vocab = Dictionary(train_doc)


def vectorize_sent(sent):
    """Return the vocabulary id of each token in ``sent``, in order.

    ``sent`` is a list of tokens; the result is a list of ints of the same
    length, produced by ``vocab.doc2idx``.

    NOTE(review): gensim documents ``unknown_word_index=-1`` as the default
    for ``doc2idx``, so tokens missing from the training vocabulary would map
    to -1 -- confirm the downstream Embedding layer copes with that.
    """
    token_ids = vocab.doc2idx(sent)
    return token_ids


In [10]:
# Fixed sequence length fed to the network.
max_length = 150


def _to_padded(docs):
    """Turn a Series of token lists into an id matrix of width ``max_length``."""
    return sequence.pad_sequences(docs.apply(vectorize_sent), maxlen=max_length)


# Training and dev inputs with their binary labels.
X_train = _to_padded(train_doc)
y_train = train_df['binary_label']

X_dev = _to_padded(dev_doc)
y_dev = dev_df['binary_label']

# English test sets.
X_test_en_news = _to_padded(test_docs['english_News'])
X_test_en_wikinews = _to_padded(test_docs['english_WikiNews'])
X_test_en_wiki = _to_padded(test_docs['english_Wikipedia'])

# Spanish and German test sets.
# NOTE(review): the keys listing above also shows 'french_French', but no
# matrix is built for it here -- confirm that is intentional.
X_test_es = _to_padded(test_docs['spanish_Spanish'])
X_test_de = _to_padded(test_docs['german_German'])


In [11]:
# Build and compile the classifier: Embedding -> Conv1D -> MaxPooling1D -> LSTM -> sigmoid.
# NOTE(review): only NumPy's global RNG is seeded here; the backend's own
# random state is not touched by this cell -- confirm whether runs are
# expected to be repeatable.
numpy.random.seed(5)
embedding_vecor_length = 100
model = Sequential()
# One embedding row per vocabulary entry; the input width reuses max_length
# (set in the padding cell) instead of repeating the literal 150.
model.add(Embedding(len(vocab.keys()), embedding_vecor_length, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
# The layer below is disabled. The captured summary output that follows this
# cell still lists a bidirectional layer, so that output comes from a run in
# which this line was active.
#model.add(Bidirectional(LSTM(100, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit for the binary label.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 100)          1899900   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 150, 32)           9632      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 75, 32)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 75, 200)           106400    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               120400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 2,136,433
Trainable params: 2,136,433
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Checkpoint file name pattern: epoch number and the monitored accuracy.
filepath = "models06/{epoch:02d}-{acc:.4f}.hdf5"

# Save the model whenever the monitored 'acc' value reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [13]:
# Train on the full training matrix; no validation data is passed, so the
# checkpoint callback only sees training metrics.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=3000,
    epochs=20,
    callbacks=callbacks_list,
)

Epoch 1/20

Epoch 00001: acc improved from -inf to 0.59038, saving model to models06/01-0.5904.hdf5
Epoch 2/20

Epoch 00002: acc improved from 0.59038 to 0.59174, saving model to models06/02-0.5917.hdf5
Epoch 3/20

Epoch 00003: acc improved from 0.59174 to 0.60587, saving model to models06/03-0.6059.hdf5
Epoch 4/20

Epoch 00004: acc improved from 0.60587 to 0.63903, saving model to models06/04-0.6390.hdf5
Epoch 5/20

Epoch 00005: acc improved from 0.63903 to 0.65890, saving model to models06/05-0.6589.hdf5
Epoch 6/20

Epoch 00006: acc improved from 0.65890 to 0.72568, saving model to models06/06-0.7257.hdf5
Epoch 7/20

Epoch 00007: acc improved from 0.72568 to 0.82481, saving model to models06/07-0.8248.hdf5
Epoch 8/20

Epoch 00008: acc improved from 0.82481 to 0.87684, saving model to models06/08-0.8768.hdf5
Epoch 9/20

Epoch 00009: acc improved from 0.87684 to 0.89629, saving model to models06/09-0.8963.hdf5
Epoch 10/20

Epoch 00010: acc improved from 0.89629 to 0.90820, saving model

<keras.callbacks.History at 0x7f8cdc523fd0>

In [14]:
# Score every saved checkpoint on the dev set, highest file name first.
# NOTE(review): this reads 'models05/' while the checkpoint callback above
# writes to 'models06/' -- confirm which run is meant to be evaluated.
scores = []
for modelfile in sorted(os.listdir('models05/'), reverse=True):
    # Rebinds the global `model` to the checkpoint being scored.
    model = load_model('models05/' + modelfile)
    # evaluate() is unpacked as a two-item result; the second item is kept as
    # the score (presumably accuracy, as in the compile call above -- confirm
    # for the saved models).
    dev_loss, score = model.evaluate(X_dev, y_dev, verbose=1)
    scores.append((score, modelfile))
    print(modelfile, score)

20-0.9296.hdf5 0.7361183639813549
19-0.9263.hdf5 0.7369886858345012
18-0.9262.hdf5 0.7352480419622077
17-0.9232.hdf5 0.7397737162957718
16-0.9217.hdf5 0.7382071366613071
15-0.9190.hdf5 0.7361183638153541
14-0.9163.hdf5 0.7389033942766248
13-0.9131.hdf5 0.7402959095072601
12-0.9111.hdf5 0.7347258487714696
11-0.9038.hdf5 0.7389033942766248
10-0.8976.hdf5 0.7378590080403992
09-0.8760.hdf5 0.7342036555599813
08-0.8251.hdf5 0.7258485641761694
07-0.7386.hdf5 0.6901653612251382
06-0.6734.hdf5 0.6436901654026839
05-0.6442.hdf5 0.5987815492146943
04-0.6271.hdf5 0.5906005222347117
03-0.5939.hdf5 0.5911227154461999
02-0.5917.hdf5 0.5865970409466351
01-0.5907.hdf5 0.5865970409466351


In [15]:
# Pick the checkpoint used for all test-set predictions below.
# NOTE(review): in the dev-set listing above this file scored 0.7389, while
# 13-0.9131.hdf5 scored highest (0.7403) -- confirm the choice is deliberate.
modelfile = '11-0.9038.hdf5'
model = load_model('models05/' + modelfile)

In [32]:
def _write_binary_predictions(path, features, threshold=0.5):
    """Write one 0/1 label per line to ``path`` for the rows of ``features``.

    A row is labelled 1 when the first value of the model's output for that
    row is greater than ``threshold``, otherwise 0. The predictions are
    computed before the file is opened, so a failing ``model.predict`` no
    longer leaves an empty (truncated) output file behind.
    """
    predictions = model.predict(features)
    with open(path, 'w') as fout:
        for pred in predictions:
            print(int(pred[0] > threshold), end='\n', file=fout)


# One output file per test set; previously five copy-pasted blocks.
for binary_path, binary_features in [
    ('puddlepod.en_news.05.tsv', X_test_en_news),
    ('puddlepod.en_wikinews.05.tsv', X_test_en_wikinews),
    ('puddlepod.en_wiki.05.tsv', X_test_en_wiki),
    ('puddlepod.es.05.tsv', X_test_es),
    ('puddlepod.de.05.tsv', X_test_de),
]:
    _write_binary_predictions(binary_path, binary_features)

In [16]:
def _write_prob_predictions(path, features):
    """Write one model score per line to ``path``, formatted with two decimals.

    The score is the first value of the model's output for each row of
    ``features``. The predictions are computed before the file is opened, so
    a failing ``model.predict`` no longer leaves an empty (truncated) output
    file behind.
    """
    predictions = model.predict(features)
    with open(path, 'w') as fout:
        for pred in predictions:
            print("{0:.2f}".format(pred[0]), end='\n', file=fout)


# One output file per test set; previously five copy-pasted blocks.
# NOTE(review): these names carry the '.04' tag while the 0/1 prediction
# files are written with '.05' -- confirm the tag is intended.
for prob_path, prob_features in [
    ('prob_puddlepod.en_news.04.tsv', X_test_en_news),
    ('prob_puddlepod.en_wikinews.04.tsv', X_test_en_wikinews),
    ('prob_puddlepod.en_wiki.04.tsv', X_test_en_wiki),
    ('prob_puddlepod.es.04.tsv', X_test_es),
    ('prob_puddlepod.de.04.tsv', X_test_de),
]:
    _write_prob_predictions(prob_path, prob_features)


