In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords 

In [3]:
df = pd.read_csv("nlp dataset/training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1')

In [4]:
df = df.dropna(axis=1)

In [5]:
df = df.drop(columns=['rater1_domain1', 'rater2_domain1'])

In [6]:
y = df['domain1_score']

In [7]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


In [8]:
def word(sentence):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}')
    stop_word = set(stopwords.words("english"))
    words = tokenizer.tokenize(sentence)
    words=[i.lower() for i in words]
    words = [w for w in words if not w in stop_word]
    return words
def sentence(essay):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    s = tokenizer.tokenize(essay)
    fs = []
    for i in s:
        fs.append(word(i))
    return fs
def word_vectors(words,model,dim):
    fv = np.zeros((dim,),dtype='float32')
    i2w_set = set(model.wv.index2word)
    num_words = 0
    for word in words:
        if word in i2w_set:
            num_words+=1
            fv = np.add(fv,model[word])
    fv = np.divide(fv,num_words)
    return fv
def avg_word_vectors(essay,model,dim):
    efv = np.zeros((len(essay),dim), dtype='float32')
    c=0
    for i in essay:
        efv[c] = word_vectors(i,model,dim)
        c+=1
    return efv

In [10]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

def get_model():
    """Define the model."""
    model = Sequential()
    model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

In [17]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score
from gensim.models import Word2Vec

sent_list = []
cv = KFold(5, shuffle=True)
count=1
results=[]
for train,test in cv.split(df):
    x_train,x_test,y_train,y_test = df.iloc[train], df.iloc[test], y.iloc[train], y.iloc[test]
    train_set = x_train['essay']
    test_set = x_test['essay']
    for i in train_set:
        sent_list+=sentence(i)
    
    model = Word2Vec(sent_list, workers=4, size=300, min_count = 40, window = 10, sample = 1e-3)
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)
    essay_train_words = []
    for i in train_set:
        essay_train_words.append(word(i))
    train_vec = avg_word_vectors(essay_train_words,model,300)
    essay_test_words = []
    for i in test_set:
        essay_test_words.append(word(i))
    test_vec = avg_word_vectors(essay_test_words,model,300)
    train_vec = np.array(train_vec)
    test_vec = np.array(test_vec)
    train_vec = np.reshape(train_vec, (train_vec.shape[0], 1, train_vec.shape[1]))
    test_vec = np.reshape(test_vec, (test_vec.shape[0], 1, test_vec.shape[1]))
    
    lstm_model = get_model()
    lstm_model.fit(train_vec, y_train, batch_size=64, epochs=50)
    y_pred = lstm_model.predict(test_vec)
    if count == 5:
         lstm_model.save('./model_weights/final_lstm.h5')
    y_pred = np.around(y_pred)
    result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')
    print("Kappa Score: {}".format(result))
    results.append(result)

    count += 1



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 1, 300)            721200    
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 814,705
Trainable params: 814,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
E

Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Kappa Score: 0.9723502412428651
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 1, 300)            721200    
_________________________________________________________________
lstm_10 (LSTM)               (None, 64)                93440     
_________________________________________________________________
dropout_5 (Dropout)      

Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Kappa Score: 0.9736039501870017
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_11 (LSTM)               (None, 1, 300)            721200    
_________________________________________________________________
lstm_12 (LSTM)               (None, 64)                93440     
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 

Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Kappa Score: 0.9682560329141336
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_13 (LSTM)               (None, 1, 300)            721200    
_________________________________________________________________
lstm_14 (LSTM)               (None, 64)                93440     
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 814,705
Trainable params: 814,705
Non-trainable 

Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


OSError: Unable to create file (unable to open file: name = './model_weights/final_lstm.h5', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 302)