In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import cohen_kappa_score

stopwords = set(stopwords.words('english'))

In [2]:
df = pd.read_excel('test.xlsx')
df.head(2)

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",66.666667
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",75.0


In [3]:
num_rows = df.shape[0]
essays = df['essay'].values

#Initialize dataframe columns
df['word_count'] = np.nan 
df['sentence_count'] = np.nan
df['avg_word_length'] = np.nan 
df['num_exclamation_marks'] = np.nan
df['num_question_marks'] = np.nan
df['num_stopwords'] = np.nan

df['noun_count'] = np.nan
df['verb_count'] = np.nan
df['foreign_count'] = np.nan
df['adj_count'] = np.nan
df['conj_count'] = np.nan
df['adv_count'] = np.nan

def get_pos_tags(essay):
    nouns = verbs = foreign = adj = adv = conj = 0
    tokens = nltk.word_tokenize(essay)
    for token in tokens:
        pos_tag = nltk.pos_tag(nltk.word_tokenize(token))
        for (_, tag) in (pos_tag):
            if tag[0] == "N":
                nouns += 1
            elif tag[0] == "V":
                verbs += 1
            elif tag[0:2] == "FW":
                foreign += 1
            elif tag[0] == "J":
                adj += 1
            elif tag[0] == "R":
                adv += 1
            elif tag[0:2] == "CC" or tag[0:2] == "IN":
                conj += 1
    
    return [nouns, verbs, foreign, adj, adv, conj]


for i in range(num_rows):
    
    # Turn essay into list of words
    text = essays[i].split(" ")
    
    # Set word count
    df.set_value(i,'word_count', len(text))
    
    # Sentence count
    df.set_value(i, 'sentence_count', len(nltk.tokenize.sent_tokenize(essays[i])))
    
    # Average word length
    word_len = sum(len(word) for word in text) / len(text)
    df.set_value(i, 'avg_word_length', word_len)
    
    # Number of exclamation marks
    df.set_value(i, "num_exclamation_marks", sum(word.count("!") for word in essays[i]))
    
    # Number of question marks
    df.set_value(i, "num_question_marks", sum(word.count("?") for word in essays[i]))
    
    # Number of stop words
    df.set_value(i, "num_stopwords", sum([1 for word in text if word.lower() in stopwords]))

    
    # POS tag counts
    pos_lst = get_pos_tags(essays[i])
    df.set_value(i,'noun_count', pos_lst[0])
    df.set_value(i,'verb_count', pos_lst[1])
    df.set_value(i,'foreign_count', pos_lst[2])
    df.set_value(i,'adj_count', pos_lst[3])
    df.set_value(i,'adv_count', pos_lst[4])
    df.set_value(i,'conj_count', pos_lst[5])



In [6]:
from textblob import TextBlob
def avg_sentence_sentiment(x):
    sentiment_essay = TextBlob(x).sentiment.polarity
    return sentiment_essay
df['sentiment_essay'] = df['essay'].apply(avg_sentence_sentiment)

In [58]:
import language_check
def grammar_check(x):
    tool = language_check.LanguageTool('en-US')
    matches = tool.check(x)
    return len(matches)
df['Grammar_check'] = df['essay'].apply(grammar_check)

In [59]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,word_count,sentence_count,avg_word_length,num_exclamation_marks,num_question_marks,num_stopwords,noun_count,verb_count,foreign_count,adj_count,conj_count,adv_count,sentiment_essay,grade,grades,Grammar_check
0,1,1,"Dear local newspaper, I think effects computer...",66.666667,338.0,16.0,4.550296,4.0,2.0,168.0,120.0,39.0,0.0,13.0,69.0,22.0,0.310471,,,11
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",75.0,419.0,20.0,4.463007,1.0,1.0,189.0,148.0,56.0,0.0,14.0,80.0,21.0,0.274,,,19
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",58.333333,279.0,14.0,4.526882,0.0,0.0,140.0,110.0,33.0,0.0,13.0,50.0,17.0,0.340393,,,9
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",83.333333,524.0,27.0,5.041985,2.0,1.0,222.0,263.0,57.0,0.0,29.0,84.0,30.0,0.266828,,,35
4,5,1,"Dear @LOCATION1, I know having computers has a...",66.666667,465.0,30.0,4.526882,0.0,0.0,236.0,150.0,60.0,0.0,18.0,63.0,41.0,0.199684,,,17


In [60]:
x = df.drop(['domain1_score', 'essay','essay_id','essay_set'], axis=1)
# y = df['domain1_score']

# x = df_normalized.drop(['domain1_score'],axis=1)
# # df['A']=df['A'].fillna(0.0).astype(int)
y = df['domain1_score'].fillna(0.0).astype(int)
x = np.array(x)
y = np.array(y)
where_are_NaNs = np.isnan(x)
x[where_are_NaNs] = 0
where_are_NaNs = np.isnan(y)
y[where_are_NaNs] = 0

In [61]:
from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_index, test_index in kfold.split(x, y):
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
logistic_reg = LogisticRegression()
logistic_reg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [62]:
predictions = logistic_reg.predict(X_test)
print('Logistic regression classifier accuracy:', logistic_reg.score(X_test, y_test))

Logistic regression classifier accuracy: 0.31805825242718444


In [64]:
from sklearn import linear_model
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_index, test_index in kfold.split(x, y):
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
lin_reg = linear_model.LinearRegression()
lin_reg.fit(X_train, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [65]:
predictions = lin_reg.predict(X_test)
print('Linear regression classifier accuracy:', lin_reg.score(X_test, y_test))

Linear regression classifier accuracy: 0.22916389130549164


In [66]:
from sklearn import svm
clf = svm.SVR(kernel="linear")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('SVM (linear) Regressor Accuracy:', clf.score(X_test, y_test))

SVM (linear) Regressor Accuracy: 0.21532152291217324


In [67]:
from sklearn import svm
clf = svm.SVR(kernel="rbf")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('SVM (rbf) Regressor Accuracy:', clf.score(X_test, y_test))



SVM (rbf) Regressor Accuracy: 0.11993845783691613


In [70]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=0,n_estimators=100)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [74]:
print ('Random Forest Regressor Accuracy:', model.score(X_test, y_test))

Random Forest Regressor Accuracy: 0.5202389704056603


## LSTM

In [107]:
import os
import pandas as pd
X = pd.read_excel('test.xlsx')
X=X.drop(X.columns[0:2],axis=1)
y = pd.DataFrame(X['domain1_score'])

In [101]:
X.head()

Unnamed: 0,essay,domain1_score
0,"Dear local newspaper, I think effects computer...",66.666667
1,"Dear @CAPS1 @CAPS2, I believe that using compu...",75.0
2,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",58.333333
3,"Dear Local Newspaper, @CAPS1 I have found that...",83.333333
4,"Dear @LOCATION1, I know having computers has a...",66.666667


In [108]:
y.head()

Unnamed: 0,domain1_score
0,66.666667
1,75.0
2,58.333333
3,83.333333
4,66.666667


In [109]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

def get_model():
    model = Sequential([
        # 2D tensor for first layer: 1 timestep and 300 features
        LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True),
        LSTM(64, recurrent_dropout=0.4),
        Dropout(0.5),
        Dense(1, activation='relu')
    ])

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

In [110]:
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec

stopwords = set(stopwords.words('english'))

def essay_to_list(essay):
    # Remove the tags
    essay = re.sub("[^a-zA-Z]", " ", essay)
    words = essay.lower().split()
    return [w for w in words if not w in stopwords]

def essay_to_sentences(essay):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(essay.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(essay_to_list(raw_sentence))
    return sentences

# Generate feature vector for the words
def get_feature_vector(words, model, num_features, vec_type="sum"):
    feature_vector = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    index2word_set = set(model.wv.index2word)
    
    max_vec =  np.zeros((num_features,),dtype="float32")
    min_vec =  np.ones((num_features,),dtype="float32")

    for word in words:
        if word in index2word_set:
            num_words += 1
            max_vec = np.maximum(model[word], feature_vector)
            min_vec = np.minimum(model[word], feature_vector)
            feature_vector = np.add(feature_vector, model[word]) 
    
    # return min vector + max vector
    if vec_type == "min+max":
        return np.add(min_vec, max_vec) 
    
    # average of vectors
    elif vec_type == "average":
        return np.divide(feature_vector, num_words)

    # default: return sum of word2vec vectors
    return feature_vector

# Generate word vectors from the mdoel
def generate_essay_vectors(essays, model, num_features, vec_type="sum"):
    essayfeature_vectors = np.zeros((len(essays),num_features),dtype="float32")
    for (i, essay) in enumerate(essays):
        essayfeature_vectors[i] = get_feature_vector(essay, model, num_features, vec_type)
    return essayfeature_vectors

In [111]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score

def train_model(X, y, dataset, vec_type="sum"):
    count = 1
    results = []
    
    for train_set, test_set in dataset:
        print("Fold #", count)
        X_test, X_train, y_test, y_train = X.iloc[test_set], X.iloc[train_set], y.iloc[test_set], y.iloc[train_set]
        
        train_essays = X_train['essay']
        test_essays = X_test['essay']
        
        sentences = []
        
        for essay in train_essays:
            sentences += essay_to_sentences(essay)
                
        # Initialize variables for word2vec model
        num_features = 300 
        min_word_count = 40
        num_workers = 4
        context = 10
        downsampling = 1e-7

        # Train the word2vec model
        model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)
        model.init_sims(replace=True)
        
        # Generate training vectors
        clean_train_essays = []
        for essay_vec in train_essays:
            clean_train_essays.append(essay_to_list(essay_vec))
        train_vectors = generate_essay_vectors(clean_train_essays, model, num_features, vec_type)
        
        # Generate test vectors
        clean_test_essays = []
        for essay_vec in test_essays:
            clean_test_essays.append(essay_to_list( essay_vec))
        test_vectors = generate_essay_vectors(clean_test_essays, model, num_features, vec_type)
        
        train_vectors = np.array(train_vectors)
        test_vectors = np.array(test_vectors)

        # Reshape the train and test vectors to 3 dimensions - 1 represents one timestamp 
        train_vectors = np.reshape(train_vectors, (train_vectors.shape[0], 1, train_vectors.shape[1]))
        test_vectors = np.reshape(test_vectors, (test_vectors.shape[0], 1, test_vectors.shape[1]))
        
        # Call the LSTM to get the score predictions 
        lstm_model = get_model()
        lstm_model.fit(train_vectors, y_train, batch_size=64, epochs=50)
        y_pred = lstm_model.predict(test_vectors)
        
        # Round the prediction to the nearest integer
        y_pred = np.around(y_pred)
        
        # Evaluate the model: quadratic kappa score of predictions against human grading
        result = cohen_kappa_score(y_test.values, y_pred, weights='quadratic')
        print("QWK: ", result)
        results.append(result)
        
        count += 1

    return results

In [112]:
y['domain1_score']=df['domain1_score'].fillna(0.0).astype(int)
dataset = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
dataset = dataset.split(X, y)
results_min_max = train_model(X, y, dataset, "min+max")
print("Average Quadratic Weighted Kappa after 5-fold cross validation for min + max word2vec ",np.around(np.array(results_min_max).mean(),decimals=4))



Fold # 1




Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 1, 300)            721200    
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 814,705
Trainable params: 814,705
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
QWK:  0.7723076156805782
Fold # 3
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 1, 300)            721200    
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                93440     
_____________________________________________

Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
QWK:  0.7606862851658096
Fold # 5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 1, 300)            721200    
_________________________________________________________________
lstm_10 (LSTM)               (None, 64)                93440     
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 814,705
Tr

In [114]:
dataset = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
dataset = dataset.split(X, y)
results_average = train_model(X, y, dataset, "average")
print("Average Quadratic Weighted Kappa after 5-fold cross validation for average word2vec ",np.around(np.array(results_average).mean(),decimals=4))



Fold # 1




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_11 (LSTM)               (None, 1, 300)            721200    
_________________________________________________________________
lstm_12 (LSTM)               (None, 64)                93440     
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 65        
Total params: 814,705
Trainable params: 814,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
E

Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
QWK:  0.4702928892754158
Fold # 4
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_17 (LSTM)               (None, 1, 300)            721200    
_________________________________________________________________
lstm_18 (LSTM)               (None, 64)                93440     
_________________________________________________________________
dropout_9 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_

Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
QWK:  0.48561460974355874
Average Quadratic Weighted Kappa after 5-fold cross validation for average word2vec  0.5016
