In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Directory the training TSV (training_set_rel3.tsv) is read from.
DATASET_DIR = "../input/automated-essay-scoring-dataset/"
# Presumably where GloVe 6B embeddings would live -- not referenced by the visible cells.
GLOVE_DIR = './glove.6B/'
# NOTE(review): looks like an output directory, but the visible cells save to
# literal paths ("model1.bin", './model_gru.h5') rather than using it -- confirm.
SAVE_DIR = './'

In [None]:
# Read the raw training table (tab-separated, ISO-8859-1 encoded).
training_path = os.path.join(DATASET_DIR, 'training_set_rel3.tsv')
X = pd.read_csv(training_path, sep='\t', encoding='ISO-8859-1')

# Grab the target column before any columns are removed from X.
y = X['domain1_score']

# Drop every column that contains a missing value, then the
# rater1_domain1 / rater2_domain1 columns.
X = X.dropna(axis=1).drop(columns=['rater1_domain1', 'rater2_domain1'])

X.head()

In [None]:
import os
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
# NOTE(review): presumably the lowest / highest valid domain1 score per essay set,
# indexed by essay-set number with index 0 as an unused -1 placeholder -- confirm
# against the dataset description. Neither list is referenced by the visible cells.
minimum_scores = [-1, 2, 1, 0, 0, 0, 0, 0, 0]
maximum_scores = [-1, 12, 6, 3, 3, 4, 4, 30, 60]


def toword(es, remove_stopwords):
    """Turn raw text into a list of lowercase alphabetic tokens.

    Args:
        es: The text to tokenize.
        remove_stopwords: When truthy, tokens found in NLTK's English
            stop-word list are dropped; otherwise every token is kept.

    Returns:
        A list of lowercase word strings (empty if ``es`` has no letters).
    """
    # Every character that is not an ASCII letter becomes a space, so digits
    # and punctuation act as token separators (e.g. "it's" -> "it", "s").
    letters_only = re.sub("[^a-zA-Z]", " ", es)
    words = letters_only.lower().split()
    if remove_stopwords:
        stop = set(stopwords.words("english"))
        words = [w for w in words if w not in stop]
    # Returning `words` on both paths avoids the UnboundLocalError raised
    # when the result was only assigned inside the stop-word branch.
    return words


def tosentence(es, remove_stopwords):
    """Split text into sentences and tokenize each sentence with ``toword``.

    Args:
        es: The text to split; surrounding whitespace is stripped first.
        remove_stopwords: Passed through unchanged to ``toword``.

    Returns:
        A list with one token list per non-empty sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = punkt.tokenize(es.strip())
    return [
        toword(sentence, remove_stopwords)
        for sentence in sentences
        if len(sentence) > 0
    ]


def make_feature(w, model, numberoffeaturess):
    """Return the mean word vector of the in-vocabulary tokens in ``w``.

    Args:
        w: Iterable of word strings.
        model: Object exposing trained word vectors as ``model.wv``
            (e.g. a gensim ``Word2Vec`` model).
        numberoffeaturess: Dimensionality of the word vectors.

    Returns:
        A float32 array of shape ``(numberoffeaturess,)``: the average of the
        vectors of all tokens found in the vocabulary, or all zeros when no
        token is in the vocabulary.
    """
    wv = model.wv
    # Newer gensim releases expose the vocabulary as the `key_to_index`
    # mapping; older ones only provide the `index2word` list.
    vocab = getattr(wv, "key_to_index", None)
    if vocab is None:
        vocab = set(wv.index2word)

    f_vector = np.zeros((numberoffeaturess,), dtype="float32")
    numberofwords = 0
    for word in w:
        if word in vocab:
            numberofwords += 1
            # Index through `wv`: indexing the model object itself is deprecated.
            f_vector += wv[word]
    # The count was previously collected but never applied, so the result was
    # a sum that scaled with essay length. Guard against division by zero.
    if numberofwords:
        f_vector /= numberofwords
    return f_vector

def AverageFeature(essays, model, num_features):
    """Stack one ``make_feature`` vector per essay into a 2-D matrix.

    Args:
        essays: Sized sequence of token lists, one per essay.
        model: Word-vector model forwarded to ``make_feature``.
        num_features: Dimensionality of each feature vector.

    Returns:
        A float32 array of shape ``(len(essays), num_features)`` whose i-th
        row is the feature vector of the i-th essay.
    """
    essayFeatureVecs = np.zeros((len(essays), num_features), dtype="float32")
    for row, tokens in enumerate(essays):
        essayFeatureVecs[row] = make_feature(tokens, model, num_features)
    return essayFeatureVecs


In [None]:
from tensorflow.keras.models import Sequential, load_model, model_from_config
from tensorflow.keras.layers import Embedding, LSTM, GRU,Dense, Dropout, Lambda, Flatten
#from tensorflow.keras.models import Sequential, load_model, model_from_config
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score

def get_model():
    """Build, compile and summarize the two-layer GRU regression network.

    The network takes inputs of shape ``[1, 200]`` per sample, stacks a
    200-unit GRU (returning sequences) and a 64-unit GRU, applies dropout,
    and ends in a single ReLU output unit. It is compiled with mean squared
    error loss, the RMSprop optimizer and MAE as a metric.

    Returns:
        The compiled Keras ``Sequential`` model (its summary is printed).
    """
    layers = [
        GRU(200, dropout=0.6, recurrent_dropout=0.6, input_shape=[1, 200], return_sequences=True),
        GRU(64, recurrent_dropout=0.6),
        Dropout(0.7),
        Dense(1, activation='relu'),
    ]
    model = Sequential(layers)

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model



In [None]:
# 5-fold cross-validation of the Word2Vec + GRU pipeline.
# random_state pins fold membership so the splits are the same on every run;
# Word2Vec (multi-worker) and Keras training remain stochastic.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
results = []
predictionlist = []

# Word2Vec hyperparameters -- identical for every fold, so set once up front.
numberoffeaturess = 200   # word-vector dimensionality (matches get_model's input width)
minimumword = 40          # min_count passed to Word2Vec
numberofworker = 4        # workers passed to Word2Vec
contet = 10               # window passed to Word2Vec
downsampling = 1e-3       # sample passed to Word2Vec

count = 1
for traincv, testcv in cv.split(X):

    print("\n--------Fold {}--------\n".format(count))
    X_train, X_test = X.iloc[traincv], X.iloc[testcv]
    y_train, y_test = y.iloc[traincv], y.iloc[testcv]

    train = X_train['essay']
    test = X_test['essay']

    # Train the word vectors on this fold's training essays only.
    s = []
    for e in train:
        s += tosentence(e, remove_stopwords=True)
    model = Word2Vec(s, workers=numberofworker, size=numberoffeaturess,
                     min_count=minimumword, window=contet, sample=downsampling)
    model.init_sims(replace=True)
    # Overwritten on every fold; the file left on disk is from the last fold.
    model.wv.save_word2vec_format("model1.bin", binary=True)

    c_train = [toword(es, remove_stopwords=True) for es in train]
    c_test = [toword(es, remove_stopwords=True) for es in test]

    # One feature vector per essay, reshaped to (n_essays, 1, n_features) so
    # each essay is a single-time-step sequence for the GRU.
    trainvector = AverageFeature(c_train, model, numberoffeaturess)
    testvector = AverageFeature(c_test, model, numberoffeaturess)
    trainvector = np.reshape(trainvector, (trainvector.shape[0], 1, trainvector.shape[1]))
    testvector = np.reshape(testvector, (testvector.shape[0], 1, testvector.shape[1]))

    model1 = get_model()
    model1.fit(trainvector, y_train, batch_size=64, epochs=15)
    yprediction = model1.predict(testvector)
    # Persist the network from the final fold; ask the splitter for the fold
    # count instead of repeating the literal 5.
    if count == cv.get_n_splits():
        model1.save('./model_gru.h5')
    # Round to whole scores so predictions can be compared as discrete ratings.
    yprediction = np.around(yprediction)
    result = cohen_kappa_score(y_test.values, yprediction, weights='quadratic')
    print("Kappa Score: {}".format(result))
    results.append(result)

    count += 1



In [None]:
# Mean of the per-fold kappa scores, rounded to two decimals.
average_kappa = np.around(np.mean(results), decimals=2)
print("Average score ", average_kappa)

In [None]:
# Flatten the 3-D training tensor left over from the last fold into a
# 2-D (samples, features) matrix.
n1, n2, n3 = trainvector.shape
x_train = np.reshape(trainvector, (n1, n2 * n3))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 200-tree, entropy-criterion random forest on the flattened training
# features; `fit` returns the estimator itself, so m1 is the trained model.
m1 = RandomForestClassifier(n_estimators=200, criterion='entropy').fit(x_train, y_train)

In [None]:
# Flatten the 3-D test tensor left over from the last fold into a
# 2-D (samples, features) matrix.
n4, n5, n6 = testvector.shape
x_test = np.reshape(testvector, (n4, n5 * n6))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
from sklearn import metrics

In [None]:
# Random-forest class predictions for the flattened test features.
y_pred1 = m1.predict(x_test )

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Compare the random-forest predictions against the held-out scores.
rf_confusion = confusion_matrix(y_test, y_pred1)
rf_report = classification_report(y_test, y_pred1)
print(rf_confusion)
print(rf_report)

In [None]:
# Score of the random forest on the test features, printed to two decimals.
rf_accuracy = m1.score(x_test, y_test)
print('randomforestaccurracy {:.2f}'.format(rf_accuracy))

In [None]:
from sklearn.svm import SVR

# Fit a support-vector regressor with default settings on the flattened
# training features; `fit` returns the estimator itself.
m2 = SVR().fit(x_train, y_train)

In [None]:
# SVR predictions for the flattened test features.
y_pred2 = m2.predict(x_test )

In [None]:
# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy, so label the printed value accordingly.
svr_r2 = m2.score(x_test, y_test)
print(' svr R^2 {:.2f}'.format(svr_r2))