# Essay Grading

We have a list of essays and the score. Our goal is to use machine learning to predict future grades. 

### Methods used
- LSTM
- Linear Regression
- KNN

In [5]:
!pip install nltk 
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.2-cp311-cp311-macosx_10_9_x86_64.whl.metadata (8.3 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.4-py3-none-any.whl.metadata (23 kB)
Downloading gensim-4.3.2-cp311-cp311-macosx_10_9_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading smart_open-7.0.4-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m497.5 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hInstalling collected packages: smart-open, gensim
Successfully installed gensim-4.3.2 smart-open-7.0.4


In [16]:
#for pre-processing
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import pandas as pd
import numpy as np

nltk.download('stopwords')
nltk.download('punkt')
#for model training

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
import keras.backend as K



from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cedrickperron/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cedrickperron/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
test_data = pd.read_csv("./asap-aes/test_set.tsv",sep='\t', encoding='ISO-8859-1')
training_data = pd.read_csv("./asap-aes/training_set_rel3.tsv",sep='\t', encoding='ISO-8859-1',
                            usecols = ['essay_id', 'essay_set', 'essay','domain1_score']).dropna(axis=1)
valid_data = pd.read_csv("./asap-aes/valid_set.tsv",sep='\t', encoding='ISO-8859-1')

In [7]:
test_data.dropna(axis=1,inplace=True)
valid_data.dropna(axis=1,inplace=True)

In [8]:
training_data
y = training_data['domain1_score']
X = training_data.copy()
X,y;

In [18]:
def essay_to_wordlist(essay_v, remove_stopwords):
    # Remove the tagged labels and word tokenize the sentence.
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return (words)

def essay_to_sentences(essay_v, remove_stopwords):
    """Sentence tokenize the essay and call essay_to_wordlist() for word tokenization."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(essay_v.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(essay_to_wordlist(raw_sentence, remove_stopwords))
    return sentences


# Define functions to create feature vectors
def makeFeatureVec(words, model, num_features):
    """Function to average all of the word vectors in a given paragraph"""
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    index_to_key_set = set(model.wv.index_to_key)  # Updated line
    for word in words:
        if word in index_to_key_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model.wv[word])
    featureVec = np.divide(featureVec, nwords)
    return featureVec


def getAvgFeatureVecs(essays, model, num_features):
    """Main function to generate the word vectors for word2vec model."""
    counter = 0
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for essay in essays:
        essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)
        counter = counter + 1
    return essayFeatureVecs

def get_word_vectors(essays, model):
    """
    Get word vectors for essays using a Word2Vec model.
    
    Args:
    - essays: A list of essays.
    - model: A Word2Vec model.
    
    Returns:
    - data_vecs: A 3D array of word vectors for each essay.
    - clean_essays: A list of cleaned essays.
    """
    data_vecs = []
    clean_essays = []
    
    for essay in essays:
        clean_essay = essay_to_wordlist(essay, remove_stopwords=True)
        essay_vecs = []
        for word in clean_essay:
            if word in model.wv:
                essay_vecs.append(model.wv[word])
        data_vecs.append(essay_vecs)
        clean_essays.append(clean_essay)
    
    # Convert data_vecs to a 3D array
    data_vecs = np.array([np.array(doc) for doc in data_vecs])
    
    return data_vecs, clean_essays


In [19]:
def get_model():
    """Define the model."""
    model = Sequential()
    model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

In [20]:
def get_linear_regressor_model(input_dim):
    model = Sequential()
    model.add(Dense(1, input_dim=input_dim, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Usage example
input_dim = 300  # Assuming input dimensionality is 300 for Word2Vec features
linear_regressor_model = get_linear_regressor_model(input_dim)


In [23]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score



cv = KFold(n_splits = 5, shuffle = True)
results = []
y_pred_list = []

count = 1
for traincv, testcv in cv.split(X):
    print("\n--------Fold {}--------\n".format(count))
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]
    
    train_essays = X_train['essay']
    test_essays = X_test['essay']
    
    sentences = []
    
    for essay in train_essays:
            # Obtaining all sentences from the training essays.
            sentences += essay_to_sentences(essay, remove_stopwords = True)
            
    # Initializing variables for word2vec model.
    num_features = 300 
    min_word_count = 40
    num_workers = 4
    context = 10
    downsampling = 1e-3

    print("Training Word2Vec Model...")
    model = Word2Vec(sentences, workers=num_workers, vector_size=num_features, min_count = min_word_count, window = context, sample = downsampling)

    model.init_sims(replace=True)
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

    clean_train_essays = []
    
    # Generate training and testing data word vectors.
    for essay_v in train_essays:
        clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)
    
    clean_test_essays = []
    for essay_v in test_essays:
        clean_test_essays.append(essay_to_wordlist( essay_v, remove_stopwords=True ))
    testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )
    
    trainDataVecs = np.array(trainDataVecs)
    testDataVecs = np.array(testDataVecs)
    # Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep)
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))
    
    lstm_model = get_model()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=50)
    #lstm_model.load_weights('./model_weights/final_lstm.h5')
    y_pred = lstm_model.predict(testDataVecs)
    
    # Save any one of the 5 models.
    if count == 5:
         lstm_model.save('./final_lstm.h5')
    
    # Round y_pred to the nearest integer.
    # Round y_pred to the nearest integer.
    y_pred = np.around(y_pred)
    
    # Evaluate the model on the evaluation metric. "Accuracy"
    acc = accuracy_score(y_test.values, y_pred)
    print("Accuracy Score: {}".format(acc))
    results.append(acc)

    count += 1

# Print average accuracy
print("\nAverage Accuracy: {}".format(np.mean(results)))


--------Fold 1--------

Training Word2Vec Model...


  model.init_sims(replace=True)


Epoch 1/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - loss: 89.5817 - mae: 5.2756
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - loss: 43.7723 - mae: 3.6945
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - loss: 36.2038 - mae: 3.5629
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - loss: 32.1258 - mae: 3.4779
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - loss: 29.4453 - mae: 3.2736
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - loss: 30.1866 - mae: 3.2653
Epoch 7/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - loss: 26.4510 - mae: 2.9902
Epoch 8/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - loss: 24.0737 - mae: 2.7947
Epoch 9/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

  model.init_sims(replace=True)
  super().__init__(**kwargs)


Epoch 1/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 23ms/step - loss: 88.0908 - mae: 5.1862
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - loss: 45.4965 - mae: 3.7810
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - loss: 37.5500 - mae: 3.6154
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - loss: 31.1833 - mae: 3.4210
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 27ms/step - loss: 31.2786 - mae: 3.4245
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 27ms/step - loss: 28.1149 - mae: 3.1561
Epoch 7/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step - loss: 26.6918 - mae: 3.0269
Epoch 8/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - loss: 20.9591 - mae: 2.6882
Epoch 9/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

  model.init_sims(replace=True)
  super().__init__(**kwargs)


Epoch 1/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 19ms/step - loss: 83.2243 - mae: 5.0656
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - loss: 42.9073 - mae: 3.6382
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - loss: 33.8114 - mae: 3.4779
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - loss: 30.2849 - mae: 3.3738
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - loss: 28.5081 - mae: 3.3083
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - loss: 27.1082 - mae: 3.1301
Epoch 7/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - loss: 25.8415 - mae: 2.9622
Epoch 8/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - loss: 24.5959 - mae: 2.8510
Epoch 9/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

  model.init_sims(replace=True)
  super().__init__(**kwargs)


Epoch 1/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 19ms/step - loss: 86.2430 - mae: 5.1628
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 46.6457 - mae: 3.8068
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - loss: 35.6740 - mae: 3.5391
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 32.0059 - mae: 3.4975
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 28.5972 - mae: 3.3121
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - loss: 27.2395 - mae: 3.1254
Epoch 7/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 26.2533 - mae: 2.9803
Epoch 8/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - loss: 24.1394 - mae: 2.7768
Epoch 9/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

  model.init_sims(replace=True)
  super().__init__(**kwargs)


Epoch 1/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 19ms/step - loss: 88.3868 - mae: 5.2439
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 42.7935 - mae: 3.6618
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - loss: 34.4756 - mae: 3.5135
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - loss: 30.8312 - mae: 3.4262
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - loss: 29.2763 - mae: 3.2861
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - loss: 26.7661 - mae: 3.0995
Epoch 7/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 25.1390 - mae: 2.9094
Epoch 8/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - loss: 22.6729 - mae: 2.7285
Epoch 9/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0



Accuracy Score: 0.3202312138728324

Average Accuracy: 0.3242896586121824


In [24]:
cv = KFold(n_splits=5, shuffle=True)
results_linear_regression = []
results_knn = []

count = 1
for traincv, testcv in cv.split(X):
    print("\n--------Fold {}--------\n".format(count))
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]

    train_essays = X_train['essay']
    test_essays = X_test['essay']

    sentences = []

    for essay in train_essays:
        # Obtaining all sentences from the training essays.
        sentences += essay_to_sentences(essay, remove_stopwords=True)

    # Initializing variables for word2vec model.
    num_features = 300
    min_word_count = 40
    num_workers = 4
    context = 10
    downsampling = 1e-3

    print("Training Word2Vec Model...")
    model = Word2Vec(sentences, workers=num_workers, vector_size=num_features, min_count=min_word_count, window=context, sample=downsampling)

    model.init_sims(replace=True)
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

    clean_train_essays = []

    # Generate training and testing data word vectors.
    for essay_v in train_essays:
        clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)

    clean_test_essays = []
    for essay_v in test_essays:
        clean_test_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))
    testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)

    trainDataVecs = np.array(trainDataVecs)
    testDataVecs = np.array(testDataVecs)
    # Reshaping train and test vectors to 3 dimensions. (1 represents one timestep)
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

    # Linear Regression
    linear_regression = LinearRegression()
    linear_regression.fit(trainDataVecs.squeeze(), y_train)
    y_pred_linear_regression = linear_regression.predict(testDataVecs.squeeze())
    acc_linear_regression = accuracy_score(y_test.values, np.around(y_pred_linear_regression))
    print("Linear Regression Accuracy Score: {}".format(acc_linear_regression))
    results_linear_regression.append(acc_linear_regression)

    # KNN Classifier
    knn_classifier = KNeighborsClassifier()
    knn_classifier.fit(trainDataVecs.squeeze(), y_train)
    y_pred_knn = knn_classifier.predict(testDataVecs.squeeze())
    acc_knn = accuracy_score(y_test.values, y_pred_knn)
    print("KNN Classifier Accuracy Score: {}".format(acc_knn))
    results_knn.append(acc_knn)

    count += 1

# Print average accuracy for Linear Regression and KNN Classifier
print("\nAverage Accuracy Linear Regression: {}".format(np.mean(results_linear_regression)))
print("Average Accuracy KNN Classifier: {}".format(np.mean(results_knn)))



--------Fold 1--------

Training Word2Vec Model...


  model.init_sims(replace=True)


Linear Regression Accuracy Score: 0.12942989214175654
KNN Classifier Accuracy Score: 0.3852080123266564

--------Fold 2--------

Training Word2Vec Model...


  model.init_sims(replace=True)


Linear Regression Accuracy Score: 0.1283236994219653
KNN Classifier Accuracy Score: 0.3714836223506744

--------Fold 3--------

Training Word2Vec Model...


  model.init_sims(replace=True)


Linear Regression Accuracy Score: 0.12369942196531791
KNN Classifier Accuracy Score: 0.36608863198458574

--------Fold 4--------

Training Word2Vec Model...


  model.init_sims(replace=True)


Linear Regression Accuracy Score: 0.12023121387283237
KNN Classifier Accuracy Score: 0.3880539499036609

--------Fold 5--------

Training Word2Vec Model...


  model.init_sims(replace=True)


Linear Regression Accuracy Score: 0.12177263969171484
KNN Classifier Accuracy Score: 0.36184971098265895

Average Accuracy Linear Regression: 0.12469137341871739
Average Accuracy KNN Classifier: 0.37453678550964725
