In [1]:
# Importing necessary libraries and packages

import os
import re
import zipfile

import nltk
import numpy as np
import pandas as pd
import requests
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn import ensemble
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    cohen_kappa_score,
    explained_variance_score,
    mean_squared_error,
)
from sklearn.model_selection import KFold, train_test_split
from sklearn.svm import SVR
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
def download_dataset_file(url, destination_path):
    """
    Downloads a file from the specified URL and saves it to the given path.
    """
    if not os.path.exists(destination_path):
        response = requests.get(url)
        with open(destination_path, "wb") as file:
            file.write(response.content)


# Create a directory for the dataset
os.makedirs("Dataset", exist_ok=True)

download_dataset_file(
    "https://www.dropbox.com/scl/fi/0s4skjwowniwy1pn3uwaz/essay_scorer_dataset.zip?rlkey=uhcmx6z82llbkc7hs24ww97fl&st=l55tyxi3&dl=1",
    "Dataset/essay_scorer_dataset.zip",
)

In [3]:
# Function to unzip the dataset file
def extract_zip_file(zip_file_path=None):
    """
    Extracts the contents of a zip file into the 'Dataset' directory.
    """
    try:
        with zipfile.ZipFile(zip_file_path, "r") as z:
            z.extractall("Dataset")
            print("Extraction completed successfully.")
    except zipfile.BadZipFile:
        print("Error: Invalid zip file provided.")


extract_zip_file("Dataset/essay_scorer_dataset.zip")

Extraction completed successfully.


In [4]:
# Load dataset
training_data = pd.read_csv("Dataset/training_set.tsv", sep="\t", encoding="ISO-8859-1")

# Extract dependent variable
target_scores = training_data["domain1_score"]
essays_df = training_data.loc[:, ["essay_id", "essay_set", "essay", "domain1_score"]]

In [5]:
# Tokenize words from essays after cleaning text by removing non-alphabetic characters,
# converting to lowercase, and removing stopwords


def tokenize_words(essay_text):
    """
    Cleans and tokenizes words from the provided essay text.
    """
    clean_text = re.sub("[^a-zA-Z]", " ", essay_text)
    words = clean_text.lower().split()
    stop_words = set(stopwords.words("english"))
    filtered_words = [w for w in words if w not in stop_words]
    return filtered_words

In [6]:
# Tokenize sentences from essays and subsequently tokenize words within those sentences


def tokenize_sentences(essay_text):
    """
    Tokenizes sentences from the provided essay text, and then tokenizes words within those sentences.
    """
    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
    sentences = tokenizer.tokenize(essay_text.strip())
    tokenized_sentences = [
        tokenize_words(sentence) for sentence in sentences if len(sentence) > 0
    ]
    return tokenized_sentences

In [7]:
# Generate a feature vector for the provided words


def generate_feature_vector(words, model, num_features):
    """
    Generates a feature vector by averaging word vectors from the provided word list.
    """
    feature_vector = np.zeros((num_features,), dtype="float32")
    num_words = 0.0
    model_vocab = set(model.wv.index_to_key)

    for word in words:
        if word in model_vocab:
            num_words += 1
            feature_vector = np.add(feature_vector, model.wv[word])

    if num_words > 0:
        feature_vector = np.divide(feature_vector, num_words)

    return feature_vector

In [8]:
# Generate average feature vectors for a list of essays


def generate_avg_feature_vectors(essays, model, num_features):
    """
    Generates average feature vectors for each essay in the list of essays.
    """
    essay_feature_vectors = np.zeros((len(essays), num_features), dtype="float32")

    for i, essay_text in enumerate(essays):
        essay_feature_vectors[i] = generate_feature_vector(
            essay_text, model, num_features
        )

    return essay_feature_vectors

In [9]:
def build_lstm_model():
    """
    Builds and compiles an LSTM model for essay scoring.
    """
    lstm_model = Sequential()
    lstm_model.add(
        LSTM(
            300,
            dropout=0.4,
            recurrent_dropout=0.4,
            input_shape=[1, 300],
            return_sequences=True,
        )
    )
    lstm_model.add(LSTM(64, recurrent_dropout=0.4))
    lstm_model.add(Dropout(0.5))
    lstm_model.add(Dense(1, activation="relu"))

    lstm_model.compile(loss="mean_squared_error", optimizer="rmsprop", metrics=["mae"])
    lstm_model.summary()

    return lstm_model

In [10]:
# Applying k-fold cross-validation

cross_validator = KFold(n_splits=5, shuffle=True, random_state=42)
evaluation_results = []
predicted_scores = []

fold_counter = 1
for train_indices, test_indices in cross_validator.split(essays_df):

    print("\n------------ Fold {} ------------\n".format(fold_counter))
    train_set, test_set = essays_df.iloc[train_indices], essays_df.iloc[test_indices]
    y_train, y_test = (
        target_scores.iloc[train_indices],
        target_scores.iloc[test_indices],
    )

    train_essays = train_set["essay"]
    test_essays = test_set["essay"]

    # Tokenize sentences from training essays
    all_sentences = []
    for essay in train_essays:
        all_sentences += tokenize_sentences(essay)

    # Word2Vec model parameters
    vector_size = 300
    min_word_count = 40
    num_workers = 4
    context_window = 10
    downsampling = 1e-3

    print("Training Word2Vec model...")
    word2vec_model = Word2Vec(
        all_sentences,
        workers=num_workers,
        vector_size=vector_size,
        min_count=min_word_count,
        window=context_window,
        sample=downsampling,
    )

    word2vec_model.init_sims(replace=True)
    word2vec_model.wv.save_word2vec_format("word2vec_model.bin", binary=True)

    # Generate feature vectors for training and testing sets
    clean_train_essays = [tokenize_words(essay) for essay in train_essays]
    train_data_vectors = generate_avg_feature_vectors(
        clean_train_essays, word2vec_model, vector_size
    )

    clean_test_essays = [tokenize_words(essay) for essay in test_essays]
    test_data_vectors = generate_avg_feature_vectors(
        clean_test_essays, word2vec_model, vector_size
    )

    train_data_vectors = np.reshape(
        train_data_vectors,
        (train_data_vectors.shape[0], 1, train_data_vectors.shape[1]),
    )
    test_data_vectors = np.reshape(
        test_data_vectors, (test_data_vectors.shape[0], 1, test_data_vectors.shape[1])
    )

    lstm_model = build_lstm_model()
    lstm_model.fit(train_data_vectors, y_train, batch_size=64, epochs=50)
    predicted_y = lstm_model.predict(test_data_vectors)

    # Round predicted values to nearest integer
    predicted_y = np.around(predicted_y)

    """Evaluation metrics used:
    1. Mean squared error
    2. Explained variance score
    3. Cohen's kappa score
    Expected results: Minimum error, maximum variance, and maximum kappa score."""

    # Mean squared error
    print(
        "Mean squared error: {0:.2f}".format(
            mean_squared_error(y_test.values, predicted_y)
        )
    )

    # Explained variance score
    print(
        "Explained variance score: {0:.2f}".format(
            explained_variance_score(y_test.values, predicted_y)
        )
    )

    # Cohen's kappa score
    kappa_score = cohen_kappa_score(y_test.values, predicted_y, weights="quadratic")
    print("Cohen's Kappa Score: {0:.2f}".format(kappa_score))
    evaluation_results.append(kappa_score)

    fold_counter += 1


------------ Fold 1 ------------

Training Word2Vec model...


  word2vec_model.init_sims(replace=True)
  super().__init__(**kwargs)


Epoch 1/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - loss: 95.8304 - mae: 5.3998
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 45.2673 - mae: 3.7406
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 37.9313 - mae: 3.6536
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 31.9347 - mae: 3.5094
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 30.2155 - mae: 3.3645
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - loss: 27.3598 - mae: 3.0954
Epoch 7/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 26.7014 - mae: 3.0206
Epoch 8/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 21.9366 - mae: 2.7214
Epoch 9/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

  word2vec_model.init_sims(replace=True)
  super().__init__(**kwargs)


Epoch 1/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - loss: 87.4932 - mae: 5.2520
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - loss: 45.7191 - mae: 3.7325
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 35.7320 - mae: 3.5276
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - loss: 30.6826 - mae: 3.3705
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - loss: 30.7478 - mae: 3.3536
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 30.5234 - mae: 3.1944
Epoch 7/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 26.5052 - mae: 2.9648
Epoch 8/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 22.1246 - mae: 2.7226
Epoch 9/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

  word2vec_model.init_sims(replace=True)
  super().__init__(**kwargs)


Epoch 1/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - loss: 87.3216 - mae: 5.2189
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 46.1491 - mae: 3.8168
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 37.1777 - mae: 3.6397
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - loss: 32.9034 - mae: 3.5224
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - loss: 29.6104 - mae: 3.3094
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 27.0232 - mae: 3.0984
Epoch 7/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 25.8922 - mae: 2.9540
Epoch 8/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - loss: 22.2884 - mae: 2.7144
Epoch 9/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

  word2vec_model.init_sims(replace=True)
  super().__init__(**kwargs)


Epoch 1/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - loss: 87.0792 - mae: 5.1896
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - loss: 42.0736 - mae: 3.6845
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 33.5755 - mae: 3.4645
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 32.9684 - mae: 3.5781
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 28.0450 - mae: 3.2673
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 27.3234 - mae: 3.0977
Epoch 7/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - loss: 25.7078 - mae: 2.9767
Epoch 8/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 21.8448 - mae: 2.6973
Epoch 9/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

  word2vec_model.init_sims(replace=True)
  super().__init__(**kwargs)


Epoch 1/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - loss: 82.7172 - mae: 5.0215
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 44.1996 - mae: 3.7179
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - loss: 33.5021 - mae: 3.4335
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - loss: 30.0571 - mae: 3.3565
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 28.8367 - mae: 3.3017
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 25.5255 - mae: 3.0147
Epoch 7/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - loss: 23.2486 - mae: 2.8455
Epoch 8/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - loss: 21.7273 - mae: 2.6986
Epoch 9/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [11]:
print(
    "Average Cohen's Kappa score after 5-fold cross-validation: ",
    np.around(np.mean(evaluation_results), decimals=2),
)

Average Cohen's Kappa score after 5-fold cross-validation:  0.96


In [12]:
# Splitting dataset into training and test set and generating word embeddings for other models

X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    training_data, target_scores, test_size=0.25
)

train_essays_split = X_train_split["essay"]
test_essays_split = X_test_split["essay"]

sentences_split = []

for essay_split in train_essays_split:
    sentences_split += tokenize_sentences(essay_split)

# Initializing variables for Word2Vec model
num_features = 300
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

print("Training Word2Vec Model...")
word2vec_model = Word2Vec(
    sentences_split,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

word2vec_model.init_sims(replace=True)
word2vec_model.wv.save_word2vec_format("word2vec_model.bin", binary=True)

clean_train_essays_split = []

# Generate training and testing data word vectors
for essay_text_split in train_essays_split:
    clean_train_essays_split.append(tokenize_words(essay_text_split))
trainDataVecs_split = generate_avg_feature_vectors(
    clean_train_essays_split, word2vec_model, num_features
)

clean_test_essays_split = []
for essay_text_split in test_essays_split:
    clean_test_essays_split.append(tokenize_words(essay_text_split))
testDataVecs_split = generate_avg_feature_vectors(
    clean_test_essays_split, word2vec_model, num_features
)

trainDataVecs_split = np.array(trainDataVecs_split)
testDataVecs_split = np.array(testDataVecs_split)

Training Word2Vec Model...


  word2vec_model.init_sims(replace=True)


In [13]:
# Generating scores using Linear Regression Model

linear_regressor_split = LinearRegression()

linear_regressor_split.fit(trainDataVecs_split, y_train_split)

y_pred_split = linear_regressor_split.predict(testDataVecs_split)

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test_split, y_pred_split))

# Explained variance score: 1 is perfect prediction
print("Variance score: %.2f" % explained_variance_score(y_test_split, y_pred_split))

# Cohen's kappa score
print(
    "Kappa Score: {0:.2f}".format(
        cohen_kappa_score(
            y_test_split.values, np.around(y_pred_split), weights="quadratic"
        )
    )
)

Mean squared error: 19.76
Variance score: 0.75
Kappa Score: 0.86


In [14]:
# Generating scores using Gradient Boosting Regressor

gbr_split = ensemble.GradientBoostingRegressor(
    alpha=0.9,
    criterion="friedman_mse",
    init=None,
    learning_rate=0.1,
    loss="squared_error",
    max_depth=2,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=1000,
    random_state=None,
    subsample=1.0,
    verbose=0,
    warm_start=False,
)
gbr_split.fit(trainDataVecs_split, y_train_split)
y_pred_split = gbr_split.predict(testDataVecs_split)

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test_split, y_pred_split))

# Explained variance score: 1 is perfect prediction
print("Variance score: %.2f" % explained_variance_score(y_test_split, y_pred_split))

# Cohen's kappa score
print(
    "Kappa Score: {0:.2f}".format(
        cohen_kappa_score(
            y_test_split.values, np.around(y_pred_split), weights="quadratic"
        )
    )
)

Mean squared error: 6.83
Variance score: 0.91
Kappa Score: 0.96


In [15]:
# Generating scores using Support Vector Regression (SVR)

svr_split = SVR(
    C=100,
    cache_size=200,
    coef0=0.0,
    degree=3,
    epsilon=0.1,
    gamma=0.1,
    kernel="rbf",
    max_iter=-1,
    shrinking=True,
    tol=0.001,
    verbose=False,
)
svr_split.fit(trainDataVecs_split, y_train_split)
y_pred_split = svr_split.predict(testDataVecs_split)

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test_split, y_pred_split))

# Explained variance score: 1 is perfect prediction
print("Variance score: %.2f" % explained_variance_score(y_test_split, y_pred_split))

# Cohen's kappa score
print(
    "Kappa Score: {0:.2f}".format(
        cohen_kappa_score(
            y_test_split.values, np.around(y_pred_split), weights="quadratic"
        )
    )
)

Mean squared error: 28.94
Variance score: 0.64
Kappa Score: 0.74


In [16]:
# As LSTM outperforms all other models, using it for predicting the scores for the final dataset
validation_set = pd.read_csv("Dataset/valid_set.tsv", sep="\t", encoding="ISO-8859-1")

In [17]:
validation_set = validation_set.drop(["domain2_predictionid"], axis=1)

In [18]:
valid_test_essays = validation_set["essay"]

In [19]:
sentences_valid = []

for valid_essay in valid_test_essays:
    sentences_valid += tokenize_sentences(valid_essay)

print("Training Word2Vec Model...")
word2vec_model = Word2Vec(
    sentences_valid,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

word2vec_model.init_sims(replace=True)
word2vec_model.wv.save_word2vec_format("word2vec_model.bin", binary=True)

clean_valid_test_essays = []

# Generate testing data word vectors
for essay_text_valid in valid_test_essays:
    clean_valid_test_essays.append(tokenize_words(essay_text_valid))
valid_testDataVecs = generate_avg_feature_vectors(
    clean_valid_test_essays, word2vec_model, num_features
)

valid_testDataVecs = np.array(valid_testDataVecs)
# Reshaping test vectors to 3 dimensions (1 represents one timestep)
valid_testDataVecs = np.reshape(
    valid_testDataVecs, (valid_testDataVecs.shape[0], 1, valid_testDataVecs.shape[1])
)

predicted_scores_valid = lstm_model.predict(valid_testDataVecs)

# Round predicted scores to the nearest integer
predicted_scores_valid = np.around(predicted_scores_valid)

Training Word2Vec Model...


  word2vec_model.init_sims(replace=True)


[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [20]:
submission = validation_set.drop(["essay"], axis=1)

In [21]:
predicted_score_series = pd.Series(
    [score for sublist in predicted_scores_valid for score in sublist]
)

In [22]:
submission = (
    pd.concat([submission, predicted_score_series], axis=1)
    .rename(columns={0: "predicted_score"})
    .iloc[:, [2, 0, 1, 3]]
)
submission.to_excel("Submission.xlsx", index=False)