Load the datasets and perform exploratory data analysis

In [2]:
# !pip install transformers[torch] transformers torch pandas numpy scikit-learn tqdm docx

In [14]:
import pandas as pd
from datasets import Dataset
import numpy as np
from datasets import load_from_disk, concatenate_datasets
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    AutoModelForSequenceClassification,
)

from bert_scoring_model import BertScoringModel


from qwk import quadratic_weighted_kappa

import os
import torch
from score_essays import score_essay
from copy import deepcopy
from docx import Document

from tqdm.notebook import tqdm
tqdm.pandas()

In [15]:
train_data = pd.read_csv("data/train_rel_2.tsv", sep="\t")
test_data = pd.read_csv("data/public_leaderboard_rel_2.tsv", sep="\t")
test_data_scores = pd.read_csv("data/public_leaderboard_solution.csv")

Test set descriptions and scores are in different files, need to be merged.

In [16]:
test_data.head()

Unnamed: 0,Id,EssaySet,EssayText
0,1673,1,The procedures I think they should have includ...
1,1674,1,"In order to replicate this experiment, you wou..."
2,1675,1,"In order to replicate their experiment, you wo..."
3,1676,1,Pleace a simple of one material into one conta...
4,1677,1,Determin the mass of four different samples ma...


In [17]:
test_data_scores.head()

Unnamed: 0,Id,EssaySet,EssayWeights,Score1,Usage
0,1673,1,1,1,PublicTest
1,1674,1,1,1,PublicTest
2,1675,1,1,3,PublicTest
3,1676,1,1,0,PublicTest
4,1677,1,1,0,PublicTest


In [18]:
test_data = test_data.merge(test_data_scores, on=['Id',  'EssaySet'],  how='left')
test_data.drop('Usage', axis=1, inplace=True)
test_data.head()

Unnamed: 0,Id,EssaySet,EssayText,EssayWeights,Score1
0,1673,1,The procedures I think they should have includ...,1,1
1,1674,1,"In order to replicate this experiment, you wou...",1,1
2,1675,1,"In order to replicate their experiment, you wo...",1,3
3,1676,1,Pleace a simple of one material into one conta...,1,0
4,1677,1,Determin the mass of four different samples ma...,1,0


In [19]:
train_data.drop('Score2', axis=1, inplace=True)
train_data.head()

Unnamed: 0,Id,EssaySet,Score1,EssayText
0,1,1,1,Some additional information that we would need...
1,2,1,1,"After reading the expirement, I realized that ..."
2,3,1,1,"What you need is more trials, a control set up..."
3,4,1,0,The student should list what rock is better an...
4,5,1,2,For the students to be able to make a replicat...


In [20]:
train_data.describe()

Unnamed: 0,Id,EssaySet,Score1
count,17043.0,17043.0,17043.0
mean,13820.561286,5.609576,0.926245
std,8256.441258,2.822468,0.893137
min,1.0,1.0,0.0
25%,6275.5,3.0,0.0
50%,14270.0,6.0,1.0
75%,20928.5,8.0,2.0
max,27588.0,10.0,3.0


In [21]:
train_data['Score1'].value_counts()

Score1
0    6731
1    5579
2    3992
3     741
Name: count, dtype: int64

In [22]:
train_data.head()

Unnamed: 0,Id,EssaySet,Score1,EssayText
0,1,1,1,Some additional information that we would need...
1,2,1,1,"After reading the expirement, I realized that ..."
2,3,1,1,"What you need is more trials, a control set up..."
3,4,1,0,The student should list what rock is better an...
4,5,1,2,For the students to be able to make a replicat...


Cross-validation split and train

In [23]:
NUM_ESSAY_SETS = max(train_data['EssaySet']) + 1

In [24]:
def load_docx_files_in_order(directory):
    task_descriptions = {}
    
    filenames = [filename for filename in os.listdir(directory) if filename.startswith("Data Set") and filename.endswith("--ReadMeFirst.docx")]
    
    filenames.sort(key=lambda x: int(x.split('#')[1].split('--')[0].strip()))

    for filename in filenames:
        file_path = os.path.join(directory, filename)
        doc = Document(file_path)
        
        text = []
        for para in doc.paragraphs:
            text.append(para.text.strip())
            
        task_descriptions[filename] = "\n".join(text)
    
    ordered_descriptions = {i + 1: task_descriptions[filename] for i, filename in enumerate(filenames)}
    
    return ordered_descriptions

directory_path = 'task_descriptions/'
task_descriptions = load_docx_files_in_order(directory_path)


In [25]:
train_data['Grading Scale'] = train_data.groupby('EssaySet')['Score1'].transform(lambda x: f"{x.min()} to {x.max()}")
test_data['Grading Scale'] = test_data.groupby('EssaySet')['Score1'].transform(lambda x: f"{x.min()} to {x.max()}")

train_data['EssayText'] = train_data.apply(lambda row: f"Task description: {task_descriptions[row['EssaySet']]}. Grade this student's answer on a scale {row['Grading Scale']}, taking into account grammar, lexical variability, and task relevance. Student's answer: {row['EssayText']}", axis=1)
test_data['EssayText'] = test_data.apply(lambda row: f"Grade this answer on a scale {row['Grading Scale']}: {row['EssayText']}", axis=1)

In [28]:
train_data

Unnamed: 0,Id,EssaySet,Score1,EssayText,Grading Scale
0,1,1,1,Task description: Data Set #1\nPrompt—Acid Rai...,0 to 3
1,2,1,1,Task description: Data Set #1\nPrompt—Acid Rai...,0 to 3
2,3,1,1,Task description: Data Set #1\nPrompt—Acid Rai...,0 to 3
3,4,1,0,Task description: Data Set #1\nPrompt—Acid Rai...,0 to 3
4,5,1,2,Task description: Data Set #1\nPrompt—Acid Rai...,0 to 3
...,...,...,...,...,...
17038,27584,10,1,Task description: Data Set #10\nPrompt—Doghous...,0 to 2
17039,27585,10,1,Task description: Data Set #10\nPrompt—Doghous...,0 to 2
17040,27586,10,1,Task description: Data Set #10\nPrompt—Doghous...,0 to 2
17041,27587,10,1,Task description: Data Set #10\nPrompt—Doghous...,0 to 2


In [29]:
batch_size = 32
epochs = 50
patience = 5

In [None]:
all_tokenized_train_datasets = []
all_tokenized_val_datasets = []

for essay_set in range(1, NUM_ESSAY_SETS):
    
    split_ratio = 0.8
    
    essay_set_train_data = train_data[train_data['EssaySet'] == essay_set].sample(frac=split_ratio)
    
    essay_set_validation_data = train_data[train_data['EssaySet'] == essay_set].drop(essay_set_train_data.index)
    
    train_dataset = Dataset.from_dict({
        "EssayText" : essay_set_train_data["EssayText"].values,
        "Id": essay_set_train_data["Score1"].values,
        "EssaySet": essay_set_train_data["EssaySet"].values,
        "Score1": essay_set_train_data["Score1"].values,
    })
    
    validation_dataset = Dataset.from_dict({
        "EssayText" : essay_set_validation_data["EssayText"].values,
        "Id": essay_set_validation_data["Score1"].values,
        "EssaySet": essay_set_validation_data["EssaySet"].values,
        "Score1": essay_set_validation_data["Score1"].values,
    })

    if essay_set == 1:
        num_of_labels = train_data['Score1'].nunique()
        scoring_model = BertScoringModel(num_labels=num_of_labels)

    tokenized_train_dataset = scoring_model.get_tokenized_dataset(
        train_dataset, is_train=True, essay_set=essay_set
    )
    
    tokenized_val_dataset = scoring_model.get_tokenized_dataset(
        validation_dataset, is_train=False, essay_set=essay_set
    )
    
    all_tokenized_train_datasets.append(tokenized_train_dataset)
    all_tokenized_val_datasets.append(tokenized_val_dataset)

combined_train_dataset = concatenate_datasets(all_tokenized_train_datasets)
combined_val_dataset = concatenate_datasets(all_tokenized_val_datasets)

evaluation_results = scoring_model.train(
    combined_train_dataset, combined_val_dataset, essay_set='all_sets', 
    batch_size=batch_size, epochs=epochs, patience=patience
)

print(f"Final evaluation results: {evaluation_results}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing and saving dataset to data/tokenized_data/tokenized_set1_train


Map:   0%|          | 0/1338 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1338 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set1_test


Map:   0%|          | 0/334 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/334 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set2_train


Map:   0%|          | 0/1022 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1022 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set2_test


Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/256 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set3_train


Map:   0%|          | 0/1446 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1446 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set3_test


Map:   0%|          | 0/362 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/362 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set4_train


Map:   0%|          | 0/1326 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1326 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set4_test


Map:   0%|          | 0/331 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/331 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set5_train


Map:   0%|          | 0/1436 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1436 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set5_test


Map:   0%|          | 0/359 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/359 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set6_train


Map:   0%|          | 0/1438 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1438 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set6_test


Map:   0%|          | 0/359 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/359 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set7_train


Map:   0%|          | 0/1439 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1439 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set7_test


Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/360 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set8_train


Map:   0%|          | 0/1439 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1439 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set8_test


Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/360 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set9_train


Map:   0%|          | 0/1438 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1438 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set9_test


Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/360 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set10_train


Map:   0%|          | 0/1312 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1312 [00:00<?, ? examples/s]

Tokenizing and saving dataset to data/tokenized_data/tokenized_set10_test


Map:   0%|          | 0/328 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/328 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.9831,0.964497
2,0.9826,0.972375
3,0.9626,0.946157
4,0.941,0.95043
5,0.9002,0.953919


Generate predictions

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

best_model_path = "best_models/bert-base-uncased_setall_sets/"

model = AutoModelForSequenceClassification.from_pretrained(best_model_path)
    
model.to(device)
model.eval()

def score_essay(essay_text):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    inputs = tokenizer(
            essay_text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_score = torch.argmax(logits, dim=-1).item()
    return predicted_score

In [None]:
test_data['Predicted Score'] = test_data.progress_apply(lambda row: score_essay(row['EssayText']), axis=1)

In [None]:
test_data

In [32]:
test_data.to_csv('data/test_set_with_predictions.csv')

In [35]:
qwk_list = []

test_set = pd.read_csv("data/test_set_with_predictions.csv")
for essay_set in range(1, NUM_ESSAY_SETS):

    test_set_filtered = test_set[test_set['EssaySet'] == essay_set]

    unique_labels = test_set_filtered['Score1'].nunique()
    
    qwk = quadratic_weighted_kappa(
        test_set_filtered['Score1'], 
        test_set_filtered['Predicted Score'], 
        min_rating=0, 
        max_rating=unique_labels
    )
    
    print(f"Essay Set {essay_set}: QWK = {qwk}")
    qwk_list.append(qwk)

print("Avg QWK:", np.mean(qwk_list))

Essay Set 1: QWK = 0.76069976299373
Essay Set 2: QWK = 0.6816282557106876
Essay Set 3: QWK = 0.5194318405997238
Essay Set 4: QWK = 0.6142497904442581
Essay Set 5: QWK = 0.6168183788469874
Essay Set 6: QWK = 0.7284598736546561
Essay Set 7: QWK = 0.6582621901165655
Essay Set 8: QWK = 0.5477517083193959
Essay Set 9: QWK = 0.8039125437195256
Essay Set 10: QWK = 0.6973165725244594
Avg QWK: 0.6628530916929989
