In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, cohen_kappa_score
from scipy.stats import spearmanr, pearsonr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, LSTM, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import json




In [3]:
# Relative locations of the project's working directories. They are plain
# strings resolved against whatever directory the notebook is launched from.
DATA_DIR = './data'
MODEL_DIR = './models'
RESULT_DIR = './results'

In [5]:
# Create every working directory up front; `exist_ok=True` makes the cell
# safe to re-run when the directories are already present.
for _working_dir in (DATA_DIR, MODEL_DIR, RESULT_DIR):
    os.makedirs(_working_dir, exist_ok=True)

In [9]:
def load_data(file_path, sep='\t', encoding='ISO-8859-1'):
    """Read a delimited text file into a pandas DataFrame.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``)
            of the file to read.
        sep: Field delimiter. Defaults to a tab character, so calling the
            function with only ``file_path`` reads a TSV file exactly as
            before.
        encoding: Text encoding used to decode the file. Defaults to
            ``'ISO-8859-1'``.

    Returns:
        The parsed file as a ``pandas.DataFrame``.

    Raises:
        Whatever ``pd.read_csv`` raises, e.g. ``FileNotFoundError`` for a
        missing path or ``UnicodeDecodeError`` for a wrong ``encoding``.
    """
    return pd.read_csv(file_path, sep=sep, encoding=encoding)

In [11]:
def preprocess_text(data):
    """Normalise the ``essay`` column: lower-case it and drop punctuation.

    The column is overwritten on the DataFrame that is passed in, and that
    same DataFrame object is returned.
    """
    essays_lowercased = data['essay'].str.lower()
    # Remove every character that is neither a word character (letter,
    # digit, underscore) nor whitespace.
    data['essay'] = essays_lowercased.str.replace(r'[^\w\s]', '', regex=True)
    return data

In [13]:
def tokenize_essays(texts, tokenizer, max_length=512):
    """Tokenize a batch of essays with the supplied tokenizer.

    Args:
        texts: The essays to encode. May be anything exposing ``.tolist()``
            (e.g. a pandas Series or NumPy array), any other iterable of
            strings (list, tuple, generator), or a single string, which is
            treated as a batch of one.
        tokenizer: Callable invoked as ``tokenizer(batch, max_length=...,
            padding=True, truncation=True, return_tensors="tf")``.
            NOTE(review): presumably a Hugging Face tokenizer, for which
            these options pad to the longest sequence in the batch, truncate
            to ``max_length`` and return TensorFlow tensors - confirm against
            the caller.
        max_length: Maximum sequence length forwarded to the tokenizer.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into single characters.
        batch = [texts]
    elif hasattr(texts, "tolist"):
        batch = texts.tolist()
    else:
        # Plain lists, tuples and generators have no ``tolist``.
        batch = list(texts)

    return tokenizer(
        batch,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="tf",
    )

In [15]:
# Metrics Calculation
def calculate_metrics(y_true, y_pred):
    """Compute regression and correlation metrics for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.

        Either argument may be a 1-D sequence or a single-column 2-D array of
        shape ``(n, 1)`` (the shape a single-output neural network typically
        returns from ``predict``). Single-column inputs are collapsed to 1-D
        before scoring, because ``pearsonr`` is documented for 1-D inputs.
        Inputs of any other shape are passed through unchanged.

    Returns:
        dict with the keys ``"MSE"`` (mean squared error), ``"R²"``
        (coefficient of determination), ``"Spearman"`` (rank correlation
        coefficient) and ``"Pearson"`` (linear correlation coefficient).
    """
    def _column_to_vector(values):
        # Collapse an (n, 1) column to shape (n,); leave everything else as is.
        arr = np.asarray(values)
        if arr.ndim == 2 and arr.shape[1] == 1:
            return arr[:, 0]
        return arr

    y_true = _column_to_vector(y_true)
    y_pred = _column_to_vector(y_pred)

    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    spearman = spearmanr(y_true, y_pred).correlation
    pearson = pearsonr(y_true, y_pred)[0]  # [0] is the coefficient, [1] the p-value
    return {"MSE": mse, "R²": r2, "Spearman": spearman, "Pearson": pearson}