In [106]:
import os
from collections.abc import Mapping

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from scikeras.wrappers import KerasClassifier, KerasRegressor
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from skopt import BayesSearchCV
from tensorflow.keras.layers import Dense, LSTM, Input, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertModel

In [108]:
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa with quadratic weights for two sets of scores.

    Both inputs are rounded to the nearest integer and cast to ``int`` before
    being compared, so continuous regression outputs can be scored against
    integer ratings.

    Args:
        y_true: Reference scores (array-like of numbers).
        y_pred: Predicted scores (array-like of numbers).

    Returns:
        The value returned by ``cohen_kappa_score(..., weights='quadratic')``.
    """
    reference, predicted = (
        np.round(scores).astype(int) for scores in (y_true, y_pred)
    )
    return cohen_kappa_score(reference, predicted, weights='quadratic')

In [110]:
def load_data(file_path):
    """Read a tab-separated file into a DataFrame.

    Args:
        file_path: Path (or any source ``pandas.read_csv`` accepts) of the
            TSV file to read.

    Returns:
        pandas.DataFrame parsed from the file, decoded as ISO-8859-1.
    """
    reader_options = {'sep': '\t', 'encoding': 'ISO-8859-1'}
    frame = pd.read_csv(file_path, **reader_options)
    return frame

In [112]:
def preprocess_text(data):
    """Return a copy of ``data`` whose ``essay`` column is clean text.

    Every essay is converted to ``str`` and stripped of leading/trailing
    whitespace. Missing essays become the empty string instead of the literal
    text ``"nan"``/``"None"`` that a bare ``astype(str)`` would produce.

    The input frame is left untouched, so the notebook cell that calls this
    function can be re-run safely and earlier displays of the raw data stay
    accurate.

    Args:
        data: pandas.DataFrame containing an ``essay`` column.

    Returns:
        A new pandas.DataFrame with the cleaned ``essay`` column.

    Raises:
        KeyError: If ``data`` has no ``essay`` column.
    """
    cleaned = data.copy()
    essays = cleaned['essay']
    missing = essays.isna()
    text = essays.astype(str).str.strip()
    # Blank out rows that were missing before the cast so that placeholder
    # strings are never fed to the tokenizer as if they were essay text.
    cleaned['essay'] = text.mask(missing, '')
    return cleaned

In [128]:
def tokenize_essays(texts, tokenizer, max_length=512):
    """Tokenize essays into fixed-length inputs using a BERT tokenizer.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.
    Padding to the longest essay in the batch (``padding=True``) would make
    the tensor width depend on the data, while the model's ``Input`` layers
    expect a fixed sequence length.

    Args:
        texts: Essays to encode: a pandas Series / NumPy array (anything with
            ``tolist()``), any other iterable of strings, or a single string.
        tokenizer: Callable tokenizer accepting a list of strings plus the
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``
            keyword arguments (e.g. a Hugging Face ``BertTokenizer``).
        max_length: Number of tokens every encoded essay is padded or
            truncated to.

    Returns:
        The object returned by ``tokenizer``; it is indexed here with the
        ``'input_ids'`` and ``'attention_mask'`` keys.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into single characters.
        essays = [texts]
    elif hasattr(texts, 'tolist'):
        essays = texts.tolist()
    else:
        essays = list(texts)

    print("Starting tokenization...")
    encodings = tokenizer(
        essays,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    print("Tokenization completed.")
    print(f"Sample input_ids: {encodings['input_ids'][0]}")
    print(f"Sample attention_mask: {encodings['attention_mask'][0]}")
    return encodings

In [130]:
def build_bert_lstm_model(learning_rate=1e-4, lstm_units=128, dropout_rate=0.3, max_length=512):
    """Build and compile a frozen-BERT + LSTM regression model.

    Architecture: ``bert-base-uncased`` token embeddings (not trainable) ->
    LSTM (final state only) -> Dropout -> single linear output unit.

    Args:
        learning_rate: Learning rate for the Adam optimizer.
        lstm_units: Number of LSTM units; cast to ``int`` so NumPy integers
            coming from a hyperparameter search are accepted.
        dropout_rate: Dropout rate applied to the LSTM output.
        max_length: Sequence length of both inputs. It must equal the length
            the essays were padded/truncated to during tokenization.

    Returns:
        A compiled Keras ``Model`` taking ``[input_ids, attention_mask]`` and
        trained with MSE loss (MAE reported as a metric).
    """
    # Load pre-trained BERT model
    bert_model = TFBertModel.from_pretrained('bert-base-uncased')
    for layer in bert_model.layers:
        layer.trainable = False  # Freeze BERT layers for efficiency

    # Input layers (named after the tokenizer output keys)
    input_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    # BERT embeddings: one hidden vector per token position
    bert_output = bert_model(input_ids, attention_mask=attention_mask).last_hidden_state

    # LSTM layer. Without a mask the LSTM keeps updating its state on the
    # padding positions, so the final state returned for a short essay would
    # be dominated by padding rather than by the essay's last real token.
    padding_mask = tf.cast(attention_mask, tf.bool)
    lstm_output = LSTM(int(lstm_units), return_sequences=False)(bert_output, mask=padding_mask)

    # Dense regression head
    dropout = Dropout(dropout_rate)(lstm_output)
    output = Dense(1, activation='linear')(dropout)

    # Model definition
    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=['mae'])

    return model


In [132]:
def _split_stacked_inputs(stacked):
    """Split a ``(n, 2, seq_len)`` array back into ``[input_ids, attention_mask]``."""
    return [stacked[:, 0], stacked[:, 1]]


class _BertInputRegressor(KerasRegressor):
    """KerasRegressor that feeds one stacked feature array to a two-input model."""

    @property
    def feature_encoder(self):
        # SciKeras applies this transformer to X right before calling Keras,
        # which lets scikit-learn slice a single array during cross-validation
        # while the model still receives its two separate inputs.
        return FunctionTransformer(func=_split_stacked_inputs)


def hyperparameter_tuning(X, y, tokenizer):
    """Perform hyperparameter tuning using Bayesian Optimization.

    Args:
        X: Either a mapping with ``'input_ids'`` and ``'attention_mask'``
            entries of equal shape ``(n_samples, seq_len)``, or an array
            already stacked as ``(n_samples, 2, seq_len)`` in that order.
        y: Target scores, one per sample.
        tokenizer: Unused; kept so existing callers keep working.

    Returns:
        The best parameter set found (``best_params_`` of the search), with
        keys ``learning_rate``, ``lstm_units``, ``dropout_rate``,
        ``batch_size`` and ``epochs``.

    Note:
        Every candidate/fold builds a fresh model through
        ``build_bert_lstm_model``, so this call is expensive.
    """
    def model_fn(learning_rate, lstm_units, dropout_rate):
        return build_bert_lstm_model(
            learning_rate=learning_rate,
            lstm_units=int(lstm_units),
            dropout_rate=dropout_rate
        )

    # scikit-learn's CV splitters index X by sample; a dict of tensors cannot
    # be split that way, so stack both inputs into one indexable array.
    if isinstance(X, Mapping):
        X = np.stack(
            [np.asarray(X['input_ids']), np.asarray(X['attention_mask'])],
            axis=1
        )

    # The tunable build arguments must be declared on the wrapper; otherwise
    # SciKeras rejects them when the search calls ``set_params``. ``model`` is
    # the current name of the legacy ``build_fn`` argument.
    model = _BertInputRegressor(
        model=model_fn,
        learning_rate=1e-4,
        lstm_units=128,
        dropout_rate=0.3,
        verbose=0
    )

    # Hyperparameter search space
    search_space = {
        'learning_rate': (1e-5, 1e-3, 'log-uniform'),
        'lstm_units': (64, 256),
        'dropout_rate': (0.2, 0.5),
        'batch_size': (8, 16),
        'epochs': (3, 10)
    }

    # Bayesian optimization
    bayes_search = BayesSearchCV(
        estimator=model,
        search_spaces=search_space,
        n_iter=10,
        cv=2,
        scoring='neg_mean_squared_error',
        random_state=42,
        verbose=1
    )

    # Fit and find the best hyperparameters
    bayes_search_result = bayes_search.fit(X, y)

    print(f"Best parameters: {bayes_search_result.best_params_}")
    # The scorer is negated MSE, so flip the sign back for reporting.
    print(f"Best MSE: {-bayes_search_result.best_score_}")

    return bayes_search_result.best_params_

In [134]:
def plot_training_history(history):
    """Plot the training loss, and the validation loss when it was recorded.

    Args:
        history: Object with a ``history`` dict holding a ``'loss'`` list and,
            optionally, a ``'val_loss'`` list (e.g. the return value of a
            Keras ``fit`` call).
    """
    metrics = history.history
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(metrics['loss'], label='Training Loss')
    # 'val_loss' is absent when the model was fit without validation data;
    # plot what is available instead of failing with a KeyError.
    if 'val_loss' in metrics:
        ax.plot(metrics['val_loss'], label='Validation Loss')
    ax.set_title('Training and Validation Loss')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(alpha=0.5)
    plt.show()

In [136]:
# Location of the tab-separated training file, relative to the notebook.
file_path = '../data/training_set_rel3.tsv' 
# Parse the TSV into a DataFrame (load_data reads it as ISO-8859-1).
data = load_data(file_path)
# Show the first rows as a quick sanity check of the parsed columns.
data.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [138]:
# Normalise the 'essay' column (cast to str, strip surrounding whitespace).
data = preprocess_text(data)

In [None]:
# Load the vocabulary matching the 'bert-base-uncased' model used by
# build_bert_lstm_model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Encode every essay; max_length is left at tokenize_essays' default of 512.
encodings = tokenize_essays(data['essay'], tokenizer)

Starting tokenization...


In [None]:
# Model inputs, keyed by the names of the model's two Input layers.
X = {'input_ids': encodings['input_ids'], 'attention_mask': encodings['attention_mask']}
# Regression target: the 'domain1_score' column as a NumPy array.
# NOTE(review): assumes this column has no missing values - TODO confirm.
y = data['domain1_score'].values 

In [None]:
# train_test_split cannot split the dict X directly: it sees len(X) == 2
# (the number of keys) instead of the number of essays and raises an
# inconsistent-length error. Split row indices instead, then slice every
# input array with the same indices so inputs and targets stay aligned.
row_indices = np.arange(len(y))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_indices, y, test_size=0.2, random_state=42
)
X_train = {name: np.asarray(values)[train_idx] for name, values in X.items()}
X_test = {name: np.asarray(values)[test_idx] for name, values in X.items()}

In [None]:
# Search for the best hyperparameters on the training split only, so the test
# split stays unseen. The tokenizer argument is accepted by
# hyperparameter_tuning but not used by it.
best_params = hyperparameter_tuning(X_train, y_train, tokenizer)

In [None]:
# Rebuild the model with the tuned hyperparameters. The search returns NumPy
# scalars, so cast them to built-in types, as is already done for lstm_units
# inside the tuning loop.
final_model = build_bert_lstm_model(
    learning_rate=float(best_params['learning_rate']),
    lstm_units=int(best_params['lstm_units']),
    dropout_rate=float(best_params['dropout_rate'])
)

In [None]:
# Train the final model with the tuned batch size and epoch count.
# validation_split holds out part of the training data so that 'val_loss' is
# recorded in history for plot_training_history.
history = final_model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=best_params['batch_size'],
    epochs=best_params['epochs'],
    verbose=1
)

In [None]:
# Create the output directory first; saving fails if 'models/' does not exist.
os.makedirs('models', exist_ok=True)
final_model.save('models/bert_lstm_best_model.h5')

In [None]:
# The single-unit Dense head yields predictions of shape (n_samples, 1).
# Flatten to 1-D so the predictions line up with y_test in the metric
# functions (pearsonr in particular expects matching 1-D inputs).
y_pred = final_model.predict(X_test).ravel()

In [None]:
# Regression error and goodness of fit on the held-out test split.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Rank and linear correlation; the second return value of each call is
# discarded.
spearman_corr, _ = spearmanr(y_test, y_pred)
pearson_corr, _ = pearsonr(y_test, y_pred)
# Agreement after rounding both score vectors to integers.
qwk = quadratic_weighted_kappa(y_test, y_pred)

In [None]:
# Report every evaluation metric rounded to two decimals.
print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"Spearman Correlation: {spearman_corr:.2f}")
print(f"Pearson Correlation: {pearson_corr:.2f}")
print(f"Quadratic Weighted Kappa: {qwk:.2f}")

In [None]:
# Visualise the loss curves recorded while fitting final_model.
plot_training_history(history)