In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found there
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import AutoTokenizer, Trainer, TrainingArguments, AdamW, AutoModelForSequenceClassification, DataCollatorWithPadding, AddedToken

In [None]:
# Read the competition training set and end the cell with the frame so the
# notebook renders its default preview
TRAIN_CSV_PATH = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv'
data = pd.read_csv(TRAIN_CSV_PATH)
data

In [None]:
# Remove essay_id column.
# Reassigning instead of dropping in place, together with errors='ignore', keeps
# this cell idempotent: running it again after the column is already gone no
# longer raises KeyError.
data = data.drop(columns='essay_id', errors='ignore')

In [None]:
# import nltk
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

# lemmatizer = WordNetLemmatizer()
# stop_words = set(stopwords.words('english'))

In [None]:
# import zipfile

# # Unzip wordnet.zip
# zip_file_path = '/usr/share/nltk_data/corpora/wordnet.zip'
# extract_to_path = '/usr/share/nltk_data/corpora/'

# with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_to_path)

# print("Unzipping completed successfully.")

In [None]:
# def preprocess_text(text):
#     tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
#     tokens = [token for token in tokens if token.isalpha() and token not in stop_words]  # Filter out non-alphabetic tokens and stopwords
#     lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatize tokens
#     return ' '.join(lemmatized_tokens)

# # Apply preprocessing to the 'full_text' column
# data['full_text'] = data['full_text'].apply(preprocess_text)

In [None]:
# Tokenizer that belongs to the DeBERTa-v3-large checkpoint
MODEL_CHECKPOINT = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Register the newline character and the two-space string as extra tokens.
# NOTE(review): presumably added so paragraph breaks and spacing in the essays
# survive tokenization instead of being normalized away — confirm.
# The model's embedding matrix has to be resized after this (done where the model is loaded).
tokenizer.add_tokens([AddedToken("\n", normalized=False)])
tokenizer.add_tokens([AddedToken(" "*2, normalized=False)])

In [None]:
from transformers import AutoModel
# Sequence-classification model with six output classes on top of DeBERTa-v3-large
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-large",
    num_labels=6,
)
# Resize the embedding matrix to the tokenizer's current vocabulary size
# (the tokenizer had extra tokens added to it before this cell)
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Tokenize and encode the data
def tokenize_data(text_list):
    """Encode a list of texts with the notebook-level ``tokenizer``.

    Truncation and padding are both enabled, the length cap is 1024 tokens,
    and the encodings are requested as PyTorch tensors.

    :param text_list: list of raw text strings
    :return: whatever ``tokenizer(...)`` returns for the batch
    """
    encode_options = {
        "truncation": True,
        "padding": True,
        "max_length": 1024,
        "return_tensors": 'pt',
    }
    return tokenizer(text_list, **encode_options)

In [None]:
# Hold out 20% of the essays for evaluation; the split is stratified on the
# score column and uses a fixed random_state so it is repeatable
train_texts, test_texts, y_train, y_test = train_test_split(
    data['full_text'],
    data['score'],
    test_size=0.2,
    random_state=42,
    stratify=data['score'],
)

In [None]:
# Turn each split into a plain list of strings, then encode it with the shared helper
train_text_list = train_texts.tolist()
test_text_list = test_texts.tolist()
train_encodings = tokenize_data(train_text_list)
test_encodings = tokenize_data(test_text_list)

In [None]:
# Wrap the raw score values of each split in tensors (no values are changed here)
train_labels, test_labels = (
    torch.tensor(scores.values) for scores in (y_train, y_test)
)

In [None]:
# Map the raw scores onto the contiguous class ids 0 .. num_labels - 1 used by the
# classification head.
# Clamping alone does not do that: when the scores do not start at 0 it folds the
# highest score into its neighbour and leaves class 0 unused. Shifting by the
# smallest score keeps every score a distinct class; when the scores already start
# at 0 the shift is a no-op.
# The tensors are rebuilt from y_train / y_test so re-running this cell is safe.
num_labels = 6
label_offset = int(data['score'].min())  # add this back to a predicted class id to recover a score
train_labels = torch.tensor(y_train.values) - label_offset
test_labels = torch.tensor(y_test.values) - label_offset

# Fail loudly instead of silently merging classes if the scores do not fit the head
assert 0 <= int(train_labels.min()) and int(train_labels.max()) < num_labels, \
    "training scores span more than num_labels classes"
assert 0 <= int(test_labels.min()) and int(test_labels.max()) < num_labels, \
    "test scores span more than num_labels classes"

In [None]:
# Custom Dataset class
class CustomDataset(Dataset):
    """Map-style dataset pairing a mapping of encoded fields with one label per example."""

    def __init__(self, encodings, labels):
        # Both inputs are stored as given; individual rows are sliced in __getitem__
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Take entry `idx` from every encoding field, then attach the label under 'labels'
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = self.labels[idx]
        return sample

In [None]:
# Pair each split's encodings with its labels
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
test_dataset = CustomDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# Custom Trainer class for OLL
class OLL2Trainer(Trainer):
    """Trainer whose loss is a squared-distance-weighted ordinal log loss.

    For a sample with true class ``y`` the loss is
    ``sum_i -log(1 - p_i) * (i - y) ** 2`` where ``p`` is the softmax of the
    logits; the per-sample values are averaged over the batch. The true class
    has distance 0 and therefore contributes nothing; probability mass placed
    on classes far from ``y`` is penalised most.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the ordinal loss for one batch.

        :param model: the model to call; it is invoked as ``model(**inputs)`` and
            must return an object with a ``logits`` attribute of shape
            (batch, num_classes)
        :param inputs: mapping of model inputs; must contain ``"labels"`` holding
            one integer class id per sample
        :param return_outputs: when true, return ``(loss, outputs)`` instead of
            just the loss
        :param num_items_in_batch: accepted so that Trainer versions which pass
            this argument can call the override; it is not used by this loss
        :return: scalar loss tensor, or ``(loss, outputs)``
        """
        labels = inputs["labels"]
        outputs = model(**inputs)
        logits = outputs.logits

        # Read the class count and device from the logits rather than from
        # model.config / model.device: a wrapped model (e.g. torch.nn.DataParallel)
        # does not expose those attributes.
        num_classes = logits.shape[-1]
        class_ids = torch.arange(num_classes, device=logits.device)

        # distances[b, i] = (i - labels[b]) ** 2, built in one vectorized step on
        # the logits' device (no per-sample .item() calls, no Python loops).
        # It is a constant weight, so it needs no gradient.
        label_column = labels.to(logits.device).view(-1, 1)
        distances = (class_ids.unsqueeze(0) - label_column).pow(2).to(torch.float32)

        probas = F.softmax(logits, dim=1)
        # Keep 1 - p away from zero: if a probability saturates at 1, log(0) is
        # -inf, and for the true class (distance 0) inf * 0 would turn the loss into NaN.
        complement = (1.0 - probas).clamp_min(torch.finfo(probas.dtype).eps)
        err = -torch.log(complement) * distances
        loss = err.sum(dim=1).mean()
        return (loss, outputs) if return_outputs else loss

In [None]:
# Training arguments, collected in a dict grouped by concern and expanded below.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_config = {
    # output locations and reporting
    "output_dir": '/kaggle/working/trained_deberta_model',
    "logging_dir": './logs',
    "report_to": "none",
    # run length and batch shape (8 samples x 4 accumulation steps per device per optimizer step)
    "num_train_epochs": 1,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "gradient_accumulation_steps": 4,
    "gradient_checkpointing": True,
    # optimizer and learning-rate schedule
    "optim": "adamw_bnb_8bit",
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "warmup_ratio": 0.1,
    # logging, evaluation and checkpointing all happen every 100 steps
    "logging_steps": 100,
    "evaluation_strategy": "steps",
    "do_eval": True,
    "eval_steps": 100,
    "save_strategy": "steps",
    "save_steps": 100,
    "save_total_limit": 1,
    # best-checkpoint selection
    "load_best_model_at_end": True,
    "metric_for_best_model": "loss",
}
training_args = TrainingArguments(**training_config)

In [None]:
# Trainer with the custom ordinal loss; the held-out split doubles as the evaluation set
trainer = OLL2Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)

In [None]:
# Train and evaluate the model, logging train and validation loss
def train_and_evaluate(trainer):
    """Run one full training pass with ``trainer``, evaluate once, and print both losses.

    ``trainer.train()`` is called exactly once. It was previously called inside
    a loop over ``int(num_train_epochs)``, which started a complete training run
    on every iteration and ran nothing at all when the epoch count was below 1.
    The epoch count is read from ``trainer.args`` rather than from a notebook
    global, so the function only depends on its argument.

    :param trainer: object providing ``train()`` (whose result has a
        ``training_loss`` attribute), ``evaluate()`` (returning a mapping with
        an ``"eval_loss"`` entry) and ``args.num_train_epochs``
    :return: None; the losses are printed
    """
    train_result = trainer.train()
    eval_result = trainer.evaluate()
    train_loss = train_result.training_loss
    eval_loss = eval_result["eval_loss"]
    epochs_configured = trainer.args.num_train_epochs
    print(f"Training finished ({epochs_configured} epoch(s)): Training Loss: {train_loss}, Validation Loss: {eval_loss}")

# Run training and evaluation with the trainer configured above
train_and_evaluate(trainer)

In [None]:
from numba import jit
@jit(nopython=True)
def cpmp_qwk(a1, a2, max_rat=3) -> float:
    """
    An ultra-fast implementation of Quadratic Weighted Kappa (QWK)
    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133
    
    :param a1: The ground truth labels (non-negative integers)
    :param a2: The predicted labels (non-negative integers)
    :param max_rat: The maximum target value. The histograms always cover at
        least 0..max_rat and are widened when a label above max_rat is seen,
        so a too-small value no longer causes out-of-range indexing.
    
    return: A floating point number with the QWK score
    raises: ValueError if any label is negative
    """
    assert(len(a1) == len(a2))
    
    # Convert to numpy arrays with explicit dtype
    a1 = np.asarray(a1, dtype=np.int32)
    a2 = np.asarray(a2, dtype=np.int32)

    # Size the histograms from the data as well as from max_rat. The extra bins
    # hold zero counts and do not change the score, whereas a histogram that is
    # too small would be indexed out of range in the counting loop below.
    top = max_rat
    for k in range(a1.shape[0]):
        if a1[k] < 0 or a2[k] < 0:
            # A negative index would wrap around and be counted in the wrong bin
            raise ValueError("cpmp_qwk expects non-negative integer labels")
        if a1[k] > top:
            top = a1[k]
        if a2[k] > top:
            top = a2[k]

    hist1 = np.zeros((top + 1,), dtype=np.int32)
    hist2 = np.zeros((top + 1,), dtype=np.int32)

    # o: observed sum of squared differences between the two ratings
    o = 0.0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o += (i - j) * (i - j)

    # e: the same sum when the two rating histograms are paired independently
    e = 0.0
    for i in range(top + 1):
        for j in range(top + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return 1.0 - o / e

In [None]:
# Get predictions on the validation set
def evaluate_model(trainer, eval_dataloader):
    """Collect labels and argmax predictions for every batch of ``eval_dataloader``.

    The model is put into eval mode for the duration of the loop, so the
    predictions do not depend on the mode it was left in; the previous mode is
    restored afterwards, even if a batch raises.

    :param trainer: object providing ``model`` (called as ``model(**batch)`` and
        returning an object with ``logits``) and ``args.device``
    :param eval_dataloader: iterable of dict batches of tensors; each batch
        must contain a ``"labels"`` entry
    :return: ``(all_labels, all_preds)`` — two flat lists with one entry per sample
    """
    model = trainer.model
    device = trainer.args.device
    all_preds = []
    all_labels = []

    was_training = model.training
    model.eval()
    try:
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            # Predicted class = index of the largest logit
            preds = torch.argmax(outputs.logits, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())
    finally:
        # Put the model back into the mode the caller left it in
        model.train(was_training)
    return all_labels, all_preds

# Compute and print QWK score
eval_dataloader = trainer.get_eval_dataloader()
true_labels, pred_labels = evaluate_model(trainer, eval_dataloader)
# cpmp_qwk defaults to max_rat=3, which is smaller than this model's largest class
# id, so the maximum rating is passed explicitly. The label lists are converted to
# integer arrays first so the jitted function receives plain NumPy input.
max_class_id = trainer.model.config.num_labels - 1
qwk_score = cpmp_qwk(
    np.asarray(true_labels, dtype=np.int32),
    np.asarray(pred_labels, dtype=np.int32),
    max_rat=max_class_id,
)
print(f"Quadratic Weighted Kappa score: {qwk_score}")

In [None]:
# Save the tokenizer into the same directory that is used as the Trainer's output_dir.
# NOTE(review): only the tokenizer is written by this cell; no explicit model save
# appears in this notebook, so the weights are presumably only in the Trainer's own
# checkpoints — confirm that whatever loads this directory expects that layout.
tokenizer.save_pretrained('/kaggle/working/trained_deberta_model')