In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install sentencepiece

In [None]:
import torch 
import numpy as np
from transformers import AutoTokenizer, AutoModel 
import pandas as pd 
import matplotlib.pyplot as plt
import os 
import gc 
import re 
import torch.nn.functional as F
from tqdm import tqdm
import string
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim

In [None]:
# Read the competition training set (essay text plus score) into a DataFrame
TRAIN_CSV_PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"
train = pd.read_csv(TRAIN_CSV_PATH)

# Quick look: the first five rows, then the (rows, columns) shape
print(train.head())
print(train.shape)

In [None]:
# Load the competition test set. This cell previously re-read train.csv, so `test`
# was just a second copy of the training data.
test = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv")
print(test.head(5))
print(test.shape)

In [None]:
num_classes = 6

# Assumes scores are the integers 1..num_classes (the training labels are shifted by -1
# to obtain class ids later on) - values outside that range would not be drawn.
# Half-integer bin edges give exactly one bar per score, centred on its tick;
# bins=num_classes instead cut [min, max] into equal slices that do not line up with the scores.
score_bin_edges = np.arange(0.5, num_classes + 1.5, 1.0)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(train['score'], bins=score_bin_edges, edgecolor='black')
ax.set_xticks(range(1, num_classes + 1))
ax.set_title('Distribution of Scores')
ax.set_xlabel('Score')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold
# Give every essay a validation-fold id, keeping the score distribution similar in each fold
folds = 16
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1000)

# -1 is a placeholder; each row is overwritten once, by the fold whose validation split contains it
train["fold"] = -1
fold_splits = skf.split(train, train["score"])
for fold, (train_index, val_index) in enumerate(fold_splits):
    train.loc[val_index, "fold"] = fold

In [None]:
# Locate the essay with the most whitespace-separated words
max_len = 0
max_indx = 0
for position, essay in enumerate(train.full_text.values):
    word_count = len(essay.split())
    # Strict '>' keeps the earliest essay when several share the maximum length
    if word_count > max_len:
        max_len, max_indx = word_count, position

print(f"Max length by whitespace splitting: {max_len}")
print("-------------LARGEST ESSAY-------------------")
print(train.full_text.values[max_indx])

In [None]:
def process(text):
    """Normalise an essay: drop punctuation and collapse whitespace runs.

    Newlines and tabs become spaces, every character in ``string.punctuation``
    is removed, runs of spaces are collapsed to a single space, and leading or
    trailing whitespace is stripped.

    Args:
        text: The raw essay string.

    Returns:
        The cleaned string.
    """
    # One translation pass: line breaks/tabs -> space, punctuation -> deleted (mapped to None)
    char_map = {'\n': ' ', '\t': ' '}
    char_map.update(dict.fromkeys(string.punctuation))
    cleaned = text.translate(str.maketrans(char_map))
    return re.sub(' +', ' ', cleaned).strip()
# Clean every training essay in place with `process` (element-wise over the column)
train['full_text'] = train['full_text'].map(process)

In [None]:
class EmbedDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes essays on demand, all padded to one common length.

    The common length ``max_len`` is the largest number of tokenizer tokens
    (special tokens included) produced by any essay in ``data``, so no essay is
    truncated when it is later encoded with ``max_length=self.max_len``.

    Args:
        data: DataFrame with a ``full_text`` column; its index is reset so rows
            can be addressed by position.
        tokenizer: Callable tokenizer. It is called as
            ``tokenizer(text, add_special_tokens=True)`` to measure lengths and
            must return a mapping with an ``input_ids`` entry.
    """

    def __init__(self, data, tokenizer):
        self.data = data.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = 0
        for text in self.data["full_text"]:
            encoded = tokenizer(text, add_special_tokens=True)
            # Measure in tokenizer tokens, not whitespace words: the previous code
            # overwrote the tokenizer output with text.split(), which under-counted
            # the length and caused truncation in __getitem__.
            self.max_len = max(len(encoded["input_ids"]), self.max_len)

    def __len__(self):
        """Return the number of essays."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize essay ``idx`` and return a dict of tensors of length ``max_len``."""
        text = self.data.loc[idx, "full_text"]
        tokens = self.tokenizer(
            text,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch axis of size 1; drop it so the
        # DataLoader can stack samples itself.
        tokens = {k: v.squeeze(0) for k, v in tokens.items()}
        return tokens

In [None]:
def word_to_sentence_embeddings(preds, attention):
    """Mean-pool token embeddings into one vector per sequence, ignoring padding.

    Args:
        preds: Model output exposing ``last_hidden_state``; it is detached and
            moved to the CPU before pooling.
        attention: Attention mask with one entry per token (non-zero = real
            token). It is broadcast over the hidden dimension, so it must match
            the leading dimensions of ``last_hidden_state`` and be on the CPU.

    Returns:
        A tensor with the sequence axis (dim 1) averaged out.
    """
    hidden_states = preds.last_hidden_state.detach().cpu()
    token_mask = attention.unsqueeze(-1).expand(hidden_states.size()).float()

    masked_sum = torch.sum(hidden_states * token_mask, 1)
    # Clamp the token count so a fully-masked sequence does not divide by zero
    token_count = torch.clamp(token_mask.sum(1), min=1e-9)
    return masked_sum / token_count

In [None]:
def compute_embeddings(model, dataloader, device=None):
    """Compute one L2-normalised, mean-pooled embedding per sample in ``dataloader``.

    Args:
        model: Encoder called as ``model(input_ids=..., attention_mask=...)`` whose
            output exposes ``last_hidden_state``. It is switched to eval mode.
        dataloader: Iterable of batches, each a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        device: Device the batches are moved to. Defaults to the device of the
            model's parameters (falling back to CUDA if available, else CPU), so
            inputs always land where the model is.

    Returns:
        A NumPy array with one embedding row per sample, in dataloader order.
        An empty dataloader yields an empty array.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device
        else:
            device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    use_amp = device.type == 'cuda'  # mixed precision only applies on CUDA

    # Inference only: make sure dropout and similar layers are disabled
    model.eval()

    train_sentence_embeddings = []
    for batch in tqdm(dataloader, "computing word embeddings"):
        real_input = batch['input_ids'].to(device)
        attention_masks = batch['attention_mask'].to(device)

        with torch.no_grad():  # reduces memory usage by not computing gradients at all
            with torch.cuda.amp.autocast(enabled=use_amp):  # mixed precision
                preds = model(input_ids=real_input, attention_mask=attention_masks)
        pool = word_to_sentence_embeddings(preds, attention_masks.detach().cpu())
        sentence_embeddings = F.normalize(pool, p=2, dim=1)
        # Keep the batch axis: squeezing it turned a batch of one sample into a flat
        # vector, so extend() appended individual floats instead of one row.
        train_sentence_embeddings.extend(sentence_embeddings.detach().cpu().numpy())

        # Drop device tensors each iteration; safe even if the loader is empty
        del preds, real_input, attention_masks

    train_sentence_embeddings = np.array(train_sentence_embeddings)

    gc.collect()
    torch.cuda.empty_cache()

    return train_sentence_embeddings

In [None]:
# Load the precomputed sentence embeddings for the train and test essays from disk
train_sentence_embeddings, test_sentence_embeddings = (
    np.load(embedding_path)
    for embedding_path in (
        '/kaggle/input/embeddings-for-aes/train_sentence_embeddings.npy',
        '/kaggle/input/embeddings-for-aes/test_sentence_embeddings.npy',
    )
)

In [None]:
# Feature blocks joined column-wise. Only one embedding source is used here; further
# arrays with the same number of rows could be added to this list.
train_feature_blocks = [train_sentence_embeddings]
all_train_embeds = np.concatenate(train_feature_blocks, axis=1)

In [None]:
from sklearn.metrics import cohen_kappa_score

def comp_score(y_true, y_pred):
    """Return the quadratic-weighted Cohen's kappa between true and predicted labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, on the same scale as ``y_true``.

    Returns:
        The value computed by ``cohen_kappa_score(..., weights='quadratic')``.
    """
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

class AESModel(nn.Module):
    """Multilayer perceptron that maps a sentence embedding to class logits.

    The network has five ReLU-activated hidden layers followed by a linear
    output layer. Per the original notebook, the input is a sentence embedding
    obtained by averaging the word embeddings of a DeBERTa model.
    """

    def __init__(self, input_size, hidden_sizes, num_classes=6):
        '''
        Args:
            input_size: Dimensionality of the input embedding.
            hidden_sizes: Sequence of integers; the first five entries are the
                widths of the five hidden layers (extra entries are ignored).
            num_classes: Number of output logits. Defaults to 6, the value that
                was previously hard-coded.
        '''
        super(AESModel, self).__init__()
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.num_classes = num_classes

        # Attribute names dense1..dense6 are kept so existing state_dicts still load
        self.dense1 = nn.Linear(input_size, hidden_sizes[0])
        self.activation = nn.ReLU()
        self.dense2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.dense3 = nn.Linear(hidden_sizes[1], hidden_sizes[2])
        self.dense4 = nn.Linear(hidden_sizes[2], hidden_sizes[3])
        self.dense5 = nn.Linear(hidden_sizes[3], hidden_sizes[4])
        self.dense6 = nn.Linear(hidden_sizes[4], num_classes)

    def forward(self, x):
        """Return raw (unnormalised) logits with ``num_classes`` entries per input row."""
        hidden_layers = (self.dense1, self.dense2, self.dense3, self.dense4, self.dense5)
        for layer in hidden_layers:
            x = self.activation(layer(x))
        # No activation on the output: the logits go straight to the loss / argmax
        return self.dense6(x)

def train_model(model, criterion, optimizer, train_loader, num_epochs, x_valid, y_valid):
    """Train ``model`` for ``num_epochs`` epochs, reporting loss, accuracy and QWK.

    After every epoch the model is evaluated on ``x_valid``; the predicted class
    indices are shifted by +1 before being compared with ``y_valid`` through
    ``comp_score``, so ``y_valid`` must be on the same scale as ``prediction + 1``.

    Args:
        model: Network to train; batches are moved to the device of its parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepping the model's parameters.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        num_epochs: Number of passes over ``train_loader``.
        x_valid: Validation inputs as a single tensor.
        y_valid: Validation targets passed to ``comp_score``.

    Returns:
        The trained model (the same object that was passed in).
    """
    # Take the device from the model itself rather than from a notebook-level global,
    # so the function works no matter which cell defined `device` (or whether one did).
    model_device = next(model.parameters()).device
    # The validation inputs never change, so move them once instead of every epoch
    x_valid = x_valid.to(model_device)

    pbar = tqdm(range(num_epochs))
    for epoch in pbar:
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(model_device), labels.to(model_device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()
            # loss is a batch mean; weight it by batch size to average per sample later
            running_loss += loss.item() * inputs.size(0)

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Guard against an empty loader, which previously raised ZeroDivisionError
        train_accuracy = correct / total * 100 if total else 0.0
        epoch_loss = running_loss / total if total else 0.0

        model.eval()
        with torch.no_grad():
            preds = torch.argmax(model(x_valid), dim=1)
            score = comp_score(y_valid, (preds + 1).cpu())

        pbar.set_description(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f} => QWK score: {score:.4f} => Accuracy: {train_accuracy:.2f}%")
    return model

# Note: 'comp_score' must be defined before train_model is called; the device is taken from the model's own parameters.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader

# Check if CUDA is available and set device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The embeddings are matched to `train` purely by row position, so the two must line up
assert len(all_train_embeds) == len(train), "embeddings and train rows are misaligned"

# Hyperparameters
input_size = all_train_embeds.shape[1]  # Input vector dimensionality, read from the loaded embeddings
# NOTE(review): hidden_size1..3 and output_size are not referenced in the visible cells
# (the model is built from a `hidden_sizes` list instead); kept for compatibility.
hidden_size1 = 3200     # Size of the first hidden layer
hidden_size2 = 3200     # Size of the second hidden layer
hidden_size3 = 1600     # Size of the third hidden layer
output_size = 6         # Number of output classes
learning_rate = 0.01   # Learning rate
num_epochs = 300         # Number of training epochs
batch_size = 128        # Batch size

# Hold out 10% of the rows for validation
indices = np.arange(len(all_train_embeds))
train_indices, valid_indices = train_test_split(indices, test_size=0.1, random_state=1000)

# The split indices are positions, so select scores positionally as well; label-based
# .loc only gave the same rows while `train` kept its default 0..n-1 index.
all_scores = train['score'].to_numpy()

X_train = all_train_embeds[train_indices]
y_train = all_scores[train_indices]
X_valid = all_train_embeds[valid_indices]
y_valid = all_scores[valid_indices]

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
# Training labels are shifted by -1 to become zero-based class ids
y_train_tensor = torch.tensor(y_train - 1, dtype=torch.long)
X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32)
# Validation labels stay on the original score scale: train_model adds 1 to its predictions before scoring
y_valid_tensor = torch.tensor(y_valid, dtype=torch.long)

train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

In [None]:
# Seed PyTorch so weight initialisation and DataLoader shuffling repeat across
# Restart & Run All (GPU kernels may still introduce small run-to-run differences).
torch.manual_seed(1000)

hidden_sizes = [3200, 3200, 1600, 800, 128]  # widths of the five hidden layers
model = AESModel(input_size, hidden_sizes)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

model = train_model(model, criterion, optimizer, train_loader, num_epochs, X_valid_tensor, y_valid_tensor)

In [None]:
# Build the test feature matrix the same way as the training one (single embedding block)
all_test_embeds = np.concatenate([test_sentence_embeddings], axis=1)
all_test_embeds_tensor = torch.tensor(all_test_embeds, dtype=torch.float32).to(device)

# Predict the most likely class for every test embedding, without tracking gradients
model.eval()
with torch.no_grad():
    test_logits = model(all_test_embeds_tensor)
    test_preds = test_logits.argmax(dim=1).cpu().numpy()

# Add 1 to the class ids, mirroring the -1 shift applied to the training labels
all_preds = [test_preds + 1]

# Fill the sample submission with the predicted scores and write it out
sub = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv")
sub["score"] = all_preds[0]
sub.score = sub.score.astype('int32')
sub.to_csv("/kaggle/working/submission.csv", index=False)

sub.head()