In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming an archive to disk.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries consumed by the
# download loop below. NOTE(review): the URL is a signed link carrying X-Goog-Date and
# X-Goog-Expires parameters, so it is time-limited; once it lapses the download fails and
# the loop reports "likely expired".
DATA_SOURCE_MAPPING = 'learning-agency-lab-automated-essay-scoring-2:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F71485%2F8059942%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240504%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240504T150225Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5ba605bc59f59a037e9031f1ce104e2cbb98da27cc85dcc109ad0dea4f4f221463cdf268d51a7396d60aa03db436c2f6bd2f0fefcdd461582876b013f6a11d7c27aba4eeff2ef8ecf78f04a7a71207f378d3cdd4ab6871ed3aa2441cc4af1b9fa756e6977fdc52f72fcc7b2937095b56d96892a3d7145afaffaf1fb30682802dd744c15854e3b1ae53572f00b56ab9f588fd339ce1f72fc1fd92a856c7f15b7df04715cd92adcf6d99988e8c1b79d2a9a8ab1fae3707ad36bfc60be9f8646aa0f2944d223519324fc7e89c9d6938b186a2dc9b025b8ec7528996e705e97e591e4470dd13d435800e386039ce77d5769aa7e3e8f4d72354c10e8ff638dd00f1db'

# Root directories the notebook recreates locally; each archive is unpacked under
# KAGGLE_INPUT_PATH/<directory>.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>. A source that fails
# to download or unpack is reported and skipped so the remaining ones are still tried.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only: the directory name comes first, everything after it
    # is the (encoded) URL, even if it happens to contain a literal ':'.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the progress bar stays empty
            # instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            # Named `chunk` (not `data`) so the loop does not clobber the notebook-level
            # `data` name used for the DataFrame later on.
            chunk = fileres.read(CHUNK_SIZE)
            while len(chunk) > 0:
                dl += len(chunk)
                tfile.write(chunk)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                chunk = fileres.read(CHUNK_SIZE)
            # tarfile.open() below reopens the file by name, so buffered bytes must be
            # on disk first or the archive may look truncated.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would rebind
                # the module and break the next tar source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading learning-agency-lab-automated-essay-scoring-2, 12464821 bytes compressed
Downloaded and uncompressed: learning-agency-lab-automated-essay-scoring-2
Data source import complete.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
from sklearn.metrics import cohen_kappa_score
import numpy as np

# **Data Loading and Exploration**

In [None]:
# Read the competition training set into a DataFrame and display it.
train_csv_path = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv'
data = pd.read_csv(train_csv_path)
data

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3
...,...,...,...
17302,ffd378d,"the story "" The Challenge of Exploing Venus "" ...",2
17303,ffddf1f,Technology has changed a lot of ways that we l...,4
17304,fff016d,If you don't like sitting around all day than ...,2
17305,fffb49b,"In ""The Challenge of Exporing Venus,"" the auth...",1


In [None]:
# Print column names, non-null counts, dtypes and memory usage of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   17307 non-null  object
 1   full_text  17307 non-null  object
 2   score      17307 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 405.8+ KB


In [None]:
# Number of essays per score value, most frequent first (shows the class imbalance).
data['score'].value_counts()

score
3    6280
2    4723
4    3926
1    1252
5     970
6     156
Name: count, dtype: int64

In [None]:
# Preprocessing: Remove essay_id column
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
data = data.drop(columns='essay_id', errors='ignore')

In [None]:
#data = data.iloc[:1000]
#data

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources requested by this notebook, in the same order as before.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Shared helpers used by preprocess_text: the English stopword set and a lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def preprocess_text(text):
    """Normalize an essay into a space-separated string of lemmas.

    The text is lowercased and tokenized; tokens that are not purely alphabetic
    or that appear in ``stop_words`` are discarded, and every remaining token is
    lemmatized with the module-level ``lemmatizer``.
    """
    kept_lemmas = []
    for word in word_tokenize(text.lower()):
        # Skip punctuation/numbers and stopwords before lemmatizing.
        if not word.isalpha() or word in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(kept_lemmas)

# Run preprocess_text over every essay and overwrite 'full_text' with the result.
cleaned_essays = data['full_text'].apply(preprocess_text)
data['full_text'] = cleaned_essays

In [None]:
# Display the frame again to inspect the preprocessed 'full_text' column.
data

Unnamed: 0,full_text,score
0,many people car live thing know use car alot t...,3
1,scientist nasa discussing face mar explaining ...,3
2,people always wish technology seen movie best ...,4
3,heard venus planet without almost oxygen earth...,4
4,dear state senator letter argue favor keeping ...,3
...,...,...
17302,story challenge exploing venus informative pie...,2
17303,technology changed lot way live today nowadays...,4
17304,like sitting around day great opportunity part...,2
17305,challenge exporing venus author suggests study...,1


In [None]:
# Split data: 80% train / 20% validation, stratified on score so both parts keep the
# same class proportions.
# random_state pins the shuffle so the split, and every metric computed from it, is
# reproducible across kernel restarts.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    stratify=data.score,
    random_state=42,
)

In [None]:
# Tokenizer for the DeBERTa-v3 xsmall checkpoint
tokenizer_checkpoint = "microsoft/deberta-v3-xsmall"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [None]:
# Custom Dataset class for handling text data
class EssayDataset(Dataset):
    """Map-style dataset that tokenizes one essay per item.

    Args:
        dataframe: frame with a 'full_text' column (essay text) and a 'score'
            column (integer label).
        tokenizer: callable used to encode the text. When omitted, the
            notebook-level ``tokenizer`` is looked up at item-access time, which
            is the original behaviour.
        max_length: length every sequence is truncated/padded to (default 1024,
            the previously hard-coded value).

    Each item is a dict with 1-D 'input_ids' and 'attention_mask' tensors and a
    scalar int64 'labels' tensor holding the raw score.
    """

    def __init__(self, dataframe, tokenizer=None, max_length=1024):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Fetch the row once instead of materializing it separately per column.
        row = self.data.iloc[idx]
        # Fall back to the notebook-level tokenizer when none was injected.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = encode(
            row['full_text'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would also
            # collapse the sequence axis whenever it has length 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(int(row['score']), dtype=torch.long)
        }

In [None]:
# Wrap the training and validation frames in EssayDataset objects
train_dataset, val_dataset = (
    EssayDataset(split_frame) for split_frame in (train_data, val_data)
)

In [None]:
# Batch both datasets; only the training loader shuffles its samples.
loader_batch_size = 8
train_loader = DataLoader(dataset=train_dataset, batch_size=loader_batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=loader_batch_size)

In [None]:
# Load the pretrained checkpoint with a 6-way sequence-classification head
from transformers import AutoModel
num_score_classes = 6
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-xsmall",
    num_labels=num_score_classes,
)

pytorch_model.bin:   0%|          | 0.00/241M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Cross-entropy loss (used for the validation loss) and an AdamW optimizer over all
# model parameters
learning_rate = 2e-5
loss_fct = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [None]:
# Fine-tune the classifier for one epoch with gradient accumulation, then evaluate on
# the validation loader (mean loss, weighted F1, confusion matrix).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

accumulation_steps = 4  # Accumulate gradients over 4 steps
# Batches per epoch. `step` counts batches, so this is the value it reaches on the last
# one; multiplying by accumulation_steps (as before) made the "last batch" check below
# unreachable and silently dropped the gradients of a trailing partial window.
total_steps = len(train_loader)

for epoch in range(1):
    model.train()
    optimizer.zero_grad()

    for step, batch in enumerate(train_loader, start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Dataset labels are the raw scores (1-based); the classifier expects 0-based ids.
        mapped_labels = labels - 1

        outputs = model(input_ids, attention_mask=attention_mask, labels=mapped_labels)
        loss = outputs.loss

        # Perform backpropagation for gradient accumulation
        loss = loss / accumulation_steps
        loss.backward()

        # Update every `accumulation_steps` batches, and once more on the final batch
        # so a leftover partial window is still applied.
        if step % accumulation_steps == 0 or step == total_steps:
            optimizer.step()
            optimizer.zero_grad()

    # Validation
    model.eval()
    val_losses = []
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            mapped_labels = labels - 1

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            val_loss = loss_fct(logits, mapped_labels)

            val_losses.append(val_loss.item())
            val_preds.extend(logits.argmax(dim=1).cpu().tolist())
            val_labels.extend(mapped_labels.cpu().tolist())

    # Calculate validation metrics
    val_loss_mean = np.mean(val_losses)
    val_f1 = f1_score(val_labels, val_preds, average='weighted')
    val_conf_matrix = confusion_matrix(val_labels, val_preds)

    print(f"Epoch {epoch + 1} | Validation Loss: {val_loss_mean:.4f} | Validation F1 Score: {val_f1:.4f}")
    print("Confusion Matrix:")
    print(val_conf_matrix)

Epoch 1 | Validation Loss: 1.0204 | Validation F1 Score: 0.5092
Confusion Matrix:
[[  0 228  21   2   0   0]
 [  0 790 149   5   1   0]
 [  0 450 730  74   2   0]
 [  0  21 421 328  15   0]
 [  0   0  14 139  41   0]
 [  0   0   0   6  25   0]]
