In [17]:
!pip install transformers
!pip install accelerate
# TODO: check lr scheduler



In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score as kappa
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Tokenizer, GPT2Model
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import os
import time
import pathlib

# log folder to save log files
log_folder = '/content/drive/MyDrive/asap/'

# target column
target_column = "score"

# hyper parameters
hp = {
    "base_model": "gpt2",
    "lr": 1e-4,
    "num_epochs": 30,
    "batch_size":1,
    "use_amp": True,
    "mixed_precision": "fp16",
}

# Prepare ASAP Dataset

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# Original kaggle training set
kaggle_dataset  = pd.read_csv('/content/drive/MyDrive/asap-aes/training_set_rel3.tsv', sep='\t', encoding = "ISO-8859-1")

# Smaller training set used for this project
dataset_df = pd.DataFrame(
  {
    'essay_id' : kaggle_dataset['essay_id'],
    'essay_set' : kaggle_dataset['essay_set'],
    'essay' : kaggle_dataset['essay'],
    'rater1' : kaggle_dataset['rater1_domain1'],
    'rater2' : kaggle_dataset['rater2_domain1'],
    'score' : kaggle_dataset['domain1_score']
  })
dataset_df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1,rater2,score
0,1,1,"Dear local newspaper, I think effects computer...",4,4,8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,10
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,8


## Use essay_set=7 for classification

In [28]:
essay_df = dataset_df[dataset_df['essay_set'] == 7].copy()
essay_df.shape

# essay_df = dataset_df.loc[(dataset_df['essay_set'] == 3) | (dataset_df['essay_set'] == 4) | (dataset_df['essay_set'] == 5) | (dataset_df['essay_set'] == 6)].copy()
# essay_df.shape

(1569, 6)

In [29]:
essay_df['score'].value_counts()

score
16    199
17    160
18    118
14    105
20     99
24     96
19     88
12     86
15     85
13     82
21     68
22     62
11     56
10     55
23     53
8      50
9      49
7      28
6      20
4       4
5       4
2       1
3       1
Name: count, dtype: int64

In [30]:
#Map the classes to new values in order.
unique_scores = sorted(essay_df['score'].unique())
mapping = {score: i+1 for i, score in enumerate(unique_scores)}

#Update the dataset
essay_df['score'] = essay_df['score'].map(mapping)

# Check the recount.
print(essay_df['score'].value_counts().sort_index())


score
1       1
2       1
3       4
4       4
5      20
6      28
7      50
8      49
9      55
10     56
11     86
12     82
13    105
14     85
15    199
16    160
17    118
18     88
19     99
20     68
21     62
22     53
23     96
Name: count, dtype: int64


In [31]:
!pip install googletrans==4.0.0-rc1




In [32]:
from googletrans import Translator
import pandas as pd

def back_translate(text):
    translator = Translator()
    target_language = 'tr'

    try:
        translated_text = translator.translate(text, dest=target_language).text
        back_translated_text = translator.translate(translated_text, dest='en').text
        return back_translated_text
    except Exception as e:
        print(f"Çeviri hatası: {e}")
        return text  # Hata durumunda orijinal metni kullanın


def augment_data(df, column, value, fraction):
    subset = df[df[column] == value]
    augmented_subset = subset.sample(frac=fraction, replace=True)

    new_rows = []
    max_essay_id = df['essay_id'].max()

    for _, row in augmented_subset.iterrows():
        max_essay_id += 1
        new_rows.append({
            'essay_id': max_essay_id,
            'essay_set': row['essay_set'],
            'essay': back_translate(row['essay']),
            'rater1': row['rater1'],
            'rater2': row['rater2'],
            'score': value
        })

    return pd.DataFrame(new_rows)

# essay_df üzerine veri artırma uygulama
classes_to_augment = [1, 2, 3, 4, 5, 6, 8]
augmented_dataframes = []

for value in classes_to_augment:
    augmented_dataframes.append(augment_data(essay_df, 'score', value, 1))

augmented_df = pd.concat(augmented_dataframes, ignore_index=True)

#Artırılmış veriyi orijinal veriye ekleyin
essay_df = pd.concat([essay_df, augmented_df], ignore_index=True)

print(f"Total data size after augmentation: {len(essay_df)}")

Çeviri hatası: 'NoneType' object is not iterable
Total data size after augmentation: 1676


In [33]:
essay_df['score'].value_counts()


score
15    199
16    160
17    118
13    105
19     99
8      98
23     96
18     88
11     86
14     85
12     82
20     68
21     62
10     56
6      56
9      55
22     53
7      50
5      40
3       8
4       8
1       2
2       2
Name: count, dtype: int64

In [34]:
# 6'dan düşük skorları 6'ya yükseltme
essay_df['score'] = essay_df['score'].apply(lambda x: 5 if x < 5 else x)
# Yeni frekansları hesaplama
new_frequencies = essay_df['score'].value_counts().sort_index()

# Sonuçları gösterme
print(new_frequencies)

score
5      60
6      56
7      50
8      98
9      55
10     56
11     86
12     82
13    105
14     85
15    199
16    160
17    118
18     88
19     99
20     68
21     62
22     53
23     96
Name: count, dtype: int64


In [35]:
from sklearn.preprocessing import StandardScaler
essay_df['target_score'] = essay_df[target_column] #- essay_df[target_column].min()

#Standardize the data
scaler = StandardScaler()
essay_df[['score']] = scaler.fit_transform(essay_df[['score']])


In [36]:
from sklearn.model_selection import train_test_split

essay_df['target_score'] = essay_df[target_column] - essay_df[target_column].min()

X, y = essay_df['essay'].to_list(), essay_df['target_score'].to_numpy()
num_labels = essay_df[target_column].unique().size

# 60 / 40 train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42) # stratify=y, this paramter will not work if any class has number of examples lower than 2

# split test to half to get 60 / 20 / 20 split

X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.50, random_state=42) # stratify=y_test

In [37]:
import torch
from torch.utils.data import Dataset

class EssayDataset(Dataset):
    def __init__(self, essays, targets, tokenizer, device):
        self.essays = essays
        self.targets = targets
        self.tokenizer = tokenizer
        self.device = device

    def __len__(self):
        return len(self.essays)

    def __getitem__(self, idx):
        text = str(self.essays[idx])
        # Ensure that the tokenizer and device are used correctly
        encoded_input = self.tokenizer(text, truncation=True, return_tensors='pt')
        encoded_input = {key: val.to(self.device) for key, val in encoded_input.items()}

        # It's better to ensure the target is a tensor and on the correct device
        target = torch.tensor(self.targets[idx], dtype=torch.long).to(self.device)

        return encoded_input['input_ids'].squeeze(), encoded_input['attention_mask'].squeeze(), target


In [38]:
# collater function to pad tokens
def collate_fn(batch):
    PAD_TOKEN_ID = 50256 # Use tokenizer.pad_token_id to check
    input_ids_list, attention_mask_list, targets = [], [], []

    for input_ids, attention_mask, target in batch:
        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        targets.append(target)

    # Pad the batch to the maximum sequence length within that batch using the tokenizer's pad token
    max_length = max(len(ids) for ids in input_ids_list)
    padded_input_ids = []
    padded_attention_mask = []

    for input_ids, attention_mask in zip(input_ids_list, attention_mask_list):
        pad_length = max_length - len(input_ids)
        padded_input_ids.append(torch.cat([input_ids, torch.tensor([PAD_TOKEN_ID] * pad_length, device=device, dtype=torch.long)]))
        # add zeros to attention mask for padds
        padded_attention_mask.append(torch.cat([attention_mask, torch.zeros(pad_length, dtype=torch.long, device=device)]))

    return torch.stack(padded_input_ids), torch.stack(padded_attention_mask), torch.tensor(targets)

In [39]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained(hp['base_model'])
tokenizer.pad_token_id = tokenizer.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [40]:
from transformers import GPT2ForSequenceClassification

class ClassifierLayer(torch.nn.Module):

  def __init__(self, input_size, output_size, bias=False):
    super(ClassifierLayer, self).__init__()

    self.dropout = torch.nn.Dropout(0.1)
    self.linear = torch.nn.Linear(input_size, output_size, bias=bias)

  def forward(self, x):
    inputs = self.dropout(x)
    return self.linear(inputs)

class GPT2Classification(GPT2ForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.score = ClassifierLayer(config.n_embd, self.num_labels, bias=False)

        self.post_init()

In [41]:
from accelerate import Accelerator

# use fp16 mixed precision to improve training speed
accelerator = Accelerator(mixed_precision=hp['mixed_precision']) # fp16
device = accelerator.device

model = GPT2Classification.from_pretrained(hp['base_model'], num_labels=num_labels)
#model = GPT2ForSequenceClassification.from_pretrained(hp['base_model'], num_labels=num_labels)
model.to(device)
tokenizer.pad_token_id = tokenizer.eos_token_id
# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2Classification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
from sklearn.metrics import classification_report
# train loop

def train_loop(model, train_loader, val_loader, loss_fct, optimizer, lr_scheduler, progress_bar, log_file_handler, logging_step=1, use_amp=False):
    samples = 0.
    cumulative_loss = 0.

    # set model to train mode
    model.train()

    for step, (inputs, attention_masks, targets) in enumerate(train_loader):
        targets = targets.reshape(-1, 1).to(device)
        attention_masks = attention_masks.to(device)
        outputs = model(inputs, attention_mask=attention_masks)
        loss = loss_fct(outputs["logits"].view(-1, model.num_labels), targets.view(-1))
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        samples += inputs.shape[0]
        cumulative_loss += loss.item()

        if step % logging_step == 0:
            # calculate cls_report on test set

            with torch.no_grad():
                test_loss, test_preds, cls_report = test_loop(model, val_loader, loss_fct, use_amp=use_amp)
            model.train()
            log_str = "Step: {:<6} \t Train loss: {:<6.4f} \t Validation loss: {:<6.4f}".format(step, (cumulative_loss/samples), test_loss)
            # Adding f1 accuracy recall prec metics
            log_str += "\n" + cls_report
            print(log_str)
            log_file_handler.write(log_str + "\n")
            samples = 0
            cumulative_loss = 0

    return cumulative_loss/samples if samples != 0 else float("inf")

def test_loop(model, test_loader, loss_fct, use_amp=False, show_progression=False):
    samples = 0.
    cumulative_loss = 0.
    preds = []
    labels = []

    # set model to eval mode
    model.eval()

    loop_iterator = enumerate(tqdm(test_loader)) if show_progression else enumerate(test_loader)

    with torch.no_grad():
        for step, (inputs, attention_masks, targets) in loop_iterator:
            targets = targets.reshape(-1, 1).to(device)
            inputs = inputs.to(device)
            attention_masks = attention_masks.to(device)
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(inputs, attention_mask=attention_masks)
            loss = loss_fct(outputs["logits"].view(-1, model.num_labels), targets.view(-1))

            samples += inputs.shape[0]
            cumulative_loss += loss.item()

            probs = outputs['logits'].softmax(-1) # probs
            predictions = probs.argmax(-1) # predicted classes

            labels.extend(targets.tolist())
            preds.extend(predictions.tolist())

        cls_report = classification_report(labels, preds, zero_division=0)

    return cumulative_loss/samples if samples != 0 else float("inf"), np.asarray(preds, dtype=np.float32), cls_report



In [44]:
from tqdm.auto  import tqdm
from transformers import get_scheduler
from torch.optim import AdamW
from sklearn.utils import class_weight

lr = hp['lr']
num_epochs = hp['num_epochs']
batch_size = hp['batch_size']
use_amp = hp['use_amp']

# Assuming that 'accelerator' is already imported and initialized
device = accelerator.device  # Using the device associated with the accelerator

# Assuming that X_train, y_train, tokenizer are already defined and properly set up
# Create training dataset and DataLoader
train_dataset = EssayDataset(X_train, y_train, tokenizer, device)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
num_training_steps = num_epochs * len(train_loader)

# create test data loader
test_dataset = EssayDataset(X_test, y_test, tokenizer, device)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# create val data loader
val_dataset = EssayDataset(X_val, y_val, tokenizer, device)
val_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# # get class weights
# class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

loss_fct = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=lr)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer,
    num_warmup_steps=0, num_training_steps=num_training_steps
)
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# use accelerator prepare

# removed val_loader from prepare command
model, optimizer, train_loader, test_loader, val_loader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_loader, test_loader, val_loader, lr_scheduler
)

In [45]:
def open_log_file(log_folder, essay_df, model, label_column: str, hyper_parameters):
    # using time as a file name for logging

    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = timestr + '.log'

    # check if folder exists, create if it isn't
    pathlib.Path(log_folder).mkdir(exist_ok=True, parents=True)

    # open file to log results
    log_file = os.path.join(log_folder, file_name)
    fp = open(log_file, "a")

    fp.write("Log time: " + timestr + "\n")
    fp.write("Essay classes: " + str(essay_df['essay_set'].unique()) + "\n")
    fp.write("Using score column: " + label_column + "\n")
    fp.write("Score distribution: " + "\n" + essay_df[label_column].value_counts().to_string() + "\n")

    fp.write("\n--- Model parameters:\n")
    fp.write(str(model))
    fp.write('\n')

    fp.write("\n--- Hyper parameters:\n")
    for k, v in hyper_parameters.items():
        fp.write(f" {k:<25}: {v}\n")

    fp.write('\n')
    fp.flush()
    return fp


In [None]:
!nvidia-smi


In [46]:
import torch
print(torch.cuda.is_available())


True


In [None]:
# Start logging to a file
fp = open_log_file(log_folder, essay_df, model, label_column=target_column, hyper_parameters=hp)

try:
    # Start training
    progress_bar = tqdm(range(num_training_steps))

    model.train()
    with accelerator.autocast():
        fp.write("Training logs: \n\n")
        for epoch in range(num_epochs):
            train_loss = train_loop(model, train_loader, val_loader, loss_fct, optimizer, lr_scheduler, progress_bar, fp, logging_step=len(train_loader)//3, use_amp=hp['use_amp'])
            with torch.no_grad():
                test_loss, test_preds, cls_report = test_loop(model, test_loader, loss_fct)
                log_string = "Epoch: {:<6}\t Test  loss: {:<6.4f}".format(epoch+1, test_loss)
                log_string += "\n" + cls_report
                print(log_string)
                fp.write(log_string + "\n")
finally:
    print("Log file closed.")
    fp.close()

  0%|          | 0/30150 [00:00<?, ?it/s]

Step: 0      	 Train loss: 0.1755 	 Validation loss: 9.1330
              precision    recall  f1-score   support

           0       0.13      0.18      0.15        66
           1       0.00      0.00      0.00        92
           2       0.00      0.00      0.00       129
           3       0.00      0.00      0.00        48
           9       0.00      0.00      0.00         0

    accuracy                           0.04       335
   macro avg       0.03      0.04      0.03       335
weighted avg       0.03      0.04      0.03       335

Step: 335    	 Train loss: 4.2653 	 Validation loss: 2.6919
              precision    recall  f1-score   support

           0       0.22      0.77      0.34        66
           1       0.29      0.07      0.11        92
           2       0.67      0.02      0.03       129
           3       0.23      0.19      0.21        48
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10

In [None]:
model