In [None]:
!nvidia-smi

Sat Mar  2 19:24:26 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Imports

In [1]:
!pip install -q accelerate -U
!pip install -q simpletransformers

In [2]:
from google.colab import drive

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
cd '/content/drive/My Drive/Colab Notebooks/NLP'

/content/drive/My Drive/Colab Notebooks/NLP


In [5]:
import os
import wandb

# Disable wandb authorization request
os.environ["WANDB_START_METHOD"]="thread"
wandb.init(mode="disabled")



In [6]:
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from ast import literal_eval
import numpy as np
from dont_patronize_me import DontPatronizeMe
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
import torch.functional as F
import tqdm

In [7]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

device = 'cuda' if cuda_available else 'cpu'

print('Cuda available?', cuda_available)

Cuda available? True


In [8]:
max_length = 192
batch_size = 32
random_seed = 42

# Loading mode

In [9]:
LOADING_MODE = 'k' # mode can be 'c', 'k', 'ck', ''

# Data loading

In [10]:
def get_rows(dataframe, data):
    rows = [] # will contain par_id, label and text
    for idx in range(len(dataframe)):
        parid = dataframe.par_id[idx]
        instance = data.loc[data.par_id == parid]
        keyword = instance.keyword.values[0]
        country = instance.country.values[0]
        text = instance.text.values[0]
        if LOADING_MODE == 'c':
            text = country + ' | ' + text
        elif LOADING_MODE == 'k':
            text = keyword + ' | ' + text
        elif LOADING_MODE == 'ck':
            text = country + ' | ' + keyword + ' | ' + text
        rows.append({
            'par_id':parid,
            'text':text,
            'label':instance.label.values[0]
        })
    return rows


def load_data(train_size=0.8, random_state=random_seed):
    dpm = DontPatronizeMe('./data', './data')
    dpm.load_task1()
    trids = pd.read_csv('data/train_semeval_parids-labels.csv')
    teids = pd.read_csv('data/dev_semeval_parids-labels.csv')
    trids.par_id = trids.par_id.astype(str)
    teids.par_id = teids.par_id.astype(str)
    data = dpm.train_task1_df

    rows_train_val = get_rows(trids, data)
    rows_test = get_rows(teids, data)

    rows_train, rows_val = train_test_split(rows_train_val, train_size=train_size, random_state=random_state)

    print(len(rows_train), len(rows_val), len(rows_test))

    return pd.DataFrame(rows_train), pd.DataFrame(rows_val), pd.DataFrame(rows_test)


train_dataset_raw, eval_dataset_raw, test_dataset_raw = load_data(train_size=0.8)

6700 1675 2094


# Encode data into a Dataset

In [11]:
# Load the DeBERTa tokenizer
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

max_length = 192
batch_size = 32

train_text = train_dataset_raw.text.values
eval_text = eval_dataset_raw.text.values
test_text = test_dataset_raw.text.values

encoding_train = tokenizer(train_text.tolist(), return_tensors='pt', padding="max_length", truncation=True, max_length=max_length)
encoding_eval = tokenizer(eval_text.tolist(), return_tensors='pt', padding="max_length", truncation=True, max_length=max_length)
encoding_test = tokenizer(test_text.tolist(), return_tensors='pt', padding="max_length", truncation=True, max_length=max_length)

encoding_train['label'] = torch.tensor([[0,1] if x == 1 else [1,0] for x in train_dataset_raw['label'].tolist()], dtype=torch.float32)
encoding_eval['label'] = torch.tensor([[0,1] if x == 1 else [1,0] for x in eval_dataset_raw['label'].tolist()], dtype=torch.float32)
encoding_test['label'] = torch.tensor([[0,1] if x == 1 else [1,0] for x in test_dataset_raw['label'].tolist()], dtype=torch.float32)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
class CustomDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        train_input_ids = self.encodings.input_ids[idx]
        train_token_type_ids = self.encodings.token_type_ids[idx]
        train_attention_mask = self.encodings.attention_mask[idx]
        train_labels = self.encodings.label[idx]
        return {
            'input_ids': train_input_ids,
            'token_type_ids': train_token_type_ids,
            'attention_mask': train_attention_mask,
            'labels': train_labels
        }

    def __len__(self):
        return len(self.encodings.input_ids)

In [13]:
# Create an instance of the CustomDataset class
train_dataset = CustomDataset(encoding_train)
eval_dataset = CustomDataset(encoding_eval)
test_dataset = CustomDataset(encoding_test)

# Model loading

In [14]:
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2)
model = model.to(device)

  return self.fget.__get__(instance, owner)()
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
epochs = 10
lr = 1e-5

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.argmax(labels, axis=-1)
    f1 = f1_score(labels, predictions, average='binary')
    return {'f1': f1}

trainingargs = TrainingArguments(
    learning_rate=lr,
    weight_decay=1e-2,
    output_dir='/content/training_results',
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    report_to=None,
    metric_for_best_model="f1",
    save_strategy='epoch',
    load_best_model_at_end=True,
    seed=random_seed,
    optim='adamw_torch',
)

trainer = Trainer(
    model=model,
    args=trainingargs,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Model loading and training (country + keyword + text)

In [None]:
# THESE CELLS HAVE BEEN RUN USING LOADING_MODE = 'ck'
assert LOADING_MODE == 'ck'

trainer.train()
# Here the best model has been already loaded
trainer.predict(test_dataset)

Epoch,Training Loss,Validation Loss,F1
1,No log,0.220607,0.368421
2,No log,0.239641,0.394737
3,0.224500,0.31235,0.246154
4,0.224500,0.292776,0.56051
5,0.095000,0.376192,0.472325
6,0.095000,0.427947,0.570588
7,0.095000,0.452766,0.539474
8,0.029900,0.490229,0.528428
9,0.029900,0.512304,0.5
10,0.011200,0.523148,0.501767


PredictionOutput(predictions=array([[ 2.8297489, -2.512269 ],
       [-2.5974827,  2.1508582],
       [ 3.4445553, -3.0650253],
       ...,
       [ 3.9104006, -3.5750153],
       [ 3.9184618, -3.4718604],
       [ 3.9697366, -3.6238687]], dtype=float32), label_ids=array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32), metrics={'test_loss': 0.35620713233947754, 'test_f1': 0.6201923076923077, 'test_runtime': 29.2842, 'test_samples_per_second': 71.506, 'test_steps_per_second': 2.254})

In [None]:
trainer.save_model('./results/country_keyword_text')

# Model loading and training (keyword + text)

In [16]:
# THESE CELLS HAVE BEEN RUN USING LOADING_MODE = 'k'
assert LOADING_MODE == 'k'

trainer.train()
# Here the best model has been already loaded
trainer.predict(test_dataset)

Epoch,Training Loss,Validation Loss,F1
1,No log,0.212646,0.440299
2,No log,0.240131,0.446352
3,0.206500,0.315793,0.281407
4,0.206500,0.314972,0.582822
5,0.089100,0.397497,0.531915
6,0.089100,0.442892,0.573134
7,0.089100,0.472473,0.537102
8,0.026300,0.503671,0.533762
9,0.026300,0.524377,0.529032
10,0.013700,0.53001,0.537217


Checkpoint destination directory /content/training_results/checkpoint-210 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /content/training_results/checkpoint-420 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /content/training_results/checkpoint-630 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /content/training_results/checkpoint-840 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /content/training_results/checkpoint-1050 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /content/training_results/checkpoint-1260 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory

PredictionOutput(predictions=array([[ 1.1266661, -1.0299623],
       [-2.291632 ,  2.275976 ],
       [ 2.7459142, -2.7911124],
       ...,
       [ 2.8427079, -2.972321 ],
       [ 2.6355867, -2.8945975],
       [ 3.381178 , -3.3029482]], dtype=float32), label_ids=array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32), metrics={'test_loss': 0.2695818543434143, 'test_f1': 0.5956416464891041, 'test_runtime': 29.296, 'test_samples_per_second': 71.477, 'test_steps_per_second': 2.253})

In [17]:
trainer.save_model('./results/keyword_text')

# Model loading and training (country + text)

In [16]:
# THESE CELLS HAVE BEEN RUN USING LOADING_MODE = 'c'
assert LOADING_MODE == 'c'

trainer.train()
# Here the best model has been already loaded
trainer.predict(test_dataset)

Epoch,Training Loss,Validation Loss,F1
1,No log,0.224099,0.366667
2,No log,0.275583,0.374429
3,0.238800,0.364687,0.132597
4,0.238800,0.320067,0.516556
5,0.111700,0.428977,0.413793
6,0.111700,0.46377,0.460481
7,0.111700,0.478384,0.448669
8,0.042900,0.522087,0.473868
9,0.042900,0.544582,0.454545
10,0.015000,0.549705,0.463768


PredictionOutput(predictions=array([[ 2.0092242, -1.7410232],
       [-1.4642643,  1.6755875],
       [ 2.9298952, -2.562678 ],
       ...,
       [ 3.3583739, -3.1843789],
       [ 2.6277986, -2.431779 ],
       [ 3.2849154, -3.2131083]], dtype=float32), label_ids=array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32), metrics={'test_loss': 0.244293674826622, 'test_f1': 0.5945945945945945, 'test_runtime': 29.2735, 'test_samples_per_second': 71.532, 'test_steps_per_second': 2.255})

In [17]:
trainer.save_model('./results/country_text')