In [1]:
# Shell escape: print the NVIDIA driver/GPU status table for the current runtime.
!nvidia-smi

Sat Mar  2 19:25:06 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Imports

In [2]:
from google.colab import drive

In [3]:
# Mount Google Drive at /content/drive; later cells read and write files under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install -q accelerate -U
!pip install -q simpletransformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m24.0 

In [5]:
# Switch the working directory to the project folder on the mounted Drive, so the relative
# paths used below ('./data', './results/...') resolve against it.
# NOTE(review): this bare `cd` relies on IPython automagic; the project-local modules imported
# later (dont_patronize_me, data_preprocessing) presumably live in this folder — confirm.
cd '/content/drive/My Drive/Colab Notebooks/NLP'

/content/drive/.shortcut-targets-by-id/1a0AYktucYMqPmaYN7W78vjau1UVtyLYB/NLP


In [6]:
import os
import wandb

# Disable wandb authorization request:
# - WANDB_START_METHOD is set to "thread" in the process environment before wandb is initialised.
# - wandb.init(mode="disabled") starts wandb in its disabled mode, so no login/API-key prompt
#   is expected later on (NOTE(review): confirm against the installed wandb version).
os.environ["WANDB_START_METHOD"]="thread"
wandb.init(mode="disabled")



In [7]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from ast import literal_eval
import numpy as np
from dont_patronize_me import DontPatronizeMe
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
import torch.functional as F
import tqdm
import data_preprocessing

In [8]:
# Single seed shared by the train/validation split (load_data default) and the Trainer (seed=...).
random_seed = 42

In [9]:
# Logging: INFO for the notebook in general, WARNING only for the "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Device selection: use the GPU when CUDA is reported as available, otherwise the CPU.
cuda_available = torch.cuda.is_available()
if cuda_available:
    device = 'cuda'
else:
    device = 'cpu'

print('Cuda available?', cuda_available)

Cuda available? True


# Data preprocessing mode

In [10]:
# Controls which cleaning steps preprocess_data applies, and which of the three training
# sections further down is allowed to run (each one asserts on this value).
PREPROCESSING_MODE = 'HEAVY' # Choose between BASIC, MEDIUM and HEAVY

In [11]:
def preprocess_data(data, mode=None):
    """Run the text-cleaning pipeline for the selected preprocessing mode.

    Every step is delegated to the project-local ``data_preprocessing`` module and the
    output of one step is fed to the next. The steps run in this order:

    * all modes: ``remove_h_tags``, ``remove_ampersands``, ``remove_mentions``
    * MEDIUM and HEAVY only: ``lowercase``
    * all modes: ``remove_contractions``
    * HEAVY only: ``remove_multiple_quotations``
    * all modes: ``remove_extra_spaces``

    Args:
        data: Object accepted by the ``data_preprocessing`` helpers.
        mode: One of ``'BASIC'``, ``'MEDIUM'`` or ``'HEAVY'``. When omitted, the
            notebook-level ``PREPROCESSING_MODE`` is used.

    Returns:
        Whatever the final helper (``remove_extra_spaces``) returns.

    Raises:
        ValueError: If the mode is not one of the three supported names. Without this
            check a typo such as ``'heavy'`` would silently run the MEDIUM pipeline.
    """
    mode = PREPROCESSING_MODE if mode is None else mode
    if mode not in ('BASIC', 'MEDIUM', 'HEAVY'):
        raise ValueError(
            f"Unknown preprocessing mode {mode!r}; expected 'BASIC', 'MEDIUM' or 'HEAVY'"
        )

    data = data_preprocessing.remove_h_tags(data)
    data = data_preprocessing.remove_ampersands(data)
    data = data_preprocessing.remove_mentions(data)
    if mode != 'BASIC':
        data = data_preprocessing.lowercase(data)
    data = data_preprocessing.remove_contractions(data)
    if mode == 'HEAVY':
        data = data_preprocessing.remove_multiple_quotations(data)
    data = data_preprocessing.remove_extra_spaces(data)
    return data

# Data loading

In [12]:
def _rows_for_par_ids(data, par_ids):
    """Build one record per paragraph id, in the order the ids are given.

    Args:
        data: DataFrame with ``par_id``, ``keyword``, ``text`` and ``label`` columns.
        par_ids: Iterable of paragraph ids to look up in ``data.par_id``.

    Returns:
        List of dicts with keys ``par_id``, ``community``, ``text`` and ``label``.

    Raises:
        IndexError: If an id has no matching row in ``data``.
    """
    rows = []
    for parid in par_ids:
        # select the matching row(s) from the original dataset; the first match supplies the values
        instance = data.loc[data.par_id == parid]
        if instance.empty:
            raise IndexError(
                f"par_id {parid!r} is listed in a split file but not present in the task-1 data"
            )
        rows.append({
            'par_id': parid,
            'community': instance.keyword.values[0],
            'text': instance.text.values[0],
            'label': instance.label.values[0],
        })
    return rows


def load_data(train_size=0.8, random_state=random_seed):
    """Load the task-1 data and return train, validation and test DataFrames.

    The task-1 frame comes from ``DontPatronizeMe`` and is cleaned with
    ``preprocess_data``. The ids in ``data/train_semeval_parids-labels.csv`` are split
    into train and validation with ``train_test_split``; the ids in
    ``data/dev_semeval_parids-labels.csv`` form the test set.

    Args:
        train_size: Passed to ``train_test_split`` as the share kept for training.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Tuple ``(train_df, val_df, test_df)``; each frame has the columns
        ``par_id``, ``community``, ``text`` and ``label``.

    Raises:
        IndexError: If a split file lists a ``par_id`` missing from the task-1 data.
    """
    dpm = DontPatronizeMe('./data', './data')
    dpm.load_task1()
    trids = pd.read_csv('data/train_semeval_parids-labels.csv')
    teids = pd.read_csv('data/dev_semeval_parids-labels.csv')
    # The split-file ids are cast to str before being compared with data.par_id.
    trids.par_id = trids.par_id.astype(str)
    teids.par_id = teids.par_id.astype(str)
    data = dpm.train_task1_df
    data = preprocess_data(data)

    rows_train_val = _rows_for_par_ids(data, trids.par_id)  # par_id, community, text and label
    rows_train, rows_val = train_test_split(rows_train_val, train_size=train_size, random_state=random_state)

    rows_test = _rows_for_par_ids(data, teids.par_id)
    print(len(rows_train), len(rows_val), len(rows_test))

    return pd.DataFrame(rows_train), pd.DataFrame(rows_val), pd.DataFrame(rows_test)

train_dataset_raw, eval_dataset_raw, test_dataset_raw = load_data(train_size=0.8)

  data_no_mentions.at[index, 'text'] = re.sub(r'@([[a-z]|[A-Z]|[1-9]|0])+\s', '', data_no_mentions['text'][index][:5])


6700 1675 2094


In [13]:
# Preview the first rows of the training split (columns: par_id, community, text, label).
train_dataset_raw.head()

Unnamed: 0,par_id,community,text,label
0,5423,disabled,critics have even taken to dobbing in katrina ...,0
1,6890,in-need,alexis and her family decided to donate more t...,0
2,1394,vulnerable,"mr porter , do you think you will get the numb...",1
3,4445,vulnerable,""" this only serves to highlight the importance...",0
4,7991,hopeless,""" i was nervous , but life has taught me over ...",0


# Encode data into a Dataset

In [14]:
# Tokenizer of the same checkpoint that is fine-tuned later ("microsoft/deberta-base").
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

# Every text is padded/truncated to max_length tokens; batch_size is reused by TrainingArguments.
max_length = 192
batch_size = 32


def _encode_split(frame):
    """Tokenize ``frame.text`` and attach one-hot float labels under the ``'label'`` key.

    A label equal to 1 becomes ``[0, 1]``; any other label becomes ``[1, 0]``.
    """
    encoding = tokenizer(
        frame.text.values.tolist(),
        return_tensors='pt',
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    one_hot = [[0, 1] if value == 1 else [1, 0] for value in frame['label'].tolist()]
    encoding['label'] = torch.tensor(one_hot, dtype=torch.float32)
    return encoding


train_text = train_dataset_raw.text.values
eval_text = eval_dataset_raw.text.values
test_text = test_dataset_raw.text.values

encoding_train = _encode_split(train_dataset_raw)
encoding_eval = _encode_split(eval_dataset_raw)
encoding_test = _encode_split(test_dataset_raw)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

In [15]:
class CustomDataset(Dataset):
    """Dataset that serves pre-tokenized encodings one sample at a time.

    ``encodings`` must expose ``input_ids``, ``token_type_ids``, ``attention_mask`` and
    ``label`` as attributes, each indexable by sample position.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The sample count is taken from the input_ids field.
        input_ids = self.encodings.input_ids
        return len(input_ids)

    def __getitem__(self, idx):
        """Return the fields of sample ``idx`` as a dict keyed by model argument name."""
        enc = self.encodings
        sample = {
            field: getattr(enc, field)[idx]
            for field in ('input_ids', 'token_type_ids', 'attention_mask')
        }
        # Stored as 'label' on the encodings, but returned under the 'labels' key.
        sample['labels'] = enc.label[idx]
        return sample

In [16]:
# Create an instance of the CustomDataset class
train_dataset = CustomDataset(encoding_train)
eval_dataset = CustomDataset(encoding_eval)
test_dataset = CustomDataset(encoding_test)

# Model loading

In [17]:
# Load the pretrained DeBERTa base checkpoint with a 2-label classification head and
# move it to the device selected earlier.
model = DebertaForSequenceClassification.from_pretrained(
    "microsoft/deberta-base",
    num_labels=2,
).to(device)

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Training hyper-parameters consumed by TrainingArguments below.
epochs = 10
lr = 1e-5

def compute_metrics(eval_pred):
    """Compute the binary F1 score for a ``(logits, one-hot labels)`` pair.

    Both arrays are reduced to class indices with an argmax over the last axis
    before being passed to ``f1_score``.

    Returns:
        ``{'f1': <score>}``
    """
    scores, one_hot_targets = eval_pred
    predicted, expected = (
        np.argmax(array, axis=-1) for array in (scores, one_hot_targets)
    )
    return {'f1': f1_score(expected, predicted, average='binary')}

# Fine-tuning configuration for the Hugging Face Trainer.
trainingargs = TrainingArguments(
    learning_rate=lr,
    weight_decay=1e-2,
    output_dir='/content/training_results',
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint on the same per-epoch schedule, so the checkpoint with the
    # best validation F1 can be reloaded when training ends.
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
    # confirm against the installed version, since the install cell is unpinned.
    evaluation_strategy="epoch",
    # The *string* "none" turns off every logging integration. Python `None` does not:
    # in transformers 4.x it falls back to "all" and would enable wandb if installed.
    report_to="none",
    metric_for_best_model="f1",
    save_strategy='epoch',
    load_best_model_at_end=True,
    seed=random_seed,
    optim='adamw_torch',
)

# Trainer wiring: fine-tunes `model` on train_dataset, evaluates on eval_dataset and
# reports the F1 value returned by compute_metrics.
trainer = Trainer(
    model=model,
    args=trainingargs,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Training (heavy preprocessing)

In [19]:
# THESE CELLS HAVE BEEN RUN USING PREPROCESSING_MODE = 'HEAVY'
# Guard: stop here unless the notebook is configured for the HEAVY preprocessing mode.
assert PREPROCESSING_MODE == 'HEAVY'

# Fine-tune with the Trainer built above (validation F1 is computed after every epoch).
trainer.train()
# Here the best model has been already loaded
# (load_best_model_at_end=True, selected by metric_for_best_model="f1"),
# so the test-set predictions and metrics below come from that checkpoint.
trainer.predict(test_dataset)

Epoch,Training Loss,Validation Loss,F1
1,No log,0.21427,0.355556
2,No log,0.23787,0.416667
3,0.223100,0.313982,0.258706
4,0.223100,0.339713,0.535385
5,0.098600,0.396193,0.548673
6,0.098600,0.477788,0.524691
7,0.098600,0.506341,0.532508
8,0.029500,0.549736,0.493421
9,0.029500,0.560687,0.48505
10,0.010500,0.569182,0.512821


PredictionOutput(predictions=array([[ 2.6828818, -2.1618726],
       [-2.7284079,  2.9713726],
       [ 2.4795363, -2.1829147],
       ...,
       [ 3.8723516, -3.6909125],
       [ 3.2581713, -2.6808486],
       [ 3.638889 , -3.4687157]], dtype=float32), label_ids=array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32), metrics={'test_loss': 0.32404041290283203, 'test_f1': 0.6127450980392157, 'test_runtime': 27.904, 'test_samples_per_second': 75.043, 'test_steps_per_second': 2.365})

In [20]:
# Persist the model fine-tuned on HEAVY-preprocessed data.
# The guard matches the HEAVY training cell, so a model trained under another
# preprocessing mode cannot be written to this directory by mistake.
assert PREPROCESSING_MODE == 'HEAVY'
trainer.save_model('./results/preprocessing_heavy')

# Training (medium preprocessing)

In [None]:
# THESE CELLS HAVE BEEN RUN USING PREPROCESSING_MODE = 'MEDIUM'
# Guard: stop here unless the notebook is configured for the MEDIUM preprocessing mode.
assert PREPROCESSING_MODE == 'MEDIUM'

# Fine-tune with the Trainer built above (validation F1 is computed after every epoch).
trainer.train()
# Here the best model has been already loaded
# (load_best_model_at_end=True, selected by metric_for_best_model="f1"),
# so the test-set predictions and metrics below come from that checkpoint.
trainer.predict(test_dataset)

Epoch,Training Loss,Validation Loss,F1
1,No log,0.212399,0.414634
2,No log,0.267015,0.349057
3,0.215300,0.313006,0.288557
4,0.215300,0.318724,0.559271
5,0.093800,0.376536,0.519737
6,0.093800,0.488184,0.561605
7,0.093800,0.492846,0.5
8,0.026100,0.535754,0.525641
9,0.026100,0.54984,0.530744
10,0.010300,0.555744,0.521452


PredictionOutput(predictions=array([[ 3.0103471, -3.97162  ],
       [-0.9139962,  0.9233532],
       [ 3.2594917, -4.0929203],
       ...,
       [ 3.5459793, -4.537563 ],
       [ 2.210919 , -2.8600724],
       [ 3.5591216, -4.5897427]], dtype=float32), label_ids=array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32), metrics={'test_loss': 0.37593376636505127, 'test_f1': 0.6223277909738718, 'test_runtime': 29.2553, 'test_samples_per_second': 71.577, 'test_steps_per_second': 2.256})

In [None]:
# Persist the model fine-tuned on MEDIUM-preprocessed data.
# The guard matches the MEDIUM training cell, so a model trained under another
# preprocessing mode cannot be written to this directory by mistake.
assert PREPROCESSING_MODE == 'MEDIUM'
trainer.save_model('./results/preprocessing_medium')

# Training (basic preprocessing)

In [None]:
# THESE CELLS HAVE BEEN RUN USING PREPROCESSING_MODE = 'BASIC'
# Guard: stop here unless the notebook is configured for the BASIC preprocessing mode.
assert PREPROCESSING_MODE == 'BASIC'

# Fine-tune with the Trainer built above (validation F1 is computed after every epoch).
trainer.train()
# Here the best model has been already loaded
# (load_best_model_at_end=True, selected by metric_for_best_model="f1"),
# so the test-set predictions and metrics below come from that checkpoint.
trainer.predict(test_dataset)

Epoch,Training Loss,Validation Loss,F1
1,No log,0.211788,0.387931
2,No log,0.269264,0.383562
3,0.208900,0.302541,0.248705
4,0.208900,0.303381,0.542373
5,0.091700,0.403096,0.52459
6,0.091700,0.47353,0.56875
7,0.091700,0.464331,0.55102
8,0.026500,0.504486,0.554217
9,0.026500,0.508914,0.547297
10,0.011500,0.513645,0.536082


PredictionOutput(predictions=array([[ 4.103591 , -3.7324166],
       [-3.245971 ,  3.564775 ],
       [ 4.13931  , -3.5618305],
       ...,
       [ 4.2827106, -3.6968062],
       [ 3.9887986, -3.548049 ],
       [ 4.26543  , -3.870328 ]], dtype=float32), label_ids=array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32), metrics={'test_loss': 0.38589534163475037, 'test_f1': 0.6157894736842104, 'test_runtime': 28.3554, 'test_samples_per_second': 73.848, 'test_steps_per_second': 2.328})

In [None]:
# Persist the model fine-tuned on BASIC-preprocessed data.
# The guard matches the BASIC training cell, so a model trained under another
# preprocessing mode cannot be written to this directory by mistake.
assert PREPROCESSING_MODE == 'BASIC'
trainer.save_model('./results/preprocessing_basic')