In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd '/content/drive/MyDrive/Colab Notebooks/NLP'

/content/drive/MyDrive/Colab Notebooks/NLP


In [3]:
!pip install -q accelerate -U
!pip install -q nlpaug simpletransformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m51.7 

In [4]:
import os
import wandb
import pandas as pd
import logging
import torch
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np
from dont_patronize_me import DontPatronizeMe
from torch.utils.data import Dataset
import pandas as pd
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
import data_analysis_and_preprocessing.data_preprocessing as data_preprocessing
import nlpaug.augmenter.word as naw

# Disable wandb authorization request
os.environ["WANDB_START_METHOD"]="thread"
wandb.init(mode="disabled")



In [5]:
random_seed = 42
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

device = 'cuda' if cuda_available else 'cpu'

print('Cuda available?', cuda_available)

PREPROCESSING_MODE = 'BASIC' # Choose between BASIC, MEDIUM and HEAVY
LOADING_MODE = 'k' # mode can be 'c', 'k', 'ck', ''

Cuda available? True


In [6]:
def preprocess_data(data):
    data = data_preprocessing.remove_h_tags(data)
    data = data_preprocessing.remove_ampersands(data)
    data = data_preprocessing.remove_mentions(data)
    if PREPROCESSING_MODE != 'BASIC':
        data = data_preprocessing.lowercase(data)
    data = data_preprocessing.remove_contractions(data)
    if PREPROCESSING_MODE == 'HEAVY':
        data = data_preprocessing.remove_multiple_quotations(data)
    data = data_preprocessing.remove_extra_spaces(data)
    return data


def get_rows(data):
    rows = [] # will contain par_id, label and text
    for idx in data.index:
        parid = data.loc[idx]['par_id']
        instance = data.loc[idx]
        keyword = instance['keyword']
        country = instance['country']
        text = instance['text']
        if LOADING_MODE == 'c':
            text = country + ' | ' + text
        elif LOADING_MODE == 'k':
            text = keyword + ' | ' + text
        elif LOADING_MODE == 'ck':
            text = country + ' | ' + keyword + ' | ' + text
        rows.append({
            'par_id':parid,
            'text':text,
            'label':instance['label']
        })
    return rows


def load_data(train_size=0.8, random_state=random_seed):
    dpm = DontPatronizeMe('./data', './data')
    dpm.load_task1()
    trids = pd.read_csv('data/train_semeval_parids-labels.csv')
    trids.par_id = trids.par_id.astype(str)
    teids = pd.read_csv('data/dev_semeval_parids-labels.csv')
    teids.par_id = teids.par_id.astype(str)
    data = dpm.train_task1_df

    dpm = DontPatronizeMe('./data', './data')
    dpm.load_task1()
    trids = pd.read_csv('data/train_semeval_parids-labels.csv')
    trids.par_id = trids.par_id.astype(str)
    teids.par_id = teids.par_id.astype(str)
    data = dpm.train_task1_df

    rows_train_val = [] # will contain par_id, label and text
    for idx in range(len(trids)):
        parid = trids.par_id[idx]
        # select row from original dataset to retrieve `text` and binary label
        instance = data.loc[data.par_id == parid]
        keyword = instance.keyword.values[0]
        text = instance.text.values[0]
        label = instance.label.values[0]
        country = instance.country.values[0]
        rows_train_val.append({
            'par_id':parid,
            'keyword':keyword,
            'country':country,
            'text':text,
            'label':label
        })

    rows_train, rows_val = train_test_split(rows_train_val, train_size=train_size, random_state=random_state)
    train_set = pd.DataFrame(rows_train)
    val_set = pd.DataFrame(rows_val)
    train_set = preprocess_data(train_set)
    val_set = preprocess_data(val_set)

    # reindex the train_set
    train_set = train_set.reset_index(drop=True)

    rows_train = get_rows(train_set)
    rows_val = get_rows(val_set)

    train_dataset_raw = pd.DataFrame(rows_train)

    train_dataframe, dev_dataframe = pd.DataFrame(rows_train), pd.DataFrame(rows_val)

    rows_test = [] # will contain par_id, label and text
    for idx in range(len(teids)):
        parid = teids.par_id[idx]
        #print(parid)
        # select row from original dataset
        instance = data.loc[data.par_id == parid]
        keyword = instance.keyword.values[0]
        text = instance.text.values[0]
        label = instance.label.values[0]
        country = instance.country.values[0]
        rows_test.append({
            'par_id':parid,
            'keyword':keyword,
            'country':country,
            'text':text,
            'label':label
        })

    test_set = pd.DataFrame(rows_test)
    test_set = preprocess_data(test_set)

    rows_test = get_rows(test_set)
    test_dataframe = pd.DataFrame(rows_test)

    print(len(train_dataframe), len(dev_dataframe), len(test_dataframe))

    return train_dataframe, dev_dataframe, test_dataframe



class CustomDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        train_input_ids = self.encodings.input_ids[idx]
        train_token_type_ids = self.encodings.token_type_ids[idx]
        train_attention_mask = self.encodings.attention_mask[idx]
        train_labels = self.encodings.label[idx]
        return {
            'input_ids': train_input_ids,
            'token_type_ids': train_token_type_ids,
            'attention_mask': train_attention_mask,
            'labels': train_labels
        }

    def __len__(self):
        return len(self.encodings.input_ids)

In [7]:
train_dataset_raw, eval_dataset_raw, test_dataset_raw = load_data(train_size=0.8)

  data_no_mentions.at[index, 'text'] = re.sub(r'@([[a-z]|[A-Z]|[1-9]|0])+\s', '', data_no_mentions['text'][index][:5])


6700 1675 2094


In [8]:
train_dataset_raw.head()

Unnamed: 0,par_id,text,label
0,5423,disabled | Critics have even taken to dobbing ...,0
1,6890,in-need | Alexis and her family decided to don...,0
2,1394,"vulnerable | Mr Porter , do you think you will...",1
3,4445,"vulnerable | """""" This only serves to highlight...",0
4,7991,"hopeless | """""" I was nervous , but life has ta...",0


In [9]:
# Load the DeBERTa tokenizer
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

train_text = train_dataset_raw.text.values
eval_text = eval_dataset_raw.text.values
test_text = test_dataset_raw.text.values

max_length = 192

encoding_train = tokenizer(train_text.tolist(), return_tensors='pt', padding="max_length", truncation=True, max_length=max_length)
encoding_eval = tokenizer(eval_text.tolist(), return_tensors='pt', padding="max_length", truncation=True, max_length=max_length)
encoding_test = tokenizer(test_text.tolist(), return_tensors='pt', padding="max_length", truncation=True, max_length=max_length)

encoding_train['label'] = torch.tensor([[0,1] if x == 1 else [1,0] for x in train_dataset_raw['label'].tolist()], dtype=torch.float32)
encoding_eval['label'] = torch.tensor([[0,1] if x == 1 else [1,0] for x in eval_dataset_raw['label'].tolist()], dtype=torch.float32)
encoding_test['label'] = torch.tensor([[0,1] if x == 1 else [1,0] for x in test_dataset_raw['label'].tolist()], dtype=torch.float32)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

In [10]:
# Create an instance of the CustomDataset class
train_dataset = CustomDataset(encoding_train)
eval_dataset = CustomDataset(encoding_eval)
test_dataset = CustomDataset(encoding_test)

In [11]:
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2)
model = model.to(device)

epochs = 8
lr = 1e-5
batch_size = 32

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.argmax(labels, axis=-1)
    f1 = f1_score(labels, predictions, average='binary')
    return {'f1': f1}

trainingargs = TrainingArguments(
    learning_rate=lr,
    weight_decay=1e-2,
    output_dir='training_results',
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    report_to=None,
    metric_for_best_model="f1",
    save_strategy='epoch',
    load_best_model_at_end=True,
    seed=random_seed,
    optim='adamw_torch',
)

trainer = Trainer(
    model=model,
    args=trainingargs,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
trainer.train()
# Here the best model has been already loaded
trainer.predict(test_dataset)

Epoch,Training Loss,Validation Loss,F1
1,No log,0.216064,0.455516
2,No log,0.24426,0.415584
3,0.208200,0.307628,0.280193
4,0.208200,0.30097,0.591045
5,0.091300,0.378471,0.508711
6,0.091300,0.420289,0.55414
7,0.091300,0.458202,0.532423
8,0.032200,0.472286,0.528814


PredictionOutput(predictions=array([[ 0.55452704, -0.7930232 ],
       [-1.7411203 ,  1.8877645 ],
       [ 2.8376656 , -3.1553912 ],
       ...,
       [ 2.7258143 , -3.060103  ],
       [ 2.7767382 , -3.1395557 ],
       [ 3.121179  , -3.274678  ]], dtype=float32), label_ids=array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32), metrics={'test_loss': 0.2365959882736206, 'test_f1': 0.6277372262773723, 'test_runtime': 28.1852, 'test_samples_per_second': 74.294, 'test_steps_per_second': 2.342})

In [14]:
trainer.save_model('./results/keyword_preproc')