In [1]:
# Mount Google Drive at /content/drive; the project folder entered in the next cell lives there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd '/content/drive/MyDrive/Colab Notebooks/NLP'

/content/drive/.shortcut-targets-by-id/1a0AYktucYMqPmaYN7W78vjau1UVtyLYB/NLP


In [3]:
!pip install -q accelerate -U
!pip install -q nlpaug simpletransformers

In [4]:
import os
import wandb
import pandas as pd
import logging
import torch
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np
from dont_patronize_me import DontPatronizeMe
from torch.utils.data import Dataset
import pandas as pd
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
import data_analysis_and_preprocessing.data_preprocessing as data_preprocessing
import nlpaug.augmenter.word as naw

# Start wandb in disabled mode so training does not trigger an authorization request.
# NOTE(review): WANDB_START_METHOD presumably only affects how wandb launches - confirm it is still needed.
os.environ["WANDB_START_METHOD"]="thread"
wandb.init(mode="disabled")



In [5]:
# --- Experiment configuration -------------------------------------------
random_seed = 42
PREPROCESSING_MODE = 'BASIC' # Choose between BASIC, MEDIUM and HEAVY
# Text prefix used by get_rows: 'c' = country, 'k' = keyword, 'ck' = both, '' = none
LOADING_MODE = 'k'

# --- Logging: INFO globally, but only warnings from transformers ---------
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# --- Device selection ----------------------------------------------------
cuda_available = torch.cuda.is_available()
if cuda_available:
    device = 'cuda'
else:
    device = 'cpu'

print('Cuda available?', cuda_available)

Cuda available? True


In [6]:
def preprocess_data(data):
    """Run the cleaning steps selected by ``PREPROCESSING_MODE`` over ``data``.

    The steps come from the ``data_preprocessing`` module and are applied in a
    fixed order, each receiving the previous step's result:

    * always: ``remove_h_tags``, ``remove_ampersands``, ``remove_mentions``,
      ``remove_contractions``, ``remove_extra_spaces``
    * any mode other than ``'BASIC'``: additionally ``lowercase``
    * ``'HEAVY'`` only: additionally ``remove_multiple_quotations``

    Returns whatever the final step returns.
    """
    pipeline = [
        data_preprocessing.remove_h_tags,
        data_preprocessing.remove_ampersands,
        data_preprocessing.remove_mentions,
    ]
    if PREPROCESSING_MODE != 'BASIC':
        pipeline.append(data_preprocessing.lowercase)
    pipeline.append(data_preprocessing.remove_contractions)
    if PREPROCESSING_MODE == 'HEAVY':
        pipeline.append(data_preprocessing.remove_multiple_quotations)
    pipeline.append(data_preprocessing.remove_extra_spaces)

    cleaned = data
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned


def get_rows(data, loading_mode=None):
    """Build the ``par_id`` / ``text`` / ``label`` records for every row of ``data``.

    Args:
        data: DataFrame with ``par_id``, ``keyword``, ``country``, ``text`` and
            ``label`` columns. Rows are read positionally, so the index does
            not need to be unique.
        loading_mode: Which metadata is prepended to the text, joined with
            ``' | '``: ``'c'`` (country), ``'k'`` (keyword) or ``'ck'``
            (country, then keyword). Any other value leaves the text as is.
            When ``None`` the notebook-level ``LOADING_MODE`` is used.

    Returns:
        list[dict]: one ``{'par_id', 'text', 'label'}`` dict per row, in row order.
    """
    if loading_mode is None:
        loading_mode = LOADING_MODE

    rows = [] # will contain par_id, label and text
    # Walk the columns in lockstep instead of doing a label-based row lookup per row.
    columns = (data['par_id'], data['keyword'], data['country'], data['text'], data['label'])
    for parid, keyword, country, text, label in zip(*columns):
        if loading_mode == 'c':
            text = country + ' | ' + text
        elif loading_mode == 'k':
            text = keyword + ' | ' + text
        elif loading_mode == 'ck':
            text = country + ' | ' + keyword + ' | ' + text
        rows.append({
            'par_id':parid,
            'text':text,
            'label':label
        })
    return rows


def _select_paragraphs(data, par_ids):
    """Return the rows of ``data`` matching ``par_ids``, in the order of ``par_ids``.

    The result has a fresh 0..n-1 index and the columns ``par_id``, ``keyword``,
    ``country``, ``text`` and ``label``. Raises ``KeyError`` if an id is absent
    from ``data``.
    """
    # Index once by par_id instead of scanning the whole frame for every id;
    # only the first occurrence of each par_id is kept.
    by_id = data.drop_duplicates(subset='par_id').set_index('par_id')
    selected = by_id.loc[list(par_ids), ['keyword', 'country', 'text', 'label']]
    return selected.reset_index()


def load_data(random_state=random_seed):
    """Load the task-1 paragraphs and split them into train and dev frames.

    The paragraphs come from ``DontPatronizeMe('.', '.')``; the split is given
    by the par_id lists in ``data/train_semeval_parids-labels.csv`` and
    ``data/dev_semeval_parids-labels.csv``. Both splits go through
    ``preprocess_data`` and ``get_rows``. The training split is shuffled with
    ``random_state`` before preprocessing; the dev split keeps the CSV order.

    Args:
        random_state: Seed for shuffling the training split.

    Returns:
        tuple: ``(train_dataframe, test_dataframe)``, each built from the
        records returned by ``get_rows``.
    """
    dpm = DontPatronizeMe('.', '.')
    dpm.load_task1()
    data = dpm.train_task1_df

    # par_ids are compared as strings against the task dataframe.
    train_ids = pd.read_csv('data/train_semeval_parids-labels.csv').par_id.astype(str)
    dev_ids = pd.read_csv('data/dev_semeval_parids-labels.csv').par_id.astype(str)

    train_set = _select_paragraphs(data, train_ids).sample(frac=1, random_state=random_state)
    train_set = preprocess_data(train_set)
    # reindex the train_set after shuffling
    train_set = train_set.reset_index(drop=True)
    train_dataframe = pd.DataFrame(get_rows(train_set))

    test_set = preprocess_data(_select_paragraphs(data, dev_ids))
    test_dataframe = pd.DataFrame(get_rows(test_set))

    print(len(train_dataframe), len(test_dataframe))

    return train_dataframe, test_dataframe



class CustomDataset(Dataset):
    """Dataset that serves pre-tokenized encodings one example at a time.

    ``encodings`` must expose ``input_ids``, ``token_type_ids``,
    ``attention_mask`` and ``label`` as attributes, each indexable by example
    position.
    """

    # (attribute on the encodings object, key in the returned example dict)
    _FIELDS = (
        ('input_ids', 'input_ids'),
        ('token_type_ids', 'token_type_ids'),
        ('attention_mask', 'attention_mask'),
        ('label', 'labels'),
    )

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the length of ``input_ids``."""
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict; ``label`` is exposed as ``labels``."""
        return {key: getattr(self.encodings, attr)[idx] for attr, key in self._FIELDS}

In [7]:
# Build the shuffled training frame and the dev frame (records produced by get_rows).
train_dataset_raw, test_dataset_raw = load_data()

  data_no_mentions.at[index, 'text'] = re.sub(r'@([[a-z]|[A-Z]|[1-9]|0])+\s', '', data_no_mentions['text'][index][:5])


8375 2094


In [8]:
# Preview the first rows to check the text prefix selected by LOADING_MODE.
train_dataset_raw.head()

Unnamed: 0,par_id,text,label
0,4355,in-need | The Kindness Institute came about th...,0
1,8147,refugee | Minister Swaminathan forwarded a Cab...,0
2,1728,"migrant | ""Human Rights Watch last month relea...",0
3,3781,"migrant | ""The book is """" Never Look an Americ...",0
4,5083,immigrant | Born to immigrant Indian parents i...,0


In [9]:
# Tokenizer belonging to the "microsoft/deberta-base" checkpoint
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

train_text = train_dataset_raw.text.values
test_text = test_dataset_raw.text.values

max_length = 192

# Every text is padded/truncated to exactly max_length tokens and returned as PyTorch tensors.
encoding_train = tokenizer(
    train_text.tolist(),
    return_tensors='pt',
    padding="max_length",
    truncation=True,
    max_length=max_length,
)
encoding_test = tokenizer(
    test_text.tolist(),
    return_tensors='pt',
    padding="max_length",
    truncation=True,
    max_length=max_length,
)

# One-hot float targets: label 1 -> [0, 1], any other label -> [1, 0].
for encoding, frame in ((encoding_train, train_dataset_raw), (encoding_test, test_dataset_raw)):
    one_hot = [[0,1] if x == 1 else [1,0] for x in frame['label'].tolist()]
    encoding['label'] = torch.tensor(one_hot, dtype=torch.float32)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
# Wrap the tokenized train and dev encodings in CustomDataset so they can be indexed per example
train_dataset = CustomDataset(encoding_train)
test_dataset = CustomDataset(encoding_test)

In [11]:
# Pretrained DeBERTa with a 2-label sequence-classification head, moved to the selected device
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2)
model = model.to(device)

# Fine-tuning hyperparameters passed to TrainingArguments below
epochs = 8
lr = 1e-5
batch_size = 32

def compute_metrics(eval_pred):
    """Compute the binary F1 score for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. Both are reduced to class
    indices with an argmax over the last axis before scoring.

    Returns:
        dict: ``{'f1': <score>}``.
    """
    raw_scores, one_hot_targets = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    true_classes = np.argmax(one_hot_targets, axis=-1)
    return {'f1': f1_score(true_classes, predicted_classes, average='binary')}

# Evaluate and checkpoint once per epoch, then reload the checkpoint with the best
# value of the "f1" metric returned by compute_metrics.
trainingargs = TrainingArguments(
    learning_rate=lr,
    weight_decay=1e-2,
    output_dir='/content/training_results',
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    # The string "none" disables all reporting integrations; Python None does not.
    report_to="none",
    metric_for_best_model="f1",
    save_strategy='epoch',
    load_best_model_at_end=True,
    seed=random_seed,
    optim='adamw_torch',
)

# NOTE(review): the dev split serves both for checkpoint selection and for the
# reported score, so that score is optimistic.
trainer = Trainer(
    model=model,
    args=trainingargs,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

  return self.fget.__get__(instance, owner)()
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Fine-tune; evaluation and checkpointing happen once per epoch (see TrainingArguments)
trainer.train()
# load_best_model_at_end=True, so the best checkpoint is already loaded for this prediction
trainer.predict(test_dataset)

Epoch,Training Loss,Validation Loss,F1
1,No log,0.208391,0.495726
2,0.240100,0.191128,0.502994
3,0.240100,0.23147,0.430108
4,0.124700,0.231382,0.631325
5,0.124700,0.311043,0.585227
6,0.051700,0.351695,0.567335
7,0.051700,0.364957,0.618557
8,0.027600,0.375086,0.621762


Checkpoint destination directory /content/training_results/checkpoint-262 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /content/training_results/checkpoint-524 already exists and is non-empty. Saving will proceed but saved results may be invalid.


PredictionOutput(predictions=array([[ 1.558032 , -1.3788724],
       [-1.8434662,  2.0589075],
       [ 3.0188162, -2.8229675],
       ...,
       [ 2.130239 , -2.0509171],
       [ 1.3485687, -1.0937474],
       [ 2.8949938, -2.7066727]], dtype=float32), label_ids=array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32), metrics={'test_loss': 0.23138169944286346, 'test_f1': 0.6313253012048193, 'test_runtime': 29.8062, 'test_samples_per_second': 70.254, 'test_steps_per_second': 2.214})

In [13]:
# Persist the model currently held by the trainer under ./results/final_model
trainer.save_model('./results/final_model')

In [24]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
    """Write one prediction per line to ``outf_path``.

    Args:
        p: Iterable of predictions; each item is written as ``str(item)``
            followed by a newline.
        outf_path: Destination file path. Missing parent directories are
            created, so the call also works when the output folder does not
            exist yet. An existing file is overwritten.
    """
    parent_dir = os.path.dirname(outf_path)
    if parent_dir:  # empty when outf_path is a bare file name
        os.makedirs(parent_dir, exist_ok=True)
    with open(outf_path,'w') as outf:
        outf.writelines(str(pi)+'\n' for pi in p)

In [18]:
# Keep the dev-set prediction output; its .predictions are turned into labels below
preds_test = trainer.predict(test_dataset)

In [22]:
# Predicted class per dev example: index of the larger of the two output scores
preds_test.predictions.argmax(axis=1)

array([0, 1, 0, ..., 0, 0, 0])

In [25]:
# Write the dev-set labels, one per line, to ./predictions/dev.txt
labels2file(preds_test.predictions.argmax(axis=1), './predictions/dev.txt')

In [100]:
# Load official test set
# Every line of the TSV is treated as data and split on tabs into five fields.
with open('./data/task4_test.tsv') as f:
    rows = [line.strip().split('\t') for line in f]
official_test = pd.DataFrame(
    rows,
    columns=["par_id", "art_id", "keyword", "country", "text"],
)

# The file has no label column: add a placeholder label of 0 and replace
# par_id with a running integer so the frame has the columns get_rows reads.
official_test['label'] = [0] * len(official_test)
official_test['par_id'] = list(range(len(official_test)))

# Same cleaning and text-prefixing path as the train/dev splits.
official_test_set = pd.DataFrame(official_test)
official_test_ds = preprocess_data(official_test_set)

rows_official_test = get_rows(official_test_ds)
test_official_dataframe_raw = pd.DataFrame(rows_official_test)

In [101]:
# Tokenize the official test texts, padded/truncated to the same max_length as train/dev
official_test_text = test_official_dataframe_raw.text.values
encoding_official_test = tokenizer(official_test_text.tolist(), return_tensors='pt', padding="max_length", truncation=True, max_length=max_length)

In [103]:
# The labels are the placeholder zeros, so every target is [1, 0]; they exist only
# because CustomDataset reads a `label` entry for each example.
encoding_official_test['label'] = torch.tensor([[0,1] if x == 1 else [1,0] for x in test_official_dataframe_raw['label'].tolist()], dtype=torch.float32)
official_test_dataset = CustomDataset(encoding_official_test)

In [108]:
# Run inference on the official test set; its metrics are meaningless because the labels are placeholders
preds_official_test = trainer.predict(official_test_dataset)

In [109]:
# Write the official-test labels, one per line, to ./predictions/test.txt
labels2file(preds_official_test.predictions.argmax(axis=1), './predictions/test.txt')