In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
import emoji
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from transformers import get_constant_schedule_with_warmup, get_linear_schedule_with_warmup
from tqdm import tqdm
import joblib
from transformers import DistilBertTokenizerFast, BertTokenizerFast
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, BertForSequenceClassification
from pathlib import Path
from scipy.special import softmax

# Pick the compute device: use CUDA when PyTorch can see a GPU, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def remove_emoji(s):
    """Return `s` with every emoji character removed.

    `emoji.get_emoji_regexp()` was deprecated and later removed from the
    `emoji` package, so `emoji.replace_emoji` is used whenever the installed
    version provides it; the regexp helper is kept only as a fallback for
    older releases.

    Args:
        s: input text.

    Returns:
        The text with emoji replaced by the empty string.
    """
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(s, replace='')
    # Older `emoji` releases: fall back to the original regexp-based removal.
    return emoji.get_emoji_regexp().sub(u'', s)
    
def count_trainable_paras(model):
    """Count the scalar parameters of `model` that require gradients.

    Args:
        model: object exposing `parameters()` whose items have
            `requires_grad` and `size()` (e.g. a `torch.nn.Module`).

    Returns:
        Sum over trainable parameters of the product of their dimensions.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            # Number of elements = product of the tensor's dimensions.
            total = total + np.prod(param.size())
    return total

def count_word(s, w):
    """Return how many times `w` occurs in `s` (delegates to `s.count`)."""
    occurrences = s.count(w)
    return occurrences
  
def preprocessing_sent(s):
    """Normalise one raw tweet before tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. rewrite the stand-alone placeholder token ``url`` as ``http``;
      3. replace emoji with their textual aliases via ``emoji.demojize``.

    Hashtags (``#``) and mentions (``@``) are left untouched.

    Args:
        s: the tweet text.

    Returns:
        The normalised tweet as a string.
    """
    import re  # local import so the function does not rely on a file-level `import re`

    s = s.lower()
    # Only the whole-word `url` is a placeholder. A plain substring replace
    # would also corrupt ordinary words such as "hourly" or "curly".
    s = re.sub(r'\burl\b', 'http', s)
    s = emoji.demojize(s)
    return s

# Data directory. Set the OLID_DATA_DIR environment variable to point at your
# copy of the dataset files; when it is unset the original default path is used.
import os
DATA_DIR = os.environ.get('OLID_DATA_DIR', '/home/nmduy/NUIG/data')

# Read Data

In [2]:
# Load the training tweets, keep only the sub-task A label, encode it as
# 0/1 and add a preprocessed copy of every tweet.
label = {'OFF': 1,'NOT': 0}
data = (
    pd.read_csv(f'{DATA_DIR}/olid-training-v1.0.tsv', sep='\t')
    .drop(columns=['subtask_b', 'subtask_c'])
)
data.columns = ['id', 'tweet', 'label']
data['label'] = [label[name] for name in data['label']]
data['tweet_demojize'] = [preprocessing_sent(text) for text in data['tweet']]
# Cell output: class balance of the encoded labels.
data.label.value_counts()

0    8840
1    4400
Name: label, dtype: int64

In [3]:
# Hold out 20% of the labelled data for validation, stratified on the label.
X = data[['tweet_demojize']]
y = data[['label']]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1509
)

# Re-attach the labels so each split is a two-column frame: tweet, label.
data_train = X_train.assign(label=y_train['label'].tolist())
data_train.columns = ['tweet', 'label']

data_val = X_val.assign(label=y_val['label'].tolist())
data_val.columns = ['tweet', 'label']

In [4]:
# Build the TEST set with the same preprocessing as the training data.
data_test = pd.read_csv(f'{DATA_DIR}/testset-levela.tsv', sep='\t')
data_test['tweet_demojize'] = [preprocessing_sent(text) for text in data_test['tweet']]
data_test.columns = ['id', 'tweet', 'tweet_demojize']

# Gold labels for the test set (the file has no header row).
label = {'OFF': 1,'NOT': 0}
label_test = pd.read_csv(f'{DATA_DIR}/labels-levela.csv', header=None, names=['id', 'label'])
data_test_id = data_test['id'].tolist()
# Captured before the mapping below, so this list holds the raw string labels.
data_test_label = label_test['label'].tolist()
label_test['label'] = [label[name] for name in label_test['label']]

# Join tweets with their labels on `id`, then keep only the preprocessed
# text and the label, renamed to match the train/validation frames.
data_test = (
    data_test.merge(label_test, on='id')
    .drop(columns=['id', 'tweet'])
)
data_test.columns = ['tweet', 'label']
data_test.head()

Unnamed: 0,tweet,label
0,#whoisq #wherestheserver #dumpnike #declasfisa...,1
1,"#constitutionday is revered by conservatives, ...",0
2,#foxnews #nra #maga #potus #trump #2ndamendmen...,0
3,#watching #boomer getting the news that she is...,0
4,#nopasaran: unity demo to oppose the far-right...,1


# Apply DistilBERT

In [5]:
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: mapping from field name to a per-example indexable
            sequence (anything exposing `.items()`).
        labels: indexable sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the 'labels' key.
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def compute_metrics(pred):
    """Compute the evaluation metrics reported by the `Trainer`.

    Args:
        pred: object exposing `predictions` (raw logits, one row per
            example) and `label_ids` (integer class ids).

    Returns:
        dict with `accuracy`, macro-averaged `f1` / `precision` / `recall`,
        and `lof1` = 1 - cross-entropy loss + macro F1, returned as a plain
        float rather than a torch tensor.
    """
    labels = pred.label_ids
    logits = pred.predictions
    preds = logits.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)

    # Cross-entropy over the raw logits. The class count is read from the
    # logits instead of being hard-coded to 2.
    logits_tensor = torch.tensor(logits)
    labels_tensor = torch.tensor(labels)
    num_classes = logits_tensor.shape[-1]
    loss_fct = nn.CrossEntropyLoss()
    loss = loss_fct(logits_tensor.view(-1, num_classes), labels_tensor.view(-1)).item()
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'lof1': float(1 - loss + f1) # take both loss and F1 score into account
    }

In [6]:
# Plain Python lists of texts (X_*) and labels (Y_*) for each split.
X_train, Y_train = data_train.tweet.tolist(), data_train.label.tolist()
X_val, Y_val = data_val.tweet.tolist(), data_val.label.tolist()
X_test, Y_test = data_test.tweet.tolist(), data_test.label.tolist()

In [7]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# Alternative tokenizer (use together with the BERT model):
#tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenise each split separately; every text is cast to str first, and each
# call truncates and pads its own batch.
X_encoding_train, X_encoding_val, X_encoding_test = (
    tokenizer([str(text) for text in texts], truncation=True, padding=True)
    for texts in (X_train, X_val, X_test)
)

In [8]:
# Wrap each (encodings, labels) pair in a TweetDataset.
train_ds, val_ds, test_ds = (
    TweetDataset(encodings, labels)
    for encodings, labels in (
        (X_encoding_train, Y_train),
        (X_encoding_val, Y_val),
        (X_encoding_test, Y_test),
    )
)

In [9]:
OUTPUT_DIR = Path('results')
LOG_DIR = Path('logs')
# Number of training epochs. With a strong GPU use 6 or more; on Colab 3 is
# enough. The original author notes convergence after roughly 2-4 epochs.
# NOTE(review): the training log recorded in this notebook lists 8 epochs,
# which does not match EPOCHES = 6 - the saved output probably comes from a
# run with a different setting; re-run to confirm.
EPOCHES = 6
BATCH_SIZE = 32

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=str(OUTPUT_DIR),
    logging_dir=str(LOG_DIR),
    logging_steps=200,
    # Schedule and batch sizes (evaluation uses 4x the training batch size).
    num_train_epochs=EPOCHES,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=4 * BATCH_SIZE,
    # Optimisation hyper-parameters.
    learning_rate=8e-6,
    warmup_steps=200,
    weight_decay=0.015,
    seed=1509,
    # Evaluate once per epoch and select the best model by the custom
    # `lof1` metric returned from compute_metrics (higher is better).
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='eval_lof1',
    greater_is_better=True,
)

In [10]:
# Load the pre-trained DistilBERT checkpoint into a sequence-classification model.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
# Alternative backbone (switch the tokenizer to the BERT one as well):
#model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
# Cell output: number of trainable parameters in the loaded model.
count_trainable_paras(model)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

66955010

In [11]:
# Train the model. This takes a while (around 20 minutes for 3 epochs on
# Google Colab). `training_args` sets load_best_model_at_end=True, so the best
# model according to `eval_lof1` is kept when training finishes.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Lof1
1,0.6218,0.448208,0.805514,0.7564,0.809001,0.737922,1.308191
2,0.4161,0.419868,0.818353,0.791714,0.797607,0.786919,1.371846
3,0.3955,0.42279,0.817598,0.791744,0.796055,0.788066,1.368955
4,0.3438,0.445205,0.818731,0.790771,0.799242,0.784348,1.345566
5,0.2992,0.457426,0.818731,0.786857,0.803526,0.776357,1.329431
6,0.2884,0.477431,0.813444,0.783837,0.793698,0.776679,1.306406
7,0.2534,0.506182,0.813822,0.783641,0.794743,0.77582,1.277459
8,0.2373,0.512638,0.808157,0.780497,0.785396,0.776429,1.267859


TrainOutput(global_step=2648, training_loss=0.34578607413703943, metrics={'train_runtime': 842.5411, 'train_samples_per_second': 3.143, 'total_flos': 0, 'epoch': 8.0})

In [12]:
# Evaluate on the test set. Results can differ slightly (less than 1%) from
# the report because of randomness.
pred_test = trainer.predict(test_ds)
# Softmax over the class axis, then keep the probability of class 1.
prob_test = softmax(pred_test.predictions, axis=-1)[:, 1]
pred_test.metrics

{'test_loss': 0.36703553795814514,
 'test_accuracy': 0.8558139534883721,
 'test_f1': 0.8100855541070958,
 'test_precision': 0.8337758247123805,
 'test_recall': 0.794018817204301,
 'test_lof1': 1.443049669265747,
 'test_runtime': 1.261,
 'test_samples_per_second': 681.993}

In [None]:
# Save the model held by `trainer` to the 'distilbert_taska.model' path.
trainer.save_model('distilbert_taska.model')