In [1]:
# Everything is the same with Task A
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
import emoji
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from transformers import get_constant_schedule_with_warmup, get_linear_schedule_with_warmup
from tqdm import tqdm
import joblib
from transformers import DistilBertTokenizerFast, BertTokenizerFast
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, BertForSequenceClassification
from pathlib import Path
from scipy.special import softmax

# specify GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def remove_emoji(s):
    return emoji.get_emoji_regexp().sub(u'', s)
    
def count_trainable_paras(model):
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    return params

def count_word(s, w):
    return s.count(w)
  
def preprocessing_sent(s):
    # Lower
    #s = segment(s)
    #s = ' '.join(s)
    s = s.lower()
    s = s.replace('url', 'http')
    #s = s.replace('#', '')
    #s = s.replace('@', '')
    #s = remove_emoji(s)
    s = emoji.demojize(s)
    return s

# Change the Data Directory here
DATA_DIR = '/home/nmduy/NUIG/data'

# Read Data

In [2]:
# Read and preprocess tweet
data = pd.read_csv(f'{DATA_DIR}/olid-training-v1.0.tsv', sep='\t')
data = data.drop(columns=['subtask_a', 'subtask_c'])
data = data.dropna(subset=['subtask_b'])
data.columns = ['id', 'tweet', 'label']
label = {'TIN': 1,'UNT': 0}
data.label = [label[item] for item in data.label]
data['tweet_demojize'] = [preprocessing_sent(x) for x in data.tweet.tolist()]

In [3]:
X = data.tweet_demojize.to_frame()
y = data.label.to_frame()

X_train, X_val, y_train, y_val = train_test_split(X, y,stratify=y, test_size=0.2, random_state=1509)

data_train = X_train.copy()
data_train['label'] = y_train.label.tolist()

data_val = X_val.copy()
data_val['label'] = y_val.label.tolist()

data_train.columns = ['tweet', 'label']
data_val.columns = ['tweet', 'label']

# Create TEST SET
data_test = pd.read_csv(f'{DATA_DIR}/testset-levelb.tsv', sep='\t')
data_test['tweet_demojize'] = [preprocessing_sent(x) for x in data_test.tweet.tolist()]
data_test.columns = ['id', 'tweet', 'tweet_demojize']

# Read label test set
label_test = pd.read_csv(f'{DATA_DIR}/labels-levelb.csv', header=None)
label_test.columns = ['id', 'label']
label = {'TIN': 1,'UNT': 0}
data_test_id = data_test['id'].tolist()
data_test_label = label_test['label'].tolist()
label_test.label = [label[item] for item in label_test.label]

data_test = pd.merge(data_test, label_test, on='id')
data_test = data_test.drop(columns=['id', 'tweet'])
data_test.columns = ['tweet', 'label']
data_test.head()

Unnamed: 0,tweet,label
0,#whoisq #wherestheserver #dumpnike #declasfisa...,1
1,#nopasaran: unity demo to oppose the far-right...,1
2,. . . what the fuck did he do this time?,1
3,@user do you get the feeling he is kissing @us...,1
4,@user nigga ware da hits at,0


# Apply BERT

In [4]:
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
def compute_metrics(pred):
    labels = pred.label_ids
    labels_tensor = torch.tensor(labels)
    preds_logits = torch.tensor(pred.predictions)
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    loss_fct = nn.CrossEntropyLoss()
    loss = loss_fct(preds_logits.view(-1,2), labels_tensor.view(-1)) # 2 class
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'lof1': 1 - loss + f1 # take both loss and F1 score into account
    }

In [5]:
X_train = data_train.tweet.tolist()
X_val = data_val.tweet.tolist()
X_test = data_test.tweet.tolist()

Y_train = data_train.label.tolist()
Y_val = data_val.label.tolist()
Y_test = data_test.label.tolist()

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
#tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

X_encoding_train = tokenizer(list(map(str, X_train)), truncation=True, padding=True)
X_encoding_val = tokenizer(list(map(str, X_val)), truncation=True, padding=True)
X_encoding_test = tokenizer(list(map(str, X_test)), truncation=True, padding=True)

train_ds = TweetDataset(X_encoding_train, Y_train)
val_ds = TweetDataset(X_encoding_val, Y_val)
test_ds = TweetDataset(X_encoding_test, Y_test)

In [22]:
OUTPUT_DIR = Path('results_B')
LOG_DIR = Path('logs_B')
EPOCHES = 10 # converge after 7-8 epochs
BATCH_SIZE = 32

training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    num_train_epochs=EPOCHES,              
    per_device_train_batch_size=BATCH_SIZE,  
    per_device_eval_batch_size=BATCH_SIZE*4,   
    warmup_steps=200,               
    weight_decay=0.01,          
    logging_dir=str(LOG_DIR),
    logging_steps=200,
    evaluation_strategy="epoch",
    seed=1509,
    learning_rate=8e-6,
    load_best_model_at_end=True,
    metric_for_best_model='eval_lof1',
    greater_is_better=True
)

In [23]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
#model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

In [24]:
trainer = Trainer(model=model, args=training_args, 
                  train_dataset=train_ds, eval_dataset=val_ds, 
                  compute_metrics = compute_metrics)
trainer.train()
# 1 or 2 first epoch will have a very bad result but it will improve later
# only save best model

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Lof1
1,No log,0.382505,0.880682,0.468278,0.440341,0.5,1.085773
2,0.489300,0.349925,0.880682,0.468278,0.440341,0.5,1.118353
3,0.489300,0.345133,0.880682,0.468278,0.440341,0.5,1.123146
4,0.329200,0.338468,0.879545,0.467956,0.440273,0.499355,1.129488
5,0.329200,0.335632,0.880682,0.519328,0.692972,0.5247,1.183696
6,0.299700,0.345958,0.878409,0.552469,0.67426,0.543994,1.206511
7,0.299700,0.361933,0.875,0.617567,0.677443,0.595576,1.255634
8,0.247400,0.372051,0.875,0.608611,0.67455,0.587343,1.236559
9,0.247400,0.391158,0.873864,0.602718,0.668269,0.582581,1.21156
10,0.201600,0.396695,0.873864,0.602718,0.668269,0.582581,1.206024


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1100, training_loss=0.30050459081476383, metrics={'train_runtime': 226.6638, 'train_samples_per_second': 4.853, 'total_flos': 0, 'epoch': 10.0})

In [25]:
pred_test = trainer.predict(test_ds)
prob_test = softmax(pred_test.predictions, axis=1)[:, 1]
pred_test.metrics

{'test_loss': 0.27218863368034363,
 'test_accuracy': 0.8916666666666667,
 'test_f1': 0.719626168224299,
 'test_precision': 0.7274418604651163,
 'test_recall': 0.7125717266562337,
 'test_lof1': 1.4474376440048218,
 'test_runtime': 0.2513,
 'test_samples_per_second': 955.144}

In [None]:
data = data.dropna(subset=['subtask_b'])