In [1]:
!pip install transformers
!pip install adabound

Collecting adabound
  Downloading adabound-0.0.5-py3-none-any.whl (5.1 kB)
Installing collected packages: adabound
Successfully installed adabound-0.0.5


In [2]:
# if running on kaggle notebook
!pip install torchsummary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


In [3]:
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset,sampler
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from adabound import AdaBound
import tensorflow as tf



In [4]:
# free colab's GPU
import gc

gc.collect()

torch.cuda.empty_cache()

# Hyperparametrs:

In [5]:
max_length = 128
num_classes =2
batch_size = 32
epochs = 30
patience = 3
lr = 4e-5
train_size = 0.8
model_version = 'vinai/bertweet-large'
output_vector = 768 if model_version in ['roberta-base', 'bert-base'] else (1024 if model_version in ['roberta-large','vinai/bertweet-large'] else 0) # for bert-base and roberta-base and 1024 for roberta-large


In [6]:
output_vector

1024

## Dataset Preparation

In [7]:
class HQP(Dataset):
    def __init__(self, tokenizer,max_length):
        super(HQP, self).__init__()
        self.train_csv=pd.read_csv('/kaggle/input/hqp-dataset/df_tweets_HiQualProp.csv')
        self.tokenizer=tokenizer
        self.target=self.train_csv.iloc[:,2]
        self.max_length=max_length

    def __len__(self):
        return len(self.train_csv)

    def __getitem__(self, index):

        text1 = self.train_csv.iloc[index,9] # normalized_text

        inputs = self.tokenizer.encode_plus(
            text1 ,
            None,
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,

            padding='max_length',
        )
        ids = inputs["input_ids"]
        #token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            #'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target': torch.tensor(self.target[index], dtype=torch.long)
            }
tokenizer = transformers.AutoTokenizer.from_pretrained(model_version)

dataset= HQP(tokenizer, max_length=max_length)
a = int(len(dataset)*train_size)
train_data, test_data = torch.utils.data.random_split(dataset, [a, len(dataset)-a])

train_dataloader=DataLoader(dataset=train_data,batch_size=batch_size)
test_dataloader=DataLoader(dataset=test_data,batch_size=batch_size)


Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
#for x in train_dataloader:
#    print(x)
#    break

# Pre-Trained Models

In [9]:
class ClassifierModel(nn.Module):
    def __init__(self):
        super(ClassifierModel, self).__init__()
        self.model = transformers.AutoModel.from_pretrained(model_version)
        self.out = nn.Linear(output_vector, 1)

    def forward(self,ids,mask):
        c,o2= self.model(ids,attention_mask=mask, return_dict=False)
        out= self.out(o2)

        return out

model=ClassifierModel()
model.to("cuda")



#weights = compute_class_weight(class_weight='balanced', classes=[0,1], y=dataset.target)
#weights = torch.Tensor(weights)
#weights = weights.to('cuda')
weights = torch.Tensor([6]).to('cuda')


loss_fn = nn.BCEWithLogitsLoss(pos_weight=weights)

#Initialize Optimizer
optimizer= optim.AdamW(model.parameters(),lr= lr)
#optimizer = AdaBound(model.parameters(), lr=1e-5, final_lr=0.001)


Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Identify the parameters of the first 16 layers to freez
for name, param in model.named_parameters():
    # Identify the layers you want to freeze (adjust the condition as needed)
    if "model.encoder.layer" in name and int(name.split(".")[3]) < 16:
        param.requires_grad = False


In [11]:
# propaganda size 
train_data.dataset.target.value_counts()[0]/train_data.dataset.target.value_counts()[1]

5.527569475077194

In [12]:
#def calculate_pos_weights(class_counts):
#  pos_weights = np.ones_like(class_counts)
#  neg_counts = [len(data)-pos_count for pos_count in class_counts]
#  for cdx, pos_count, neg_count in enumerate(zip(class_counts,  neg_counts)):
#    pos_weights[cdx] = neg_count / (pos_count + 1e-5)

#  return torch.as_tensor(pos_weights, dtype=torch.float)


In [13]:
def validate (model=model, test_dataloader = test_dataloader):
  # Set model to eval mode
  model.eval()
  accuracy=0
  f1 = 0
  labelss = torch.Tensor()
  predss = torch.Tensor()
  for i,batch in enumerate(test_dataloader):
    # Forward pass
    with torch.no_grad():
        ids = batch['ids'].to("cuda")
        mask = batch['mask'].to("cuda")
        #token_type = batch['token_type_ids'].to("cuda")
        labels = batch['target'].unsqueeze(1)
        labelss = torch.cat((labelss,labels), dim=0)

        outputs = model(ids,mask)
        siga = nn.Sigmoid()
        cc = siga(outputs)
        preds = np.where(cc.cpu().data.numpy() >= 0.5, 1, 0)
        predss =torch.cat((predss,torch.tensor(preds)), dim=0)

        num_correct = sum(1 for a, b in zip(preds, labels) if a[0] == b[0])
        num_samples = preds.shape[0]
        accuracy += num_correct/num_samples

        #print(f'Training accuracy in batch {i+1} {float(num_correct)/float(num_samples)*100:.2f}')
  model.train()
  accuracy = float(accuracy)/float(i+1)
  #print(f'Accuracy {float(accuracy)/float(i+1)*100:.2f}')
  f1 = f1_score(labelss.flatten().numpy(), predss.flatten().numpy())
  #print(f'F1-score {f1*100:.2f}')
  return accuracy,f1


In [14]:
def finetune(epochs, dataloader, model, loss_fn, optimizer, early_stopping_patience):
    model.train()
    best_test_acc = 0.0
    best_epoch = 0
    no_improvement_count = 0

    for epoch in range(epochs):
        preds = np.array([])
        labels = np.array([])
        loop = tqdm(enumerate(dataloader), leave=False, total=len(dataloader))
        for batch, dl in loop:
            ids = dl['ids'].to("cuda")
            #token_type_ids = dl['token_type_ids'].to("cuda")
            mask = dl['mask'].to("cuda")
            label = dl['target'].unsqueeze(1).to("cuda")


            optimizer.zero_grad()

            output = model(
                ids=ids,
                mask=mask,
                )
            label = label.type_as(output)
            loss = loss_fn(output, label)

            loss.backward()

            optimizer.step()

            # Move output to CPU and convert to NumPy
            output_cpu = output.cpu().detach().numpy()
            sig = nn.Sigmoid()
            x = sig(output)
            pred = np.where( x.cpu().detach().numpy()>= 0.4, 1, 0)
            preds = np.append(preds,pred)
            labels = np.append(labels,label.cpu())


        preds = preds.flatten()
        labels = labels.flatten()
        test_acc, test_f1 = validate(model=model,test_dataloader = test_dataloader)
        print(f'{epoch} epoch | Train Accuracy = {accuracy_score(labels,preds)} | Train F1_score = {f1_score(labels,preds)} | Test Accuracy = {test_acc} | Test F1_score ={test_f1} ')
        if test_f1 > best_test_acc:
            best_test_acc = test_f1
            best_epoch = epoch
            no_improvement_count = 0
        else:
            no_improvement_count += 1

        if no_improvement_count >= early_stopping_patience:
            print(f'No improvement for {early_stopping_patience} epochs. Early stopping.')
            break

    print(f'Best test accuracy of {best_test_acc} achieved at epoch {best_epoch}')

    return model


In [15]:
model=finetune(epochs, train_dataloader, model, loss_fn, optimizer,patience)

                                                 

0 epoch | Train Accuracy = 0.719378273356986 | Train F1_score = 0.48408137909613297 | Test Accuracy = 0.8692567567567567 | Test F1_score =0.6599297012302284 


                                                 

1 epoch | Train Accuracy = 0.8198175367460719 | Train F1_score = 0.6019779809665983 | Test Accuracy = 0.8842905405405406 | Test F1_score =0.6870717222476017 


                                                 

2 epoch | Train Accuracy = 0.8516218955904714 | Train F1_score = 0.6550132573897672 | Test Accuracy = 0.8483108108108108 | Test F1_score =0.6367313915857604 


                                                 

3 epoch | Train Accuracy = 0.8715999324210171 | Train F1_score = 0.6914332115306537 | Test Accuracy = 0.8498310810810811 | Test F1_score =0.6428284451586981 


                                                 

4 epoch | Train Accuracy = 0.8923804696739314 | Train F1_score = 0.7309966216216217 | Test Accuracy = 0.8940878378378379 | Test F1_score =0.7038261691072272 


                                                 

5 epoch | Train Accuracy = 0.9148927183645886 | Train F1_score = 0.7763844190433915 | Test Accuracy = 0.8753378378378378 | Test F1_score =0.6731620903454385 


                                                 

6 epoch | Train Accuracy = 0.9330968068930563 | Train F1_score = 0.8161559888579387 | Test Accuracy = 0.8729729729729729 | Test F1_score =0.6669619131975199 


                                                 

7 epoch | Train Accuracy = 0.9378695725629329 | Train F1_score = 0.8274486803519061 | Test Accuracy = 0.8619932432432432 | Test F1_score =0.6536668079694785 
No improvement for 3 epochs. Early stopping.
Best test accuracy of 0.7038261691072272 achieved at epoch 4
