In [2]:
!nvidia-smi

Thu Mar 21 17:29:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0              27W / 250W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
!pip install --quiet transformers
!pip install --quiet pytorch-lightning
!pip install --quiet peft
!pip install --quiet sentencepiece
!pip install --quiet datasets
!pip install --quiet accelerate
!pip install --quiet bitsandbytes
!pip install --quiet openpyxl
!pip install --quiet evaluate


In [3]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

In [4]:
train_path = '/kaggle/input/bias-of-us-news-media-houses/Train.xlsx'

In [5]:
df = pd.read_excel(train_path,engine='openpyxl')

In [6]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,topic,source,bias,url,title,date,authors,content,content_original,source_url,bias_text,ID
0,0,immigration,National Review,2,https://www.nationalreview.com/2018/12/governm...,"Shutdown Theater, Again",2018-12-12,"Kevin D. Williamson, Kyle Smith, Andrew C. Mcc...",President Trump and Senate Minority Leader Chu...,President Trump and Senate Minority Leader Chu...,www.nationalreview.com,right,zl7kc7EmAyIdUMIo
1,1,culture,Yahoo! The 360,1,https://news.yahoo.com/can-the-developing-worl...,Can the developing world endure the coronavirus?,2020-06-30,Mike Bebernes,“ The 360 ” shows you diverse perspectives on ...,“The 360” shows you diverse perspectives on th...,www.news.yahoo.com,center,xpbjYTJYPdlw6HmJ
2,2,elections,Politico,0,http://www.politico.com/story/2016/07/bernie-s...,Sanders’ California supporters can’t quite say...,2016-07-02,"Daniel Strauss, Henry C. Jackson, Nick Gass",LOS ANGELES — Actress Rosario Dawson took the ...,LOS ANGELES — Actress Rosario Dawson took the ...,www.politico.com,left,k4SGI3GXarnz5dJl


In [7]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

2024-03-21 17:30:07.400792: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-21 17:30:07.400882: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-21 17:30:07.532645: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
MODEL_NAME = "distilbert-base-uncased"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
bias_label_dict = {'left': 0, 'center': 1, 'right': 2}
df['label'] = df['bias_text'].map(bias_label_dict)

In [11]:
df["label"].astype(int)

0        2
1        1
2        0
3        1
4        0
        ..
26585    0
26586    1
26587    1
26588    1
26589    0
Name: label, Length: 26590, dtype: int64

In [12]:
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

In [20]:
class BiasDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.data = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        content = str(self.data.iloc[idx]['content'])
        title = str(self.data.iloc[idx]['title'])
        
        
        bias_label = self.data.iloc[idx]['label']

        # Tokenize input text
        inputs = self.tokenizer(title,
                                content,
                                max_length = 200,
                                padding = "max_length",
                                truncation = "only_second",
                                return_attention_mask = True,
                                add_special_tokens = True,
                                return_tensors='pt')


        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': bias_label
        }

In [21]:
class NQADataModule(pl.LightningDataModule):
  def __init__(self,train_df,test_df,tokenizer,batch_size):
    super().__init__()
    
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer

  def setup(self,stage=None):
    self.train_dataset = BiasDataset(self.train_df,self.tokenizer)
    self.test_dataset = BiasDataset(self.test_df,self.tokenizer)

  def train_dataloader(self):
    return DataLoader(self.train_dataset,batch_size = self.batch_size,shuffle=True,drop_last=True,num_workers=4)

  def val_dataloader(self):
    return DataLoader(self.test_dataset,batch_size = self.batch_size,drop_last=True,num_workers=4)

In [22]:
BS = 64

In [23]:
data_module = NQADataModule(X_train,X_test,tokenizer=tokenizer,batch_size=BS)
data_module.setup()

In [24]:
# Accessing a sample
for idx,data in enumerate(data_module.val_dataloader()):
    print(data['input_ids'].shape)
    print(data['labels'].shape)
    break

torch.Size([64, 200])
torch.Size([64])


In [25]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [26]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [27]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [28]:
id2label = {0: "left", 1: "center", 2: "right"}
label2id = {"left": 0, "center": 1, "right" : 2}

In [29]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                           num_labels=3,
                                                           id2label=id2label,
                                                           label2id=label2id,
                                                           )

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
import torch

In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [32]:
import transformers

In [34]:
training_args = transformers.TrainingArguments(
    output_dir="BERT-BPM",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [37]:
trainer = transformers.Trainer(
    model=model.to(device),
    train_dataset = data_module.train_dataset,
    eval_dataset = data_module.test_dataset,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [38]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.549255,0.775668
2,0.650900,0.501686,0.795036
3,0.650900,0.473971,0.815532
4,0.389500,0.501152,0.818729
5,0.233500,0.602509,0.811019
6,0.233500,0.639201,0.823806
7,0.145500,0.698815,0.819669
8,0.083400,0.826407,0.802181
9,0.083400,0.935478,0.812335
10,0.060100,0.924067,0.82437


TrainOutput(global_step=6660, training_loss=0.1269321791313074, metrics={'train_runtime': 7502.0696, 'train_samples_per_second': 56.71, 'train_steps_per_second': 0.888, 'total_flos': 2.2014818410176e+16, 'train_loss': 0.1269321791313074, 'epoch': 20.0})

In [60]:
def predict_bias(idx):
  print("\n\nACTUAL : ",val_df.iloc[idx]['bias_text'])

  inputs = tokenizer(val_df.iloc[idx]['content'],truncation=True,return_tensors="pt").to(device)

  model.eval()

  with torch.no_grad():
    logits = model(**inputs).logits

  predicted_class_id = logits.argmax().item()
  print("\n\n\nPREDICTED BIAS : ",model.config.id2label[predicted_class_id])


In [62]:
predict_bias(3)





ACTUAL :  left







PREDICTED BIAS :  left
