In [1]:
!nvidia-smi

Fri Dec 15 19:04:52 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install --quiet transformers
!pip install --quiet pytorch-lightning
!pip install --quiet peft
!pip install --quiet sentencepiece
!pip install --quiet datasets
!pip install --quiet accelerate
!pip install --quiet bitsandbytes
!pip install --quiet openpyxl
!pip install --quiet evaluate


In [3]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap



In [4]:
train_path = '/kaggle/input/bias-of-us-news-media-houses/Train.xlsx'

In [5]:
df = pd.read_excel(train_path,engine='openpyxl')

In [6]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,topic,source,bias,url,title,date,authors,content,content_original,source_url,bias_text,ID
0,0,immigration,National Review,2,https://www.nationalreview.com/2018/12/governm...,"Shutdown Theater, Again",2018-12-12,"Kevin D. Williamson, Kyle Smith, Andrew C. Mcc...",President Trump and Senate Minority Leader Chu...,President Trump and Senate Minority Leader Chu...,www.nationalreview.com,right,zl7kc7EmAyIdUMIo
1,1,culture,Yahoo! The 360,1,https://news.yahoo.com/can-the-developing-worl...,Can the developing world endure the coronavirus?,2020-06-30,Mike Bebernes,“ The 360 ” shows you diverse perspectives on ...,“The 360” shows you diverse perspectives on th...,www.news.yahoo.com,center,xpbjYTJYPdlw6HmJ
2,2,elections,Politico,0,http://www.politico.com/story/2016/07/bernie-s...,Sanders’ California supporters can’t quite say...,2016-07-02,"Daniel Strauss, Henry C. Jackson, Nick Gass",LOS ANGELES — Actress Rosario Dawson took the ...,LOS ANGELES — Actress Rosario Dawson took the ...,www.politico.com,left,k4SGI3GXarnz5dJl


In [7]:
df = df.iloc[:5000]

In [8]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

In [9]:
MODEL_NAME = "distilbert-base-uncased"

In [10]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
bias_label_dict = {'left': 0, 'center': 1, 'right': 2}
df['label'] = df['bias_text'].map(bias_label_dict)

In [12]:
df["label"].astype(int)

0       2
1       1
2       0
3       1
4       0
       ..
4995    2
4996    0
4997    1
4998    0
4999    1
Name: label, Length: 5000, dtype: int64

In [13]:
train_df, val_df = train_test_split(df,test_size=0.1)

In [14]:
class BiasDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.data = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        content = str(self.data.iloc[idx]['content'])
        bias_label = self.data.iloc[idx]['label']

        # Tokenize input text
        inputs = self.tokenizer(
            content,
            truncation=True,
            return_tensors='pt')


        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'label': bias_label
        }

In [15]:
train_dataset = BiasDataset(train_df, tokenizer)
val_dataset = BiasDataset(val_df, tokenizer)


In [16]:
# Accessing a sample
sample = val_dataset[0]
print(sample)

{'input_ids': tensor([  101,  2034,  2057,  2020,  2409,  2008,  1037,  2845, 23307,  2018,
         3714,  2046,  1996,  3537,  2120,  2837,  1521,  1055,  7588,  1998,
         5407,  2907,  1997,  2049,  6728,  6873,  5371,  2006,  6221,  8398,
         1012,  2085,  2619,  2040,  3632,  2011,  1005, 19739, 14693,  7512,
         1016,  1012,  1014,  1010,  1005,  1037,  7293,  2000,  1996, 22801,
         7056, 23307,  1010,  2003,  6815,  4923,  2005,  5128,  1996,  8398,
         5371,  2041,  2045,  1012, 11721, 26291,  2121,  1998,  1996,  9422,
         3282,  2119,  2405,  1996,  3189,  7483,  1012,  2021,  2054,  1045,
         2424,  2061, 19142,  2003,  2008,  1996,  4559,  2470,  5371,  2003,
         6684,  3561,  2007,  3595,  4933,  1010,  1996,  4031,  1997,  2797,
         2159, 10443,  2039,  6900,  2030,  5086,  1011,  3282, 22889, 13765,
        26830, 18499,  2075,  2058,  5491,  1012,  2023,  4011,  8813, 19817,
        21818,  1010,  7864,  1999,  2285,  1010, 

In [17]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [19]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [20]:
id2label = {0: "left", 1: "center", 2: "right"}
label2id = {"left": 0, "center": 1, "right" : 2}

In [21]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                           num_labels=3,
                                                           id2label=id2label,
                                                           label2id=label2id,
                                                           )

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
import torch

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [25]:
import transformers

In [55]:
training_args = transformers.TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [56]:
trainer = transformers.Trainer(
    model=model.to(device),
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [57]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.731008,0.678
2,No log,0.652542,0.728


Checkpoint destination directory my_awesome_model/checkpoint-141 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_awesome_model/checkpoint-282 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=282, training_loss=0.7790132346728169, metrics={'train_runtime': 293.2209, 'train_samples_per_second': 30.694, 'train_steps_per_second': 0.962, 'total_flos': 1192227849216000.0, 'train_loss': 0.7790132346728169, 'epoch': 2.0})

In [58]:
trainer.state.log_history

[{'eval_loss': 0.7310076951980591,
  'eval_accuracy': 0.678,
  'eval_runtime': 6.7998,
  'eval_samples_per_second': 73.532,
  'eval_steps_per_second': 2.353,
  'epoch': 1.0,
  'step': 141},
 {'eval_loss': 0.6525417566299438,
  'eval_accuracy': 0.728,
  'eval_runtime': 6.8082,
  'eval_samples_per_second': 73.441,
  'eval_steps_per_second': 2.35,
  'epoch': 2.0,
  'step': 282},
 {'train_runtime': 293.2209,
  'train_samples_per_second': 30.694,
  'train_steps_per_second': 0.962,
  'total_flos': 1192227849216000.0,
  'train_loss': 0.7790132346728169,
  'epoch': 2.0,
  'step': 282}]

In [60]:
def predict_bias(idx):
  print("\n\nACTUAL : ",val_df.iloc[idx]['bias_text'])

  inputs = tokenizer(val_df.iloc[idx]['content'],truncation=True,return_tensors="pt").to(device)

  model.eval()

  with torch.no_grad():
    logits = model(**inputs).logits

  predicted_class_id = logits.argmax().item()
  print("\n\n\nPREDICTED BIAS : ",model.config.id2label[predicted_class_id])


In [62]:
predict_bias(3)



ACTUAL :  left



PREDICTED BIAS :  left
