<a href="https://colab.research.google.com/github/ambideXtrous9/Finetune-LLMs-using-LoRA-in-Colab-on-Custom-Datasets/blob/main/Finetune_LLM_on_Custom_Dataset_using_LoRA_in_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Fri Dec 15 14:24:45 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0              25W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!pip install --quiet transformers
!pip install --quiet pytorch-lightning
!pip install --quiet peft
!pip install --quiet sentencepiece
!pip install --quiet datasets
!pip install --quiet accelerate
!pip install --quiet bitsandbytes
!pip install --quiet openpyxl
!pip install --quiet evaluate


In [3]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

In [4]:
pl.seed_everything (42)

INFO:lightning_fabric.utilities.seed:Seed set to 42


42

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
train_path = '/content/drive/MyDrive/Article Bias Prediction/Train.xlsx'

In [7]:
df = pd.read_excel(train_path,engine='openpyxl')

In [8]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,topic,source,bias,url,title,date,authors,content,content_original,source_url,bias_text,ID
0,0,immigration,National Review,2,https://www.nationalreview.com/2018/12/governm...,"Shutdown Theater, Again",2018-12-12,"Kevin D. Williamson, Kyle Smith, Andrew C. Mcc...",President Trump and Senate Minority Leader Chu...,President Trump and Senate Minority Leader Chu...,www.nationalreview.com,right,zl7kc7EmAyIdUMIo
1,1,culture,Yahoo! The 360,1,https://news.yahoo.com/can-the-developing-worl...,Can the developing world endure the coronavirus?,2020-06-30,Mike Bebernes,“ The 360 ” shows you diverse perspectives on ...,“The 360” shows you diverse perspectives on th...,www.news.yahoo.com,center,xpbjYTJYPdlw6HmJ
2,2,elections,Politico,0,http://www.politico.com/story/2016/07/bernie-s...,Sanders’ California supporters can’t quite say...,2016-07-02,"Daniel Strauss, Henry C. Jackson, Nick Gass",LOS ANGELES — Actress Rosario Dawson took the ...,LOS ANGELES — Actress Rosario Dawson took the ...,www.politico.com,left,k4SGI3GXarnz5dJl


In [9]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

In [10]:
MODEL_NAME = "distilbert-base-uncased"

In [11]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [12]:
bias_label_dict = {'left': 0, 'center': 1, 'right': 2}
df['label'] = df['bias_text'].map(bias_label_dict)

In [13]:
df["label"].astype(int)

0        2
1        1
2        0
3        1
4        0
        ..
26585    0
26586    1
26587    1
26588    1
26589    0
Name: label, Length: 26590, dtype: int64

In [14]:
train_df, val_df = train_test_split(df,test_size=0.1)

In [15]:
train_dataset = [
    {"label": label, "text": text}
    for label, text in zip(train_df['label'], df['content'])
]

In [16]:
val_dataset = [
    {"label": label, "text": text}
    for label, text in zip(val_df['label'], df['content'])
]

In [17]:
from datasets import Dataset
import torch

In [18]:
train_dataset = Dataset.from_dict({"label": [sample["label"] for sample in train_dataset],
                                   "text": [sample["text"] for sample in train_dataset]})


In [19]:
train_dataset

Dataset({
    features: ['label', 'text'],
    num_rows: 23931
})

In [20]:
val_dataset = Dataset.from_dict({"label": [sample["label"] for sample in val_dataset],
                                   "text": [sample["text"] for sample in val_dataset]})


In [21]:
val_dataset

Dataset({
    features: ['label', 'text'],
    num_rows: 2659
})

In [22]:
val_dataset[0]

{'label': 0,
 'text': 'President Trump and Senate Minority Leader Chuck Schumer talk in the Oval Office , December 11 , 2018 . ( Kevin Lamarque/Reuters )\nThe promise of this kind of spectacle is about one half of why Donald Trump was elected .\nNew York City ’ s annual Shakespeare in the Park is the worst kind of theater . Washington ’ s annual government-shutdown drama is the second-worst kind .\nI wrote off the alfresco performances in Central Park after the Public Theater decided that what Shakespeare ’ s A Winter ’ s Tale really needed was a political speech by Senator Schumer , who wandered onto the stage at one point — after Bill de Blasio ’ s campaign-rally speech but before the Muppets , if I recall that slightly surreal evening correctly — to make a few of his habitually banal political observations before shouting “ Vote Democratic ! ” and wandering off .\nImprovisation is not the senator ’ s forte . When President Donald Trump surprised Senator Schumer and Representative Na

In [26]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [27]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/23931 [00:00<?, ? examples/s]

In [28]:
tokenized_val = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2659 [00:00<?, ? examples/s]

In [29]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [30]:
import evaluate

accuracy = evaluate.load("accuracy")

In [31]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [32]:
id2label = {0: "left", 1: "center", 2: "right"}
label2id = {"left": 0, "center": 1, "right" : 2}

In [68]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                           num_labels=3,
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
#torch.set_default_dtype(torch.float32)

In [70]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [71]:
import transformers

In [72]:
training_args = transformers.TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [73]:
trainer = transformers.Trainer(
    model=model,
    train_dataset = tokenized_train,
    eval_dataset = tokenized_val,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [74]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.0917,1.092956,0.382475


KeyboardInterrupt: ignored

In [75]:
text = "President Trump and Senate Minority Leader Chuck Schumer talk in the Oval Office , December 11 , 2018 . ( Kevin Lamarque/Reuters )\nThe promise of this kind of spectacle is about one half of why Donald Trump was elected .\nNew York City ’ s annual Shakespeare in the Park is the worst kind of theater . Washington ’ s annual government-shutdown drama is the second-worst kind .\nI wrote off the alfresco performances in Central Park after the Public Theater decided that what Shakespeare ’ s A Winter ’ s Tale really needed was a political speech by Senator Schumer , who wandered onto the stage at one point — after Bill de Blasio ’ s campaign-rally speech but before the Muppets , if I recall that slightly surreal evening correctly — to make a few of his habitually banal political observations before shouting “ Vote Democratic ! ” and wandering off .\nImprovisation is not the senator ’ s forte . When President Donald Trump surprised Senator Schumer and Representative Nancy Pelosi — the other half of the Democrats ’ noisome slapstick-comedy duo — by broadcasting their acrimonious Oval Office meeting , the anguine gentleman from New York was caught off-guard .\nSenator Schumer and Representative Pelosi invoked the word “ shutdown ” as though it were a magical incantation ."

In [78]:
def predict_bias(text):

  inputs = tokenizer(text, return_tensors="pt").to(device)

  model.eval()

  with torch.no_grad():
    logits = model(**inputs).logits

  predicted_class_id = logits.argmax().item()
  print("BIAS : ",model.config.id2label[predicted_class_id])


In [79]:
predict_bias(text)

BIAS :  right
