<a href="https://colab.research.google.com/github/apoorva1999/TweetPrediciton/blob/main/FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project folder used below lives under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Using TweetPrediction Repo in Drive

In [None]:
import os
# Work from the repository folder on Drive so that relative references in later cells
# (requirements.txt, the CSV file, the `src` package) resolve against it.
os.chdir('/content/drive/MyDrive/project/TweetPrediciton')

Installing requirements

In [None]:
pip install -r requirements.txt



Importing required Libraries

In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from src.prompt import PROMPT_TEMPLATE

# 1. Load and preprocess data

In [None]:
# Load the labelled tweets; the path is relative to the current working directory.
df = pd.read_csv("Q2_20230202_majority.csv")

In [None]:
# Number of rows in the dataset. The discarded `df.head()` call was dropped: only the
# last expression of a cell is displayed, and the preview has its own cell below.
len(df)

5751


In [None]:
# Preview the first five rows to check the column layout.
df.head(5)

Unnamed: 0,tweet_id,created_at,tweet,label_true,month,label_pred
0,1.42191e+18,2021-08-01 19:02:51+00:00,"if you in spanish town, st jago park isn't ful...",in-favor,21-Aug,
1,1.42173e+18,2021-08-01 07:26:37+00:00,breaking report: cdc used rejected study from ...,against,21-Aug,
2,1.42189e+18,2021-08-01 17:42:05+00:00,covid clusters among the vaccinated are killin...,against,21-Aug,
3,1.42198e+18,2021-08-01 23:35:28+00:00,so they had an on-site vaccination at work and...,in-favor,21-Aug,
4,1.4218e+18,2021-08-01 11:39:57+00:00,what a dumb bunny. it's a leaky vaccine and y...,against,21-Aug,


In [None]:
# Relative frequency of each stance label, listed alphabetically by label.
label_share = df["label_true"].value_counts(normalize=True)
label_share.sort_index()

Unnamed: 0_level_0,proportion
label_true,Unnamed: 1_level_1
against,0.313685
in-favor,0.505477
neutral-or-unclear,0.180838


## Tokenize dataset 👩🏻‍🦰

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
# Checkpoint identifier shared by the tokenizer and the model loaded below.
model_name = "google/flan-t5-large"

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = T5Tokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Sanity check: tokenize the first tweet and inspect the ids and attention mask.
first_tweet = df.iloc[0]["tweet"]
tokenizer(first_tweet)

{'input_ids': [3, 99, 25, 16, 8438, 1273, 1511, 6, 3, 7, 17, 2662, 839, 2447, 19, 29, 31, 17, 423, 11, 378, 33, 15401, 1224, 5, 369, 129, 39, 12956, 1713, 8938, 75, 8660, 1191, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Compute class weights to counter the class imbalance

In [None]:
# Weight of a class = 1 - (its share of all rows), so rarer classes weigh more.
# Sorting by label puts the weights in alphabetical label order
# (against, in-favor, neutral-or-unclear).
label_counts = df["label_true"].value_counts().sort_index()
label_fraction = label_counts / len(df)
class_weights = (1 - label_fraction).to_numpy()
class_weights

array([0.68631542, 0.49452269, 0.81916188])

### Custom Trainer 😎

In [None]:
from transformers import Trainer
from torch import nn
import torch

class WeightedLossTrainer(Trainer):
  """Trainer that replaces the model's own loss with a class-weighted cross-entropy.

  The per-class weights are read from the notebook-level ``class_weights``
  (one entry per class, indexed by class id).
  """

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Return the class-weighted cross-entropy loss for one batch.

    Args:
      model: model being trained; it is called as ``model(**inputs)``.
      inputs: batch dict; must contain ``labels`` holding integer class ids.
      return_outputs: when true, return ``(loss, outputs)`` instead of only the loss.
      num_items_in_batch: accepted for compatibility with the ``Trainer`` signature; unused.

    Returns:
      The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.

    Raises:
      ValueError: if the batch has no labels, or the model does not return
        ``(batch, num_classes)`` logits.
    """
    labels = inputs.get("labels")
    if labels is None:
      raise ValueError("WeightedLossTrainer needs a 'labels' entry in every batch.")

    outputs = model(**inputs)
    logits = outputs.get("logits")
    if logits is None or logits.dim() != 2:
      shape = None if logits is None else tuple(logits.shape)
      raise ValueError(
        f"Expected (batch, num_classes) logits for class-weighted loss, got shape {shape}."
      )

    # nn.CrossEntropyLoss only accepts a tensor as weight, and that tensor has to live
    # on the same device as the logits; class_weights is a float64 NumPy array.
    weight = torch.as_tensor(class_weights, dtype=torch.float32, device=logits.device)
    loss_func = nn.CrossEntropyLoss(weight=weight)
    # Evaluate the loss in float32 so it matches the weight dtype even when the
    # forward pass produced half-precision logits.
    loss = loss_func(logits.float(), labels.view(-1))
    return (loss, outputs) if return_outputs else loss

In [None]:
from sklearn.metrics import f1_score

In [None]:
def compute_metrics(pred):
  """Compute the weighted F1 score for one evaluation pass.

  Args:
    pred: ``EvalPrediction``-like object exposing ``predictions`` (class logits, or a
      tuple whose first element is the class logits) and ``label_ids`` (true class ids).

  Returns:
    dict with the single key ``"f1"``.
  """
  # EvalPrediction stores the ground truth under `label_ids`; `label_true` is only the
  # name of the dataframe column and does not exist on this object.
  labels = pred.label_ids
  logits = pred.predictions
  # Models that return extra arrays next to the logits make the Trainer hand over a
  # tuple; the logits come first.
  if isinstance(logits, tuple):
    logits = logits[0]
  preds = logits.argmax(-1)
  f1 = f1_score(labels, preds, average="weighted")
  return {"f1": f1}

## Split the dataset into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the label so the training and validation splits keep the same
# (imbalanced) class proportions; random_state fixes the shuffle for reproducibility.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label_true"],
)

In [None]:
# Row counts of the (train, validation) splits.
tuple(len(split) for split in (train_df, val_df))

(4600, 1151)

In [None]:
from transformers import TrainingArguments

# Values referenced by the training configuration below.
output_dir = "finetuning-results"
batch_size = 8
logging_steps = 10

training_args = TrainingArguments(
    # Output location; nothing is pushed to the Hub.
    output_dir=output_dir,
    push_to_hub=False,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    fp16=True,
    # Same batch size for training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Log every `logging_steps` steps and evaluate once per epoch.
    logging_steps=logging_steps,
    eval_strategy="epoch",
)

In [None]:
# The dataset yields one integer class id per tweet (three stance classes) and the loss /
# metric code works on per-class logits, so the checkpoint is loaded with a
# sequence-classification head. T5ForConditionalGeneration expects token-id label
# sequences and fails on scalar class ids ("not enough values to unpack").
# The classification head is newly initialised and is learned during fine-tuning.
from transformers import T5ForSequenceClassification

model = T5ForSequenceClassification.from_pretrained(model_name, num_labels=3)

In [None]:
 # Prefer the GPU when one is available; otherwise fall back to the CPU.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 device


device(type='cuda')

In [None]:
# Move the model to the device selected above.
model = model.to(device)

In [None]:
from torch.utils.data import Dataset
import torch

class TweetDataset(Dataset):
    """Map-style dataset that turns labelled tweets into tokenized model inputs.

    Args:
        dataframe: frame with a ``tweet`` column (text) and a ``label_true`` column
            holding one of ``"against"``, ``"in-favor"``, ``"neutral-or-unclear"``.
        tokenizer: callable that returns a mapping with ``input_ids`` and
            ``attention_mask`` for a single text.
        prompt_template: optional format string with a ``{tweet}`` placeholder; when
            given, each tweet is inserted into it before tokenization.
        max_length: optional length every example is truncated and padded to. With the
            default ``None`` no ``max_length`` is passed and the tokenizer's own
            default applies, exactly as before this parameter existed.
    """

    def __init__(self, dataframe, tokenizer, prompt_template=None, max_length=None):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.prompt_template = prompt_template
        self.max_length = max_length
        # Class ids follow the alphabetical order of the label strings.
        self.labels = {"against": 0, "in-favor": 1, "neutral-or-unclear": 2}

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for row ``idx``.

        ``idx`` is positional (``iloc``), so it is independent of the dataframe index.

        Raises:
            KeyError: if the row's label is not one of the three known label strings.
        """
        row = self.dataframe.iloc[idx]
        text = row["tweet"]
        if self.prompt_template:
            text = self.prompt_template.format(tweet=text)

        # Fixed-length padding keeps every example the same size, so the default
        # collation can stack them without a padding collator.
        tokenizer_kwargs = {"truncation": True, "padding": "max_length"}
        if self.max_length is not None:
            tokenizer_kwargs["max_length"] = self.max_length
        encoding = self.tokenizer(text, **tokenizer_kwargs)

        return {
            "input_ids": torch.tensor(encoding["input_ids"]),
            "attention_mask": torch.tensor(encoding["attention_mask"]),
            # Scalar class id; batching turns it into a (batch,) tensor.
            "labels": torch.tensor(self.labels[row["label_true"]]),
        }

In [None]:
# Wrap both splits identically: same tokenizer and same prompt template.
train_dataset, val_dataset = (
    TweetDataset(split_df, tokenizer, prompt_template=PROMPT_TEMPLATE)
    for split_df in (train_df, val_df)
)

In [None]:
# The tokenizer is passed as `processing_class`; the `tokenizer` keyword of Trainer is
# deprecated (presumably the source of the warning this cell used to emit).
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

  trainer = WeightedLossTrainer(


In [None]:
# Run fine-tuning with the arguments configured above; evaluation on val_dataset is
# scheduled once per epoch (eval_strategy="epoch").
trainer.train()

✅ Updated compute_loss called!


ValueError: not enough values to unpack (expected 2, got 1)