<a href="https://colab.research.google.com/github/apoorva1999/TweetPrediciton/blob/main/FineTuningFlanT5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Tweet Classification 🐤


In [1]:
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir('/content/drive/MyDrive/project/TweetPrediciton')

In [3]:
!pip install -r requirements.txt

Collecting transformers==4.53.3 (from -r requirements.txt (line 1))
  Downloading transformers-4.53.3-py3-none-any.whl.metadata (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->-r requirements.txt (line 2))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->-r requirements.txt (line 2))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->-r requirements.txt (line 2))
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->-r requirements.txt

In [4]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from src.prompt import PROMPT_TEMPLATE
from tqdm import tqdm

# 1. Load and preprocess data 📊


In [5]:
df = pd.read_csv("Q2_20230202_majority.csv")
df = df.dropna(subset=["tweet", "label_true"])

In [6]:
class TweetDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=256):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tweet = self.data.iloc[idx]["tweet"]
        label = self.data.iloc[idx]["label_true"]
        input_text = PROMPT_TEMPLATE.format(tweet=tweet)
        target_text = label
        input_enc = self.tokenizer(
            input_text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt"
        )
        target_enc = self.tokenizer(
            target_text, truncation=True, padding="max_length", max_length=10, return_tensors="pt"
        )
        return {
            "input_ids": input_enc.input_ids.squeeze(),
            "attention_mask": input_enc.attention_mask.squeeze(),
            "labels": target_enc.input_ids.squeeze(),
        }

In [9]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
# Create datasets
train_dataset = TweetDataset(train_df, tokenizer)
val_dataset = TweetDataset(val_df, tokenizer)

## Load **flan-t5-large** Model 🤖


In [7]:
model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

## Custom Metrics for F1-score


In [11]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
  # f1 = f1_score(labels, preds, average='weighted')
  # return {"f1": f1}

    predictions, labels = eval_pred
    # print("labels : \n")
    # print(labels)
    # print("preds: \n")
    # print(predictions)
    # If labels or predictions are list of lists, flatten them at one level

    if isinstance(predictions, tuple):
        predictions = predictions[0]
    if len(predictions.shape) == 3:
        # Convert logits to token IDs by taking argmax along vocabulary dimension
        predictions = np.argmax(predictions, axis=-1)
    # Decode predictions
    pred_str = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Some models use -100 to mask label tokens; replace with pad_token_id before decoding
    labels = np.where(np.array(labels) != -100, labels, tokenizer.pad_token_id)

    # Decode labels
    label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace
    pred_str = [p.strip() for p in pred_str]
    label_str = [l.strip() for l in label_str]

    print("Predictions:", pred_str)
    print("Labels:", label_str)
    # Compute F1
    f1 = {"f1": f1_score(label_str, pred_str, average="weighted")}
    print("F1 Score:", f1)
    return f1

# Training

## Training Hyperparameters ⚙️

In [21]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=10,
    push_to_hub=False,
    report_to="none",
    eval_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=8,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,      # use your actual train_dataset
    eval_dataset=val_dataset, # use your actual val_dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


## Start training

In [22]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

### Saving model for evaluation

In [None]:
trainer.save_model("./results/version_1/")

In [None]:
fine_tuned_model_1 = T5ForConditionalGeneration.from_pretrained("./results/version_1")

In [None]:
fine_tuned_model_1.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

### Function for generating Model Predictions and Calculating F1 Score 📈

In [37]:
def evaluate_with_generate(model, dataset, tokenizer):
    model.eval()
    predictions = []
    references = []

    # Use tqdm for progress bar
    for i in tqdm(range(len(dataset)), desc="Generating predictions"):
        # Get input from dataset
        inputs = dataset[i]
        input_ids = inputs["input_ids"].unsqueeze(0).to(device)
        attention_mask = inputs["attention_mask"].unsqueeze(0).to(device)

        # Generate prediction
        with torch.no_grad():
            output = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=10
            )

        # Decode prediction and reference
        pred_text = tokenizer.decode(output[0], skip_special_tokens=True).strip()
        label_ids = inputs["labels"]
        # Replace -100 with pad token id
        label_ids = torch.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
        ref_text = tokenizer.decode(label_ids, skip_special_tokens=True).strip()

        predictions.append(pred_text)
        references.append(ref_text)

    # Calculate F1 score
    f1 = f1_score(references, predictions, average="weighted")

    print("Sample predictions:")
    for i in range(min(5, len(predictions))):
        print(f"Pred: {predictions[i]} | Ref: {references[i]}")

    print(f"F1 Score: {f1:.4f}")
    return f1, predictions, references

#### Calculating f1_score using the fine-tuned model

In [None]:
f1_new, preds_new, refs_new = evaluate_with_generate(fine_tuned_model_1, val_dataset, tokenizer)

Generating predictions: 100%|██████████| 1151/1151 [04:05<00:00,  4.70it/s]

Sample predictions:
Pred: in-favor | Ref: in-favor
Pred: in-favor | Ref: against
Pred: in-favor | Ref: in-favor
Pred: neutral-or-unclear | Ref: in-favor
Pred: in-favor | Ref: in-favor
F1 Score: 0.7085





### Tweaking Hyperparameters ⚙️

In [32]:
from transformers import EarlyStoppingCallback
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./version_2/",
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=10,
    push_to_hub=False,
    report_to="none",
    eval_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=8,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    greater_is_better=True,
    label_smoothing_factor=0.1

)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  trainer = Seq2SeqTrainer(


In [33]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,1.7765,1.753529,0.727942
2,1.6058,1.579291,0.7611
3,1.55,1.530837,0.82232
4,1.534,1.512304,0.838774
5,1.4446,1.505037,0.8451


Predictions: ['in-favor', 'in', 'in-favor', 'against-favor', 'in-favor', 'in-favor', 'in-or-unclear', 'in', 'in-favor', 'in-favor', 'in-favor', 'against', 'against', 'in-favor', 'against', 'in-favor', 'in-favor', 'in-favor', 'in-favor', 'against-favor', 'against-or-unclear', 'in-favor', 'against-favor', 'in-favor', 'in-favor', 'in-or-unclear', 'in-favor', 'against', 'against', 'against', 'in-favor', 'against-favor', 'in-favor', 'in-favor', 'against-favor', 'in-favor', 'against-favor', 'against', 'in-favor', 'in-favor', 'in-favor', 'in-favor', 'against', 'in-or-unclear', 'in', 'against-or-unclear', 'in-or-unclear', 'against-or-unclear', 'in', 'in-or-unclear', 'against-favor', 'in-favor', 'in-favor', 'in', 'in-favor', 'in-favor', 'in-favor', 'in-favor', 'in-favor', 'against-or-unclear', 'against', 'in', 'against', 'against-or-unclear', 'against', 'in-favor', 'in-favor', 'in-favor', 'against', 'against', 'in-favor', 'in', 'against-favor', 'against', 'against-or-unclear', 'in', 'against', 

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=1440, training_loss=1.8241925597190858, metrics={'train_runtime': 4408.7106, 'train_samples_per_second': 5.217, 'train_steps_per_second': 0.327, 'total_flos': 2.6504831434752e+16, 'train_loss': 1.8241925597190858, 'epoch': 5.0})

In [34]:
trainer.save_model()

In [35]:
fine_tuned_model_2 = T5ForConditionalGeneration.from_pretrained("./version_2")

In [39]:
fine_tuned_model_2.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [40]:
f1, preds, refs = evaluate_with_generate(fine_tuned_model_2, val_dataset, tokenizer)

Generating predictions: 100%|██████████| 1151/1151 [04:17<00:00,  4.47it/s]

Sample predictions:
Pred: in-favor | Ref: in-favor
Pred: in-favor | Ref: against
Pred: in-favor | Ref: in-favor
Pred: neutral-or-unclear | Ref: in-favor
Pred: in-favor | Ref: in-favor
F1 Score: 0.7429



