## NLP Assignment - 3
### SST-2 Classification Task

#### Necessary Installs

In [1]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


#### Imports

In [2]:
import numpy as np
import pandas as pd
import os, sys, time, torch, wandb, warnings

from transformers import (TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support 

from huggingface_hub import login

from datasets import load_dataset
from evaluate import load
from prettytable import PrettyTable

#### Suppressing Warnings

In [3]:
print(os.getcwd())
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

/kaggle/working


#### Setting up Environment Variables

In [None]:
os.environ["Bpp06"] = ""    # Paste your Hugging Face access token here (read back by login() and the tokenizer loader); never commit a real token
# NOTE(review): an empty CURL_CA_BUNDLE is commonly used to bypass TLS certificate
# verification for HTTP requests — confirm this workaround is really required.
os.environ['CURL_CA_BUNDLE']=''

#### Logging-in to HuggingFace

In [None]:
# Authenticate with the Hugging Face Hub using the token stored in the "Bpp06" environment variable.
login(token=os.getenv("Bpp06"), add_to_git_credential=True)

#### Model Name

In [6]:
# Hub identifier of the checkpoint used for both the tokenizer and the classification model.
model_name = "meta-llama/Llama-3.2-1B"

#### Defining Tokenizer

In [None]:
# Load the tokenizer that belongs to the checkpoint; the access token is read from the
# "Bpp06" environment variable.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.getenv("Bpp06"))
# Batches are padded later by the data collator, which needs a pad token. When the
# tokenizer does not define one, reuse the end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

#### Checking for CUDA Compatibility

In [8]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


#### Initializing Model

In [None]:
# Load the checkpoint with a sequence-classification head that has two output labels.
sst_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Use the model's end-of-sequence id as its padding id, since batches will contain padding.
sst_model.config.pad_token_id = sst_model.config.eos_token_id

#### Parameter Table

In [10]:
def count_parameters(model):
    """Print a table of trainable parameter tensors and return their total size.

    Args:
        model: Object exposing ``named_parameters()`` that yields ``(name, parameter)``
            pairs, where each parameter has ``requires_grad`` and ``numel()``.

    Returns:
        int: Number of elements summed over all parameters whose ``requires_grad``
        is true.
    """
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        # Frozen tensors are neither listed nor counted.
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    # Report the accumulated total below the table and hand it back to the caller.
    print(f"Total Trainable Params: {total_params:,}")
    return total_params

In [11]:
# List every parameter tensor of the freshly loaded model that currently requires gradients.
count_parameters(sst_model)

+-------------------------------------------------+------------+
|                     Modules                     | Parameters |
+-------------------------------------------------+------------+
|            model.embed_tokens.weight            | 262668288  |
|      model.layers.0.self_attn.q_proj.weight     |  4194304   |
|      model.layers.0.self_attn.k_proj.weight     |  1048576   |
|      model.layers.0.self_attn.v_proj.weight     |  1048576   |
|      model.layers.0.self_attn.o_proj.weight     |  4194304   |
|       model.layers.0.mlp.gate_proj.weight       |  16777216  |
|        model.layers.0.mlp.up_proj.weight        |  16777216  |
|       model.layers.0.mlp.down_proj.weight       |  16777216  |
|      model.layers.0.input_layernorm.weight      |    2048    |
|  model.layers.0.post_attention_layernorm.weight |    2048    |
|      model.layers.1.self_attn.q_proj.weight     |  4194304   |
|      model.layers.1.self_attn.k_proj.weight     |  1048576   |
|      model.layers.1.sel

#### Trainable Parameters before Freezing the Base Model Parameters

In [12]:
# Parameter counts taken before the backbone is frozen.
base_total_params = sum(map(lambda w: w.numel(), sst_model.base_model.parameters()))
total_params = sum(map(lambda w: w.numel(), sst_model.parameters()))
trainable_params = sum(
    w.numel() for w in filter(lambda w: w.requires_grad, sst_model.parameters())
)

print("Parameters before freezing the base model parameters")

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Total Base model parameters: {base_total_params:,}")

Parameters before freezing the base model parameters
Total parameters: 1,235,818,496
Trainable parameters: 1,235,818,496
Total Base model parameters: 1,235,814,400


#### Freezing the parameters in the Base Language Model

In [13]:
# Turn off gradient tracking for every backbone weight; parameters that live outside
# `base_model` are left untouched and stay trainable.
for backbone_weight in sst_model.base_model.parameters():
    backbone_weight.requires_grad = False

#### Trainable Parameters after Freezing the Base Model Parameters

In [14]:
# Parameter counts taken after the backbone has been frozen.
base_total_params = sum(map(lambda w: w.numel(), sst_model.base_model.parameters()))
total_params = sum(map(lambda w: w.numel(), sst_model.parameters()))
trainable_params = sum(
    w.numel() for w in filter(lambda w: w.requires_grad, sst_model.parameters())
)

print("Parameters after freezing the base model parameters")

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Total Base model parameters: {base_total_params:,}")

Parameters after freezing the base model parameters
Total parameters: 1,235,818,496
Trainable parameters: 4,096
Total Base model parameters: 1,235,814,400


#### Loading and Splitting the SST-2 Dataset

In [None]:
# Fetch the SST-2 configuration of the GLUE benchmark.
sst2_dataset = load_dataset("glue", "sst2")

# Carve a 20% held-out set out of the "train" split, stratified on the label column
# and seeded so the partition is reproducible.
train_test_split = sst2_dataset["train"].train_test_split(
    stratify_by_column="label", test_size=0.2, seed=1
)
train_dataset, test_dataset = train_test_split["train"], train_test_split["test"]

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

#### Preprocessing the Data

In [16]:
def preprocess_data(examples):
    """Tokenize a batch of SST-2 examples for sequence classification.

    Args:
        examples: Batch supplied by ``Dataset.map(..., batched=True)``; its
            ``"sentence"`` entry holds the raw texts.

    Returns:
        The tokenizer output for the batch: one encoding per input sentence, each
        limited to at most 128 tokens. No padding is applied here.
    """
    # truncation=True is what makes max_length take effect. Overflow windows are
    # deliberately not requested: every sentence must yield exactly one row so that
    # it stays aligned with its label when the result is merged back by map().
    return tokenizer(
        examples["sentence"],
        truncation=True,
        max_length=128,
    )

In [None]:
# Tokenize both splits in batches (train first, then the held-out split).
train_dataset, test_dataset = (
    split.map(preprocess_data, batched=True) for split in (train_dataset, test_dataset)
)

# Expose only the model inputs and the label, returned as torch tensors.
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)
test_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

#### Training Arguments

In [18]:
# Collator that pads the tokenized examples when they are assembled into batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training configuration. Evaluation and checkpointing both happen once per epoch, so
# no step-based save/eval intervals are specified (they only apply to the "steps" strategy).
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # 8 * 4 = 32 examples per optimizer step and device
    num_train_epochs=3,
    fp16=True,                      # mixed-precision training
    logging_dir="./logs",           # directory for logs
    logging_steps=10,               # log every 10 steps
    load_best_model_at_end=True,    # reload the best checkpoint when training finishes
    report_to="wandb",              # send training/eval metrics to Weights & Biases
)

#### Evaluation Metrics

In [None]:
# Load metric objects from the `evaluate` library.
# NOTE(review): none of these four objects is used in the visible notebook — compute_metrics
# relies on scikit-learn and rebinds `precision`, `recall` and `f1` as local variables.
# Consider removing this cell if nothing else depends on it.
accuracy = load("accuracy")
precision = load("precision")
recall = load("recall")
f1 = load("f1")

In [20]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into classification scores.

    Args:
        eval_pred: Pair ``(logits, labels)``; the class with the highest logit along
            the last axis is taken as the prediction.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1``, with the last three
        computed using binary averaging.
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)
    # Index 0..2 are precision, recall and F1; the trailing support entry is not needed.
    prf = precision_recall_fscore_support(references, predicted, average="binary")
    return {
        "accuracy": accuracy_score(references, predicted),
        "precision": prf[0],
        "recall": prf[1],
        "f1": prf[2],
    }

#### Defining the Trainer

In [21]:
# Bundle the model, training arguments, data splits, collator and metric function.
# NOTE(review): the split passed as eval_dataset is also what load_best_model_at_end=True
# uses to pick the best checkpoint, so scores reported on it do not come from an
# untouched test set.
trainer = Trainer(
    model=sst_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

#### Evaluating the Model before Training

In [22]:
# Baseline: score the model on the held-out split before any training step has run.
eval_results = trainer.evaluate()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113604244443801, max=1.0…

#### Evaluation Results

In [23]:
# Display the metrics dictionary returned by the pre-training evaluation.
eval_results

{'eval_loss': 1.744381308555603,
 'eval_accuracy': 0.5577579806978471,
 'eval_precision': 0.5578423359833569,
 'eval_recall': 0.9992014905509715,
 'eval_f1': 0.7159681495255806,
 'eval_runtime': 517.9621,
 'eval_samples_per_second': 26.006,
 'eval_steps_per_second': 1.626}

#### Training the Model

In [24]:
# Start a W&B run for the training phase.
# NOTE(review): the run name "llama_telugu" does not describe this SST-2 task — confirm
# it is intended before reusing the notebook.
wandb.init(project="huggingface", name="llama_telugu")
# Ask W&B to watch the model (log="all") with a logging frequency of 100.
wandb.watch(sst_model, log="all", log_freq=100)
# Run the training loop configured in training_args.
trainer.train()

VBox(children=(Label(value='0.025 MB of 0.025 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/global_step,▁

0,1
eval/accuracy,0.55776
eval/f1,0.71597
eval/loss,1.74438
eval/precision,0.55784
eval/recall,0.9992
eval/runtime,517.9621
eval/samples_per_second,26.006
eval/steps_per_second,1.626
train/global_step,0.0


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3237,0.302068,0.875872,0.921622,0.849747,0.884227
2,0.3271,0.281053,0.887528,0.917699,0.87703,0.896904
3,0.2571,0.2588,0.895397,0.919127,0.89087,0.904778


TrainOutput(global_step=2526, training_loss=0.36077885960928524, metrics={'train_runtime': 12712.1284, 'train_samples_per_second': 12.715, 'train_steps_per_second': 0.199, 'total_flos': 3.231502526737613e+16, 'train_loss': 0.36077885960928524, 'epoch': 3.0})

#### Evaluating the Model after Training

In [25]:
# Score the model held by the trainer on the same held-out split, now that training has finished.
eval_results = trainer.evaluate()

#### Evaluation Results

In [26]:
# Display the metrics dictionary returned by the post-training evaluation.
eval_results

{'eval_loss': 0.25879958271980286,
 'eval_accuracy': 0.8953971789161099,
 'eval_precision': 0.9191267334889469,
 'eval_recall': 0.890870375299441,
 'eval_f1': 0.9047779955396364,
 'eval_runtime': 761.3259,
 'eval_samples_per_second': 17.693,
 'eval_steps_per_second': 1.106,
 'epoch': 3.0}

#### Pushing the Model to HuggingFace

In [28]:
# Upload the model to the Hugging Face Hub repository named below.
# NOTE(review): only the model object is pushed here; no tokenizer upload appears in the
# visible cells — confirm whether one is needed for downstream loading.
sst_model.push_to_hub("bp03/Classification_SST2_Llama_3.2_1B_Model")

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/bp03/Classification_SST2_Llama_3.2_1B_Model/commit/a5791fbb9a37dafc92308c505068c2276446b62a', commit_message='Upload LlamaForSequenceClassification', commit_description='', oid='a5791fbb9a37dafc92308c505068c2276446b62a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/bp03/Classification_SST2_Llama_3.2_1B_Model', endpoint='https://huggingface.co', repo_type='model', repo_id='bp03/Classification_SST2_Llama_3.2_1B_Model'), pr_revision=None, pr_num=None)