In [None]:
!pip install transformers datasets torch peft accelerate bitsandbytes tensorboard pandas matplotlib seaborn nltk rouge



In [None]:
import os
import random
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    set_seed
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    PeftModel
)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Seed the RNGs once, before any data is generated or weights are initialised
# (transformers.set_seed is documented to seed `random`, NumPy and torch).
set_seed(seed=42)

def generate_sentiment_dataset(num_samples=100):
  """Build a shuffled, synthetic sentiment dataset from fill-in templates.

  Rows are generated alternately as positive, negative, positive, ... so the
  classes stay balanced; when ``num_samples`` is odd the single extra row is
  positive. The finished frame is shuffled before it is returned.

  Randomness comes from the global ``random`` module (template filling) and
  from an unseeded ``DataFrame.sample`` call (shuffling).

  Args:
    num_samples: Total number of rows to generate. Exactly this many rows are
      produced (previously an odd value silently yielded one row fewer).
      Values <= 0 produce no rows.

  Returns:
    pandas.DataFrame with a "text" column holding the filled-in sentence and a
    "sentiment" column holding "positive" or "negative".
  """

  positive_templates = [
    "I absolutely loved {item}. It was {adjective}!",
    "The {item} exceeded my expectations, truly {adjective}.",
    "What a wonderful {item}! I'm so {feeling} about it.",
    "{item} was fantastic! I would definitely recommend it to anyone.",
    "I'm very impressed with {item}. It's {adjective} and worth every penny.",
    "The {item} made my day. It's simply {adjective}.",
    "I had a great experience with {item}. It's {adjective}!",
    "The {item} was a delight. I'm feeling {feeling} after using it.",
    "I can't praise {item} enough! It's {adjective} in every way.",
    "The {item} brings so much joy. I'm {feeling} about my purchase."
  ]

  negative_templates = [
    "I was disappointed with {item}. It was {adjective}.",
    "The {item} fell short of my expectations, truly {adjective}.",
    "What a terrible {item}! I'm so {feeling} about it.",
    "{item} was awful! I would definitely not recommend it to anyone.",
    "I'm very unimpressed with {item}. It's {adjective} and a waste of money.",
    "The {item} ruined my day. It's simply {adjective}.",
    "I had a poor experience with {item}. It's {adjective}!",
    "The {item} was a nightmare. I'm feeling {feeling} after using it.",
    "I can't criticize {item} enough! It's {adjective} in every way.",
    "The {item} brings so much frustration. I'm {feeling} about my purchase."
  ]

  items = [
      "product", "service", "movie", "book", "restaurant", "hotel",
      "experience", "app", "device", "food", "coffee", "concert",
      "vacation", "phone", "laptop", "customer support", "delivery",
      "interface", "game", "website"
  ]

  positive_adjectives = [
      "amazing", "fantastic", "excellent", "outstanding", "perfect",
      "brilliant", "incredible", "superb", "wonderful", "exceptional"
  ]

  negative_adjectives = [
      "disappointing", "terrible", "awful", "poor", "subpar",
      "horrible", "dreadful", "mediocre", "unacceptable", "frustrating"
  ]

  positive_feelings = [
      "happy", "delighted", "thrilled", "excited", "pleased",
      "satisfied", "impressed", "grateful", "ecstatic", "contented"
  ]

  negative_feelings = [
      "upset", "frustrated", "annoyed", "disappointed", "angry",
      "displeased", "irritated", "dissatisfied", "unhappy", "regretful"
  ]

  # Word pools per class, so one code path serves both sentiments.
  vocabulary = {
      "positive": (positive_templates, positive_adjectives, positive_feelings),
      "negative": (negative_templates, negative_adjectives, negative_feelings),
  }

  texts = []
  labels = []

  # One row per iteration (positive on even, negative on odd positions), so an
  # odd num_samples no longer loses a row to integer division.
  for position in range(num_samples):
    label = "positive" if position % 2 == 0 else "negative"
    templates, adjectives, feelings = vocabulary[label]

    # Draw order (template, item, adjective, feeling) matches the earlier
    # implementation, so a seeded run with an even size fills rows identically.
    template = random.choice(templates)
    item = random.choice(items)
    adjective = random.choice(adjectives)
    feeling = random.choice(feelings)

    # Templates that lack a placeholder simply ignore the unused keyword.
    texts.append(template.format(item=item, adjective=adjective, feeling=feeling))
    labels.append(label)

  df = pd.DataFrame({"text": texts, "sentiment": labels})

  # Shuffle so the strict positive/negative alternation is not visible downstream.
  return df.sample(frac=1).reset_index(drop=True)

# Build the synthetic corpus and show a small preview.
sentiment_df = generate_sentiment_dataset(num_samples=100)
print(f"Generate the dataset with {len(sentiment_df)} examples")
print(sentiment_df.head())

# 80/20 hold-out split: sample the training rows with a fixed seed, then keep
# every row whose index was not sampled as the test set.
train_df = sentiment_df.sample(frac=0.8, random_state=42)
is_train_row = sentiment_df.index.isin(train_df.index)
test_df = sentiment_df[~is_train_row]

print(f"Train set: {len(train_df)} examples")
print(f"Test set: {len(test_df)} examples")


Generate the dataset with 100 examples
                                                text sentiment
0  The app fell short of my expectations, truly h...  negative
1  I can't criticize restaurant enough! It's frus...  negative
2    I absolutely loved service. It was exceptional!  positive
3  I can't criticize experience enough! It's subp...  negative
4  I can't praise website enough! It's incredible...  positive
Train set: 80 examples
Test set: 20 examples


In [None]:
# Base checkpoint to adapt and the directory that receives training artifacts.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUTPUT_DIR = "./models/sentiment_lora_finetuned"

# Create the output locations up front; exist_ok keeps re-runs from failing.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs("./data", exist_ok=True)

# Wrap the pandas splits as Hugging Face datasets.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load the checkpoint's tokenizer and reuse its EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

def format_instruction(example):
  """Render one dataset row as an instruction-style training prompt.

  Args:
    example: Mapping with a "text" entry (the sentence to classify) and a
      "sentiment" entry (its label).

  Returns:
    A dict with the single key "formatted_text" holding the instruction, the
    text and the label as three sections separated by blank lines.
  """
  sections = (
      "### Instruction:\nClassify the sentiment of the following text as either 'positive' or 'negative'. ",
      f"### Text:\n{example['text']}",
      f"### Sentiment:\n{example['sentiment']}",
  )

  # Sections are separated by exactly one blank line.
  return {"formatted_text": "\n\n".join(sections)}

print("formatting the dataset...")

# Add the "formatted_text" prompt column to both splits (train first, then test).
train_formatted, test_formatted = (
    split.map(format_instruction) for split in (train_dataset, test_dataset)
)

def tokenize_function(examples):
  """Tokenize a batch of formatted prompts for causal-LM training.

  Args:
    examples: Batch mapping that contains a "formatted_text" list; this
      function is applied through ``Dataset.map(..., batched=True)``.

  Returns:
    The tokenizer output for the batch, truncated to at most 512 tokens and
    left unpadded.
  """
  # No padding here. Padding every short prompt to 512 tokens makes each
  # training batch many times larger than needed. The
  # DataCollatorForLanguageModeling used for training pads each batch to its
  # own longest sequence instead.
  return tokenizer(
      examples["formatted_text"],
      truncation=True,
      max_length=512,
  )

print("Tokenizing the dataset...")

# Tokenize both splits and drop every pre-existing column, so that only the
# tokenizer outputs remain.
train_tokenized, test_tokenized = (
    split.map(tokenize_function, batched=True, remove_columns=split.column_names)
    for split in (train_formatted, test_formatted)
)

# Persist the tokenized splits so they can be reloaded without re-tokenizing.
train_tokenized.save_to_disk("./data/sentiment_train")
test_tokenized.save_to_disk("./data/sentiment_test")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

formatting the dataset...


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenizing the dataset...


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/80 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# The weights are loaded as torch.float16, i.e. half precision. The message
# previously claimed "full precision", which did not match the load call.
print(f"Loading the model: {MODEL_NAME} in half precision (float16)")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)

# LoRA adapters (rank 8, alpha 16, 5% dropout) on the four attention
# projection matrices.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Wrap the base model with the adapter configuration; `model` now refers to
# the PEFT-wrapped model.
model = get_peft_model(model, peft_config)

def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Args:
        model: Any object whose ``named_parameters()`` yields ``(name, param)``
            pairs, where each ``param`` offers ``numel()`` and ``requires_grad``.

    Prints the trainable element count and its share of all elements as a
    percentage with two decimals. Returns None.
    """
    # (element count, trainable?) for every parameter, collected in one pass.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]

    all_params = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)

    share = 100 * trainable_params / all_params
    print(f"Trainable parameters: {trainable_params} ({share:.2f}%) of all params")

print_trainable_parameters(model)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # The 80-example training split gives 20 micro-batches per epoch at batch
    # size 4. Accumulating 16 of them left only a handful of optimizer steps
    # in the whole run, so the adapter barely trained. Accumulating 2 gives an
    # effective batch of 8 and about 10 updates per epoch.
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_dir=f"{OUTPUT_DIR}/logs",
    # Must be smaller than the total number of optimizer steps. With the
    # earlier value of 10 the training-loss table stayed empty.
    logging_steps=5,
    save_strategy="epoch",
    save_total_limit=1,
    fp16=True,
    report_to="tensorboard",
    push_to_hub=False,
    # Trainer cannot infer the label column for a PEFT-wrapped model (it warns
    # "No label_names provided"), so name it explicitly.
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` replaces it.
    processing_class=tokenizer,
    data_collator=data_collator,
)

print("Starting the LoRA finetuning...")

trainer.train()

print("LoRA Finetuning is complete!")

# Save the final model under a dedicated sub-directory of OUTPUT_DIR.
trainer.save_model(f"{OUTPUT_DIR}/final")
print(f"Model saved to: {OUTPUT_DIR}/final")



Loading the model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 in full precision
Trainable parameters: 2252800 (0.20%) of all params


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting the LoRA finetuning...


Step,Training Loss


LoRA Finetuning is complete!
Model saved to: ./models/sentiment_lora_finetuned/final
