In [16]:
import json           
import random         
import gzip            
import requests        
import torch           
from peft import get_peft_model, LoraConfig, TaskType  
from torch.utils.data import Dataset, DataLoader  
from transformers import AutoTokenizer, AutoModelForCausalLM  
from torch.optim import AdamW    
from tqdm import tqdm   
import re               


In [23]:
def build_prompt(text):
    """Wrap ``text`` in the instruction template used for emotion classification.

    The template wording is part of the model input, so it must stay identical
    between training and inference.
    NOTE(review): the template spells "emotion" as "meotion"; it is kept as-is
    here because changing it would alter the prompts the model is trained on.
    """
    instruction = f"predict the meotion for the following text : {text}"
    return instruction + "\nEmotion:"

In [24]:
def encoded_text(tokenizer, text, return_tensor=False):
    """Encode ``text`` with ``tokenizer.encode`` without adding special tokens.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=..., return_tensors=...)``.
        text: The string to tokenize.
        return_tensor: When True, ``return_tensors="pt"`` is forwarded to the
            tokenizer; otherwise the tokenizer's default return value is used.

    Returns:
        Whatever ``tokenizer.encode`` returns for the chosen mode.
    """
    if return_tensor:
        # The tokenizer keyword is ``return_tensors`` (plural) and the method is
        # ``encode``; the previous spelling of both made this branch unusable.
        return tokenizer.encode(text, add_special_tokens=False, return_tensors="pt")
    return tokenizer.encode(text, add_special_tokens=False)
    
    

In [25]:
def decoded_text(tokenizer, tokens_ids):
    """Decode token ids back into text via ``tokenizer.decode``.

    ``skip_special_tokens=True`` is passed so that special tokens are left
    out of the returned string.
    """
    text = tokenizer.decode(tokens_ids, skip_special_tokens=True)
    return text


In [26]:
class PromptCompletionDataset(Dataset):
    """Dataset of prompt/completion records that are tokenized on access.

    Each element of ``data`` is expected to be a mapping with ``"prompt"`` and
    ``"completion"`` keys.
    """

    def __init__(self, data, tokenizer):
        """Store the raw records and the tokenizer used to encode them."""
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and build its input ids and labels.

        ``input_ids`` is prompt tokens + completion tokens + EOS. ``labels``
        has the same length, with every prompt position set to -100 so that
        only the completion and EOS positions carry real target ids.
        """
        record = self.data[idx]
        prompt, completion = record["prompt"], record["completion"]
        prompt_ids = encoded_text(self.tokenizer, prompt)
        completion_ids = encoded_text(self.tokenizer, completion)
        # Completion followed by end-of-sequence: the part to be predicted.
        target_ids = completion_ids + [self.tokenizer.eos_token_id]
        return {
            "input_ids": prompt_ids + target_ids,
            "labels": [-100] * len(prompt_ids) + target_ids,
            "prompt": prompt,
            "expected_completion": completion,
        }
        

In [27]:
def collate_fn(batch, pad_token_id=None):
    """Pad a list of dataset items to a common length and stack them into tensors.

    Args:
        batch: List of dicts with ``"input_ids"``, ``"labels"``, ``"prompt"``
            and ``"expected_completion"`` keys.
        pad_token_id: Id used to right-pad ``input_ids``. When omitted, the
            module-level ``tokenizer.pad_token_id`` is used, which keeps
            ``DataLoader(collate_fn=collate_fn)`` working unchanged.

    Returns:
        Tuple ``(input_ids, attention_mask, labels, prompts, expected_completions)``
        where the first three are ``torch.long`` tensors and the last two are
        lists of strings.

    Raises:
        ValueError: If no pad token id is available.
    """
    if pad_token_id is None:
        # Backward-compatible fallback to the global tokenizer created by the
        # training cell.
        pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        raise ValueError(
            "pad_token_id is None; set tokenizer.pad_token or pass pad_token_id explicitly"
        )

    # Longest sequence in the batch decides the padded width.
    max_length = max(len(item["input_ids"]) for item in batch)

    input_ids, attention_mask, labels = [], [], []
    for item in batch:
        ids = item["input_ids"]
        n_pad = max_length - len(ids)
        input_ids.append(ids + [pad_token_id] * n_pad)
        # 1 marks real tokens, 0 marks padding.
        attention_mask.append([1] * len(ids) + [0] * n_pad)
        # Padding positions get -100, the same filler used for prompt positions.
        labels.append(item["labels"] + [-100] * (max_length - len(item["labels"])))

    prompts = [item["prompt"] for item in batch]
    expected_completions = [item["expected_completion"] for item in batch]
    return (
        torch.tensor(input_ids, dtype=torch.long),
        torch.tensor(attention_mask, dtype=torch.long),
        torch.tensor(labels, dtype=torch.long),
        prompts,
        expected_completions,
    )
    
    

In [28]:
def normalize_text(text):
    """Canonicalize ``text`` for comparison.

    Trims surrounding whitespace, lowercases, and collapses every run of
    whitespace into a single space.
    """
    lowered = text.strip().lower()
    return re.sub(r'\s+', ' ', lowered)

In [29]:
def calculate_accuracy(model, tokenizer, loader):
    """Exact-match accuracy of generated completions over ``loader``.

    For every prompt in every batch, text is generated with ``generate_text``
    and compared to the expected completion after ``normalize_text`` is
    applied to both sides. The model is switched to eval mode for the
    duration and put back into train mode before returning.

    Returns:
        ``correct / total``, or ``0`` when the loader yields no examples.
    """
    model.eval()
    hits = 0
    seen = 0

    with torch.no_grad():
        # Only the raw prompt strings and expected completions are needed here;
        # the tensor parts of each batch are unpacked but unused.
        for _input_ids, _attention_mask, _labels, prompts, expected_completions in loader:
            for prompt, target in zip(prompts, expected_completions):
                prediction = generate_text(model, tokenizer, prompt)
                if normalize_text(prediction) == normalize_text(target):
                    hits += 1
                seen += 1

    model.train()
    if seen > 0:
        return hits / seen
    return 0

            

In [30]:
def generate_text(model, tokenizer, prompt, max_new_tokens=50):
    """Generate a completion for ``prompt`` and return only the new text.

    Args:
        model: Model exposing ``device`` and ``generate(...)``.
        tokenizer: Callable tokenizer that also provides ``pad_token_id`` and
            ``eos_token_id``.
        prompt: Prompt string to complete.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded continuation with surrounding whitespace stripped; the
        prompt itself is not included.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )[0]
    # The output sequence starts with the prompt tokens; drop them so only the
    # newly generated tokens are decoded.
    prompt_length = encoded["input_ids"].shape[1]
    # Use the notebook's decoding helper `decoded_text`; the previous call to
    # `decode_text` referenced a name that is not defined anywhere in this file.
    generated_text = decoded_text(tokenizer, output_ids[prompt_length:])
    return generated_text.strip()


In [31]:
def text_model(model_path, test_inputs):
    """Load a saved model and print the predicted emotion for the given text(s).

    Args:
        model_path: Directory or hub id passed to ``from_pretrained`` for both
            the model and the tokenizer.
        test_inputs: A single string, or an iterable of strings, to classify.

    Returns:
        None. Results are printed.
    """
    # Same device preference order as the training cell.
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(f"Using device: {device}")

    # NOTE(review): if `model_path` holds only LoRA adapter files, loading it
    # this way presumably relies on the transformers/PEFT integration being
    # available — confirm for the installed versions.
    model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id
    model.eval()  # make inference mode explicit

    # Accept either one string or several; the body previously read an
    # undefined name (`test_input`) instead of the `test_inputs` parameter.
    texts = [test_inputs] if isinstance(test_inputs, str) else list(test_inputs)
    for test_input in texts:
        prompt = build_prompt(test_input)
        generated_text = generate_text(model, tokenizer, prompt)
        print(f"Input: {test_input}")
        print(f"Generated emotion: {generated_text}")

In [32]:
def download_and_prepare_data(data_url, tokenizer, batch_size, test_ratio=0.1,
                              seed=None, timeout=60):
    """Download the gzipped JSON-lines dataset and build train/test loaders.

    Args:
        data_url: URL of a gzip-compressed file with one JSON object per line,
            each carrying ``"text"`` and ``"label"`` fields.
        tokenizer: Tokenizer handed to ``PromptCompletionDataset``.
        batch_size: Batch size for both loaders.
        test_ratio: Fraction of the shuffled examples held out for testing.
        seed: Optional seed for the shuffle that precedes the split. When
            None, the global ``random`` state is used (original behavior).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``(train_loader, test_loader)``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(data_url, timeout=timeout)
    # Fail on HTTP errors here rather than while decompressing an error page.
    response.raise_for_status()
    content = gzip.decompress(response.content).decode()

    dataset = []
    for line in content.splitlines():
        if not line.strip():
            # Blank lines are not valid JSON; skip them.
            continue
        entry = json.loads(line)
        dataset.append({
            "prompt": build_prompt(entry["text"]),
            "completion": entry["label"].strip(),
        })

    # A dedicated RNG makes the split reproducible without touching global state.
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(dataset)

    split_index = int(len(dataset) * (1 - test_ratio))
    train_data = dataset[:split_index]
    test_data = dataset[split_index:]

    train_dataset = PromptCompletionDataset(train_data, tokenizer)
    test_dataset = PromptCompletionDataset(test_data, tokenizer)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
    )
    return train_loader, test_loader

    

In [33]:
def get_hyperparameters():
    """Return the training settings as ``(num_epochs, batch_size, learning_rate)``."""
    return (
        18,    # num_epochs
        16,    # batch_size
        5e-5,  # learning_rate
    )

In [100]:
if __name__=="__main__":
    # Fine-tune GPT-2 with LoRA on the emotions dataset and report exact-match
    # test accuracy after every epoch.
    data_url = "https://www.thelmbook.com/data/emotions"
    model_name = "openai-community/gpt2"

    # Seed the Python and PyTorch RNGs so the data split, batch order and
    # adapter initialisation can be repeated across runs.
    seed = 42
    random.seed(seed)
    torch.manual_seed(seed)

    if torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    print(f"Using device: {device}")

    # `tokenizer` stays a module-level name: collate_fn reads it for padding.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=16,
        lora_alpha=32,
    )
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    model = get_peft_model(model, peft_config)

    num_epochs, batch_size, learning_rate = get_hyperparameters()
    train_loader, test_loader = download_and_prepare_data(data_url, tokenizer, batch_size)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        # Enter training mode explicitly. Previously the only model.train()
        # call was the one at the end of calculate_accuracy, so the first
        # epoch ran in whatever mode from_pretrained left the model in.
        model.train()
        total_loss = 0
        num_batches = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for input_ids, attention_mask, labels, _, _ in progress_bar:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # Forward pass; the model computes the loss from `labels`.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Update metrics
            total_loss += loss.item()
            num_batches += 1
            progress_bar.set_postfix({"Loss": total_loss / num_batches})

        # Guard against an empty training loader instead of dividing by zero.
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        test_acc = calculate_accuracy(model, tokenizer, test_loader)
        print(f"Epoch {epoch+1} - Average loss: {avg_loss:.4f}, Test accuracy: {test_acc:.4f}")


Using device: mps


Epoch 1/18: 100%|█████████████████████████████████████████████████████████████████████| 1125/1125 [06:24<00:00,  2.93it/s, Loss=0.0364]


Epoch 1 - Average loss: 0.0364, Test accuracy: 0.7750


Epoch 2/18: 100%|█████████████████████████████████████████████████████████████████████| 1125/1125 [06:49<00:00,  2.74it/s, Loss=0.0259]


Epoch 2 - Average loss: 0.0259, Test accuracy: 0.8280


Epoch 3/18: 100%|█████████████████████████████████████████████████████████████████████| 1125/1125 [07:33<00:00,  2.48it/s, Loss=0.0222]


Epoch 3 - Average loss: 0.0222, Test accuracy: 0.8610


Epoch 4/18: 100%|█████████████████████████████████████████████████████████████████████| 1125/1125 [06:45<00:00,  2.78it/s, Loss=0.0156]


Epoch 4 - Average loss: 0.0156, Test accuracy: 0.8895


Epoch 5/18: 100%|█████████████████████████████████████████████████████████████████████| 1125/1125 [06:45<00:00,  2.77it/s, Loss=0.0129]


Epoch 5 - Average loss: 0.0129, Test accuracy: 0.9065


Epoch 6/18: 100%|██████████████████████████████████████████████████████████████████████| 1125/1125 [06:46<00:00,  2.77it/s, Loss=0.015]


Epoch 6 - Average loss: 0.0150, Test accuracy: 0.9215


Epoch 7/18: 100%|█████████████████████████████████████████████████████████████████████| 1125/1125 [06:45<00:00,  2.78it/s, Loss=0.0119]


Epoch 7 - Average loss: 0.0119, Test accuracy: 0.9240


Epoch 8/18: 100%|█████████████████████████████████████████████████████████████████████| 1125/1125 [06:45<00:00,  2.77it/s, Loss=0.0132]


Epoch 8 - Average loss: 0.0132, Test accuracy: 0.9280


Epoch 9/18: 100%|█████████████████████████████████████████████████████████████████████| 1125/1125 [06:43<00:00,  2.79it/s, Loss=0.0125]


Epoch 9 - Average loss: 0.0125, Test accuracy: 0.9245


Epoch 10/18: 100%|█████████████████████████████████████████████████████████████████████| 1125/1125 [06:44<00:00,  2.78it/s, Loss=0.017]


Epoch 10 - Average loss: 0.0170, Test accuracy: 0.9240


Epoch 11/18: 100%|█████████████████████████████████████████████████████████████████████| 1125/1125 [06:46<00:00,  2.77it/s, Loss=0.023]


Epoch 11 - Average loss: 0.0230, Test accuracy: 0.9150


Epoch 12/18: 100%|████████████████████████████████████████████████████████████████████| 1125/1125 [06:46<00:00,  2.77it/s, Loss=0.0323]


Epoch 12 - Average loss: 0.0323, Test accuracy: 0.9030


Epoch 13/18: 100%|████████████████████████████████████████████████████████████████████| 1125/1125 [06:49<00:00,  2.75it/s, Loss=0.0396]


Epoch 13 - Average loss: 0.0396, Test accuracy: 0.8790


Epoch 14/18: 100%|████████████████████████████████████████████████████████████████████| 1125/1125 [06:48<00:00,  2.75it/s, Loss=0.0455]


Epoch 14 - Average loss: 0.0455, Test accuracy: 0.8670


Epoch 15/18: 100%|████████████████████████████████████████████████████████████████████| 1125/1125 [06:49<00:00,  2.75it/s, Loss=0.0528]


Epoch 15 - Average loss: 0.0528, Test accuracy: 0.8390


Epoch 16/18: 100%|████████████████████████████████████████████████████████████████████| 1125/1125 [06:48<00:00,  2.75it/s, Loss=0.0415]


Epoch 16 - Average loss: 0.0415, Test accuracy: 0.8215


Epoch 17/18: 100%|████████████████████████████████████████████████████████████████████| 1125/1125 [06:47<00:00,  2.76it/s, Loss=0.0437]


Epoch 17 - Average loss: 0.0437, Test accuracy: 0.8345


Epoch 18/18: 100%|████████████████████████████████████████████████████████████████████| 1125/1125 [06:48<00:00,  2.75it/s, Loss=0.0401]


Epoch 18 - Average loss: 0.0401, Test accuracy: 0.8590


In [102]:
# Persist the fine-tuned model and its tokenizer into the same directory so the
# path can later be passed to text_model() as `model_path`.
# NOTE(review): `model` is the PEFT-wrapped model from the training cell, so this
# presumably writes only the LoRA adapter rather than full GPT-2 weights —
# confirm before treating the directory as a standalone checkpoint.
model.save_pretrained("lora_finetuned_gpt2")
tokenizer.save_pretrained("lora_finetuned_gpt2")


('lora_finetuned_gpt2/tokenizer_config.json',
 'lora_finetuned_gpt2/special_tokens_map.json',
 'lora_finetuned_gpt2/vocab.json',
 'lora_finetuned_gpt2/merges.txt',
 'lora_finetuned_gpt2/added_tokens.json',
 'lora_finetuned_gpt2/tokenizer.json')

In [40]:

# Smoke-test the saved checkpoint on a single sentence.
# NOTE(review): the output recorded under this cell ("--- Generated Output ---")
# is not something the text_model() defined above prints, and this cell's
# execution count (In [40]) precedes the training cell (In [100]); re-run after
# Restart Kernel -> Run All to refresh it.
text_model("lora_finetuned_gpt2", "I am so happy today!")



Using device: mps

--- Generated Output ---
I am so happy today!
