<a href="https://colab.research.google.com/github/DivyaDharshini2293/Personality-Based-Text-Generation/blob/main/GPT_2_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies before importing them so a fresh kernel can run this cell top to bottom.
!pip install transformers
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
# Define a custom dataset class for personality-based text generation




In [None]:
class PersonalityTextDataset(Dataset):
    """Pairs each text with a personality prefix and tokenizes the result.

    Each item is the tokenization of ``personalities[idx] + texts[idx]``,
    padded/truncated to ``max_length`` tokens.

    Args:
        texts: Sequence of raw text strings.
        personalities: Sequence of prefix strings, one per entry in ``texts``.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keyword arguments and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Fixed token length of every returned item.

    Raises:
        ValueError: If ``texts`` and ``personalities`` differ in length.
    """

    def __init__(self, texts, personalities, tokenizer, max_length=128):
        # Fail fast: a mismatch would otherwise surface as an IndexError deep
        # inside a DataLoader worker, or silently ignore extra personalities.
        if len(texts) != len(personalities):
            raise ValueError(
                "texts and personalities must have the same length, "
                f"got {len(texts)} and {len(personalities)}"
            )
        self.texts = texts
        self.personalities = personalities
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (personality, text) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the idx-th prefixed text.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors
            (the batch dimension added by ``return_tensors='pt'`` is flattened away).
        """
        prefixed_text = self.personalities[idx] + self.texts[idx]

        # Call the tokenizer directly; encode_plus is the deprecated spelling
        # of the same operation.
        encoding = self.tokenizer(
            prefixed_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }


In [None]:

# Define function to collate samples into batches
def collate_fn(samples, pad_token_id=None):
    """Collate per-sample tensors into a right-padded batch.

    Args:
        samples: List of dicts, each with 1-D ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        pad_token_id: Value used to pad ``input_ids``. When omitted, the
            notebook-level ``tokenizer.pad_token_id`` is used, so the function
            can still be passed to ``DataLoader(collate_fn=collate_fn)`` as-is.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` tensors of shape
        (batch, longest_sequence); the mask is padded with 0.
    """
    if pad_token_id is None:
        # Resolved at call time, so the tokenizer may be defined in a later cell.
        pad_token_id = tokenizer.pad_token_id

    input_ids = [sample['input_ids'] for sample in samples]
    attention_mask = [sample['attention_mask'] for sample in samples]

    return {
        'input_ids': pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id),
        'attention_mask': pad_sequence(attention_mask, batch_first=True, padding_value=0)
    }

# Load pre-trained GPT-2 model and tokenizer


In [None]:
model_name = 'gpt2-medium'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Register a dedicated padding token so padding='max_length' can be used.
num_added_tokens = tokenizer.add_special_tokens({'pad_token': '[PAD]'})
if num_added_tokens:
    # The added token's id lies past the pretrained vocabulary; grow the
    # embedding matrix so that id has a row instead of being out of range.
    model.resize_token_embeddings(len(tokenizer))

# Prepare dataset and dataloader
texts = ['I like to go for a walk in the park.', 'I enjoy reading books in my free time.']
personalities = ['Friendly: ', 'Intellectual: ']
dataset = PersonalityTextDataset(texts, personalities, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
num_epochs = 3

# Number of rows in the input embedding table; token ids must be below this.
vocab_size = model.get_input_embeddings().weight.size(0)
print(f"Vocabulary size: {vocab_size}")

# from_pretrained returns the model in evaluation mode; enable training
# behaviour (e.g. dropout) for the duration of fine-tuning.
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        attention_mask = batch['attention_mask']

        # Guard against ids outside the embedding table (e.g. a pad token that
        # was added to the tokenizer without resizing the model).
        input_ids = torch.clamp(batch['input_ids'], 0, vocab_size - 1)

        # Mask padding out of the loss using the attention mask rather than by
        # comparing against pad_token_id: after clamping, padded positions may
        # no longer hold the pad id, so an id comparison would miss them.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}')

# Back to evaluation mode so later generation runs without dropout.
model.eval()

# Generate text with personality
def generate_text(model, tokenizer, personality, prompt_text, max_length=100,
                  temperature=0.7, do_sample=True):
    """Generate a continuation of ``personality + prompt_text``.

    Args:
        model: Model exposing ``generate``, ``eval``, ``train`` and ``training``.
        tokenizer: Callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'``; must also provide ``decode`` and ``eos_token_id``.
        personality: Prefix string prepended to the prompt.
        prompt_text: Prompt to continue.
        max_length: Maximum total length (prompt included) passed to ``generate``.
        temperature: Sampling temperature; only used when ``do_sample`` is true.
        do_sample: Sample from the distribution instead of greedy decoding.
            Temperature has no effect without sampling, so it is on by default.

    Returns:
        The decoded text of the first generated sequence, special tokens removed.
    """
    input_text = personality + prompt_text
    encoding = tokenizer(input_text, return_tensors='pt')

    generate_kwargs = {
        'input_ids': encoding['input_ids'],
        # Pass the mask explicitly so generate does not have to guess it.
        'attention_mask': encoding['attention_mask'],
        'max_length': max_length,
        'num_return_sequences': 1,
        'do_sample': do_sample,
        # Use EOS for padding finished sequences; it is always a valid id for
        # the model, unlike a pad token added to the tokenizer afterwards.
        'pad_token_id': tokenizer.eos_token_id,
    }
    if do_sample:
        generate_kwargs['temperature'] = temperature

    # Generate in eval mode, then restore whatever mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        output = model.generate(**generate_kwargs)
    finally:
        model.train(was_training)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Example usage: continue a prompt using the "Friendly" personality prefix
personality, prompt_text = 'Friendly: ', 'I enjoy spending time with my friends.'

generated_text = generate_text(
    model=model,
    tokenizer=tokenizer,
    personality=personality,
    prompt_text=prompt_text,
)
print(generated_text)

Vocabulary size: 50257
Epoch 1, Loss: 10.777785301208496
Vocabulary size: 50257
Epoch 2, Loss: 3.310055732727051
Vocabulary size: 50257


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 3, Loss: 1.0550954341888428
Friendly: I enjoy spending time with my friends.
