In [None]:
!pip install transformers datasets tiktoken accelerate

In [2]:
import json
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
import tiktoken
from accelerate import Accelerator

In [3]:
device_count = torch.cuda.device_count()
print(f"Number of GPUs available: {device_count}")

Number of GPUs available: 2


In [None]:
# Load and Preprocess the Dataset
def load_and_prepare_data(file_path):
    # Load data from jsonl file
    dataset = load_dataset('json', data_files=file_path, split='train')

    # Format each example in the desired conversation style
    def format_conversation(example):
        messages = example['messages']
        text = ""
        for message in messages:
            if message['role'] == 'system':
                text += f"<|system|> {message['content']} <|endoftext|>\n"
            elif message['role'] == 'user':
                text += f"<|user|> {message['content']} <|endoftext|>\n"
            elif message['role'] == 'assistant':
                text += f"<|assistant|> {message['content']} <|endoftext|>\n"
            # text+=" <|endoftext|>"
        return {'text': text}

    # Apply the formatting function to each example
    dataset = dataset.map(format_conversation)
    return dataset


In [None]:
# File path to the jsonl file
file_path = '/kaggle/input/dataset/sample.jsonl'  # replace with your actual file path

# Load and format dataset
dataset = load_and_prepare_data(file_path)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

In [6]:
dataset

Dataset({
    features: ['messages', 'text'],
    num_rows: 140
})

In [None]:
# Load and Configure the Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
special_tokens = {'additional_special_tokens': ['<|system|>', '<|user|>', '<|assistant|>', '<|pad|>']}
tokenizer.add_special_tokens(special_tokens)
tokenizer.pad_token = '<|pad|>'

# Load and Resize Model to Match Tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to('cuda')
model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50261, 1024)

In [None]:
# Tokenize Dataset
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=['messages', 'text'])
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Data Collator for Dynamic Padding
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

In [None]:
# Training Arguments

training_args = TrainingArguments(
    output_dir="./gpt2_finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    save_steps=1000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=100,
    # eval_strategy= 'epoch',
    # save_strategy= 'epoch',
    # load_best_model_at_end=True,
    report_to="none",  # Set to "wandb" or "tensorboard" to log on those platforms
)

In [10]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 140
})

In [11]:
accelerator = Accelerator()

# Prepare model, tokenizer, and dataset for multi-GPU
model, tokenizer, tokenized_dataset = accelerator.prepare(
    model, tokenizer, tokenized_dataset
)

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)
torch.cuda.empty_cache()

In [None]:
# Start fine-tune
trainer.train()


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss


TrainOutput(global_step=51, training_loss=7.502890194163603, metrics={'train_runtime': 118.9797, 'train_samples_per_second': 3.53, 'train_steps_per_second': 0.429, 'total_flos': 378909883367424.0, 'train_loss': 7.502890194163603, 'epoch': 2.914285714285714})

In [None]:
# Save the Model and Tokenizer
model.save_pretrained("./gpt2_finetuned_20241108")
tokenizer.save_pretrained("./gpt2_finetuned_20241108")

In [None]:
# !zip -r /kaggle/working/gpt2_finetuned_20241107.zip gpt2_finetuned_20241107

In [None]:
# INFERENCE
model.eval()

# Check for GPU and move model to device if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define Inference Function

def generate_response(messages, max_length=512, temperature=0.7, top_k=50, top_p=0.9):
    """
    Generates a response from the fine-tuned model based on a sequence of messages.

    Args:
    - messages (list): A list of dicts with `role` and `content` for each message.
    - max_length (int): Maximum number of tokens to generate.
    - temperature (float): Controls creativity; higher values make output more random.
    - top_k (int): Limits the sampling pool to the top K tokens.
    - top_p (float): Controls nucleus sampling; uses the smallest number of words with cumulative probability >= top_p.

    Returns:
    - str: The generated response from the assistant.

    """

    # Format the conversation into a single input string

    prompt = ""
    for message in messages:
        if message['role'] == 'system':
            prompt += f"<|system|> {message['content']} <|endoftext|>\n"
        elif message['role'] == 'user':
            prompt += f"<|user|> {message['content']} <|endoftext|>\n"
        elif message['role'] == 'assistant':
            prompt += f"<|assistant|> {message['content']} <|endoftext|>\n"
        
    print(prompt)
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    print("inputs token size:", len(inputs['input_ids'][0]))

    # Generate output using the model
    # https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig

    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            max_length=max_length,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            # num_beams=2,
            # early_stopping=True,
            # repetition_penalty=1.2,
            # length_penalty=0.9,
            # output_scores=True,
            # return_dict_in_generate=True,
        )


    # Decode the output to get the response text
    response_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the assistant's response (after the last "<|assistant|>")
    # response_start = response_text.rfind("<|assistant|>") + len("<|assistant|>")
    # response = response_text[response_start:].strip()
    return response_text, inputs



# Example Conversation
# messages = [

    # {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."},

    # {"role": "user", "content": "What's the capital of France?"},

#     {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."},

#     {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}

# ]

messages = [
    {"role": "system", "content": "Marv is a factual chatbot."},
    {"role": "user", "content": "What's the capital of France?"},
]



# Generate and print the assistant's response
response, inputs = generate_response(messages, temperature=0.2)
print("Assistant:", response)

<|system|> Marv is a factual chatbot. <|endoftext|>
<|user|> What's the capital of France? <|endoftext|>

inputs token size: 23
Assistant:  Marv is a factual chatbot. 
 What's the capital of France? 
 The capital of France is Paris. 
