In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
import os
from tqdm import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU Model: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU Model: Tesla P100-PCIE-16GB


In [2]:
os.environ["WANDB_DISABLED"] = "true"
model_name = "gpt2"
print(f"Loading {model_name} model and tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id
model = model.to(device)

Loading gpt2 model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
def load_and_tokenize_datasets(debug_mode=True):

    print("Loading CNN/DailyMail dataset...")
    summarization = load_dataset("giuliadc/cnndm-filtered")
    
    print("Loading SQuAD dataset...")
    qa = load_dataset("squad")
    
    if debug_mode:
        print("Debug mode: Using small subset of data")
        summarization = {
            'train': summarization['train'].select(range(100)),
            'validation': summarization['validation'].select(range(20))
        }
        qa = {
            'train': qa['train'].select(range(100)),
            'validation': qa['validation'].select(range(20))
        }
    
  
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token
    

    def prepare_summarization(examples):
        texts = [f"Article: {article}\nSummary: {summary}" 
                for article, summary in zip(examples['article'], examples['highlights'])]
        return tokenizer(texts, truncation=True, max_length=512, padding="max_length")
    
   
    def prepare_qa(examples):
        texts = [f"Question: {question}\nContext: {context}\nAnswer: {answer['text'][0]}"
                for question, context, answer in zip(examples['question'], examples['context'], examples['answers'])]
        return tokenizer(texts, truncation=True, max_length=512, padding="max_length")
    
    print("Processing datasets...")
    tokenized_summ = summarization['train'].map(
        prepare_summarization,
        remove_columns=summarization['train'].column_names,
        batched=True
    )
    
    tokenized_qa = qa['train'].map(
        prepare_qa,
        remove_columns=qa['train'].column_names,
        batched=True
    )
    
   
    combined_dataset = concatenate_datasets([tokenized_summ, tokenized_qa])
    
    print(f"Total examples: {len(combined_dataset)}")
    return combined_dataset, tokenizer

In [9]:
combined_dataset, tokenizer = load_and_tokenize_datasets(debug_mode=True)  
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.config.pad_token_id = model.config.eos_token_id
model = model.to(device)

Loading CNN/DailyMail dataset...
Loading SQuAD dataset...
Debug mode: Using small subset of data




Processing datasets...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Total examples: 200


In [10]:
training_args = TrainingArguments(
    output_dir="./gpt2_finetuned",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    fp16=True,
    logging_steps=100,
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none"  
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=combined_dataset,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
)

print("\nStarting training...")
trainer.train()

print("\nSaving model...")
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")
print("\nTraining completed successfully!")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Starting training...


Step,Training Loss



Saving model...

Training completed successfully!


In [12]:
import pandas as pd
import numpy as np

def evaluate_model(model, eval_dataset, tokenizer):
    eval_args = TrainingArguments(
        output_dir="./eval_results",
        do_train=False,
        do_eval=True,
        per_device_eval_batch_size=8,
        report_to="none"
    )
    
    evaluator = Trainer(
        model=model,
        args=eval_args,
        eval_dataset=eval_dataset,
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False
        )
    )
    
    print("Running evaluation...")
    metrics = evaluator.evaluate()
    
    perplexity = np.exp(metrics['eval_loss'])
    cross_entropy = metrics['eval_loss']
    
    print("\nEvaluation Results:")
    print(f"Perplexity: {perplexity:.2f}")
    print(f"Cross Entropy Loss: {cross_entropy:.4f}")
    
    return perplexity, cross_entropy

print("Evaluating fine-tuned model...")
perplexity, cross_entropy = evaluate_model(model, combined_dataset, tokenizer)

results = {
    'Model': ['GPT (Fine-tuned)'],
    'Perplexity': [perplexity],
    'Cross Entropy Loss': [cross_entropy]
}

comparison_df = pd.DataFrame(results)
print("\nModel Comparison Results:")
print(comparison_df)

comparison_df.to_csv('model_comparison_results.csv', index=False)

Evaluating fine-tuned model...
Running evaluation...


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Evaluation Results:
Perplexity: 18.48
Cross Entropy Loss: 2.9166

Model Comparison Results:
              Model  Perplexity  Cross Entropy Loss
0  GPT (Fine-tuned)   18.477742            2.916567
