In [3]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [15]:
# %%
import re
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Preprocess text
def preprocess_text(text):
    text = re.sub(r"CHAPTER [IVXLCDM]+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"[^\w\s.,']", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [63]:
with open("/content/combined_sherlock_holmes.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

clean_text = preprocess_text(raw_text)

# Save cleaned text
with open("sherlock_cleaned.txt", "w", encoding="utf-8") as file:
    file.write(clean_text)

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


In [58]:
# Tokenization function
def tokenize_function(example):
    tokenized_output = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Add labels as a copy of input_ids
    tokenized_output["labels"] = tokenized_output["input_ids"].copy()
    return tokenized_output


In [18]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [19]:
# Train-test split
from sklearn.model_selection import train_test_split

train_texts, val_texts = train_test_split(sentences, test_size=0.2)

# Tokenize datasets
train_dataset = Dataset.from_dict({"text": train_texts}).map(tokenize_function, batched=True)
val_dataset = Dataset.from_dict({"text": val_texts}).map(tokenize_function, batched=True)

# Converts train_texts and val_texts into Hugging Face Dataset objects.
# Applies the tokenize_function (defined earlier) to each text entry:
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,  # Regularization to prevent overfitting.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


# Train the model
trainer.train()


Map:   0%|          | 0/14809 [00:00<?, ? examples/s]

Map:   0%|          | 0/3703 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.5587,0.544349
2,0.4952,0.53806
3,0.4703,0.540545


TrainOutput(global_step=5556, training_loss=0.520322955538679, metrics={'train_runtime': 1970.6792, 'train_samples_per_second': 22.544, 'train_steps_per_second': 2.819, 'total_flos': 2902105276416000.0, 'train_loss': 0.520322955538679, 'epoch': 3.0})

In [64]:
# Evaluation metrics
def top_k_accuracy(model, tokenizer, test_sequences, k=10, max_sequence_len=50):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    correct = 0
    total = 0

    for sequence in test_sequences:
        tokenized_sequence = tokenizer(sequence, return_tensors="pt", max_length=max_sequence_len, truncation=True)
        input_ids = tokenized_sequence["input_ids"].to(device)

        with torch.no_grad():
            outputs = model(input_ids)
            predictions = outputs.logits[0, -1]

        top_k_preds = predictions.topk(k).indices.tolist()
        true_word_id = input_ids[0, -1].item()

        if true_word_id in top_k_preds:
            correct += 1
        total += 1

    return correct / total if total > 0 else 0


In [65]:
def calculate_bleu(model, tokenizer, test_sequences, max_sequence_len=50):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    bleu_scores = []
    smoothing_function = SmoothingFunction().method1

    for sequence in test_sequences:
        tokenized_sequence = tokenizer(sequence, return_tensors="pt", max_length=max_sequence_len, truncation=True)
        input_ids = tokenized_sequence["input_ids"].to(device)

        with torch.no_grad():
            outputs = model(input_ids)
            predicted_token_id = outputs.logits[0, -1].argmax().item()

        predicted_token = tokenizer.decode([predicted_token_id]).lower().strip()
        true_token = tokenizer.decode([input_ids[0, -1].item()]).lower().strip()

        bleu_score = sentence_bleu([true_token.split()], predicted_token.split(), smoothing_function=smoothing_function)
        bleu_scores.append(bleu_score)

    return np.mean(bleu_scores)


In [66]:
def calculate_perplexity(model, tokenizer, test_sequences, max_sequence_len=50):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    total_loss = 0
    total_tokens = 0

    for sequence in test_sequences:
        tokenized_sequence = tokenizer(sequence, return_tensors="pt", max_length=max_sequence_len, truncation=True)
        input_ids = tokenized_sequence["input_ids"].to(device)
        attention_mask = tokenized_sequence["attention_mask"].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

        total_loss += loss.item() * input_ids.size(1)
        total_tokens += input_ids.size(1)

    avg_loss = total_loss / total_tokens
    perplexity = np.exp(avg_loss)
    return perplexity

In [68]:
test_sequences = clean_text[20000:21000]
print(f"Top-3 Accuracy: {top_k_accuracy(model, tokenizer, test_sequences):.2f}")
print(f"BLEU Score: {calculate_bleu(model, tokenizer, test_sequences):.2f}")
print(f"Perplexity: {calculate_perplexity(model, tokenizer, test_sequences):.2f}")

# Save the model
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

# Load the model for inference
from transformers import pipeline

text_generator = pipeline("text-generation", model="./gpt2-finetuned", tokenizer="./gpt2-finetuned")
print(text_generator("Sherlock Holmes was a detective who", max_length=50))


Top-3 Accuracy: 0.02
BLEU Score: 0.00
Perplexity: nan


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'Sherlock Holmes was a detective who investigated the disappearance of a woman who came to live in an isolated community and is now an active member of the crime scene team of a crime scene unit in London.\n\nIn 2010 Holmes was charged with attempting to'}]
