In [1]:

!pip install transformers datasets torch
!pip install accelerate


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset


In [3]:
from google.colab import files

# Upload dataset
uploaded = files.upload()

# Get filename dynamically
dataset_filename = list(uploaded.keys())[0]

# Load dataset
dataset = load_dataset("text", data_files={"train": dataset_filename})

# Display sample data
print(dataset["train"][0])


Saving dataset.txt to dataset.txt


Generating train split: 0 examples [00:00, ? examples/s]

{'text': '<!DOCTYPE html>'}


In [7]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token # Define the padding token

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/843 [00:00<?, ? examples/s]

In [8]:
model = GPT2LMHeadModel.from_pretrained("gpt2")


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_total_limit=2,
    # Either set evaluation_strategy to "no" if you don't want to evaluate during training
    # evaluation_strategy="no",
    # Or, provide an eval_dataset if you want to evaluate during training
    evaluation_strategy="epoch",
    # eval_dataset=tokenized_datasets["train"], # Removed from TrainingArguments
    weight_decay=0.01,
    logging_dir='./logs'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["train"], # Added to Trainer arguments
)




In [14]:
def generate_text(prompt, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
    model.to(inputs.input_ids.device)

    outputs = model.generate(**inputs, max_length=max_length, do_sample=True, top_k=50, top_p=0.95)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Try a sample prompt
print(generate_text("Once upon a time"))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, a man in the presence of the Lord began to question God about his desires, and, in so doing, brought down to him such a wrathful curse:

The Lord says: I will punish you all when you shall come into the presence of my people. The people of the Lord said:

"O Lord! we shall punish you, and with your curses come vengeance on you; we will not kill you, and we will not curse you.



In [15]:
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")

!zip -r fine_tuned_gpt2.zip fine_tuned_gpt2
files.download("fine_tuned_gpt2.zip")


  adding: fine_tuned_gpt2/ (stored 0%)
  adding: fine_tuned_gpt2/model.safetensors (deflated 7%)
  adding: fine_tuned_gpt2/config.json (deflated 51%)
  adding: fine_tuned_gpt2/tokenizer_config.json (deflated 56%)
  adding: fine_tuned_gpt2/special_tokens_map.json (deflated 74%)
  adding: fine_tuned_gpt2/generation_config.json (deflated 24%)
  adding: fine_tuned_gpt2/vocab.json (deflated 68%)
  adding: fine_tuned_gpt2/merges.txt (deflated 53%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
!zip -r fine_tuned_gpt2.zip fine_tuned_gpt2


updating: fine_tuned_gpt2/ (stored 0%)
updating: fine_tuned_gpt2/model.safetensors (deflated 7%)
updating: fine_tuned_gpt2/config.json (deflated 51%)
updating: fine_tuned_gpt2/tokenizer_config.json (deflated 56%)
updating: fine_tuned_gpt2/special_tokens_map.json (deflated 74%)
updating: fine_tuned_gpt2/generation_config.json (deflated 24%)
updating: fine_tuned_gpt2/vocab.json (deflated 68%)
updating: fine_tuned_gpt2/merges.txt (deflated 53%)


In [17]:
from google.colab import files
files.download("fine_tuned_gpt2.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained("fine_tuned_gpt2") # Changed to the correct local path
tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_gpt2") # Changed to the correct local path

# Generate text using the fine-tuned model
def generate_text(prompt, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with a prompt