# **Install libraries**

In [4]:
!pip install transformers[torch] accelerate datasets



# **Import Libraries**

In [5]:
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

In [6]:
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


# **Load Dataset**

In [7]:
# Load your dataset (Ensure you have the dataset file uploaded to your Google Colab environment)
with open('/content/drive/MyDrive/Datasets/custom_dataset.txt', 'r', encoding='utf-8') as f:
  texts = f.readlines()

In [8]:
# Create a Dataset object
dataset = Dataset.from_dict({'text': texts})

In [9]:
# Reduce dataset size manually to avoid RAM issues
small_dataset = dataset.select(range(min(1000, len(dataset))))  # Use the first 1000 examples or less if dataset is smaller

In [10]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [11]:
# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

In [12]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

In [13]:
tokenized_datasets = small_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [14]:
# Set up data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [15]:
# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [16]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,  # Set to 1 epoch for demonstration
    per_device_train_batch_size=1,  # Reduce batch size to prevent RAM issues
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

In [17]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

In [18]:
# Train the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=5, training_loss=3.250331497192383, metrics={'train_runtime': 4.1096, 'train_samples_per_second': 1.217, 'train_steps_per_second': 1.217, 'total_flos': 326615040000.0, 'train_loss': 3.250331497192383, 'epoch': 1.0})

In [19]:
# Save the fine-tuned model
model.save_pretrained('./fine-tuned-gpt2')
tokenizer.save_pretrained('./fine-tuned-gpt2')

('./fine-tuned-gpt2/tokenizer_config.json',
 './fine-tuned-gpt2/special_tokens_map.json',
 './fine-tuned-gpt2/vocab.json',
 './fine-tuned-gpt2/merges.txt',
 './fine-tuned-gpt2/added_tokens.json')

In [20]:
# Generate text with the fine-tuned model
from transformers import pipeline

In [21]:
generator = pipeline('text-generation', model='./fine-tuned-gpt2', tokenizer=tokenizer)

In [22]:
prompt = "Once upon a time"
generated_text = generator(prompt, max_length=100, num_return_sequences=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [23]:
print(generated_text)

[{'generated_text': 'Once upon a time, in the days beginning with the beginning of the New World Wars, in Europe, under the protection of the royal family, an elite faction led by a mysterious figure ruled the world.\n\nThere were several reasons for the creation of the new world that are known only as the New World. The idea of a new world began in the early 19th century with the discovery of the ancient ruins of Rome and Venice, and the discovery of what may become the first industrial city in'}]
