In [2]:
# Fetch the project sources; with no target directory given, git clones into
# ./fusion-gpt relative to the notebook's current working directory.
!git clone https://github.com/ankush357159/fusion-gpt.git

Cloning into 'fusion-gpt'...
remote: Enumerating objects: 278, done.[K
remote: Counting objects: 100% (278/278), done.[K
remote: Compressing objects: 100% (196/196), done.[K
remote: Total 278 (delta 124), reused 215 (delta 63), pack-reused 0 (from 0)[K
Receiving objects: 100% (278/278), 67.56 KiB | 1.09 MiB/s, done.
Resolving deltas: 100% (124/124), done.


In [22]:
# Make the picogpt directory the working directory: later cells rely on it for
# `python -m src....` module lookups and the relative `checkpoints/` path.
%cd /content/fusion-gpt/picogpt
# Bring the checkout up to date with the remote branch.
!git pull

/content/fusion-gpt/picogpt
remote: Enumerating objects: 13, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (1/1), done.[K
remote: Total 7 (delta 6), reused 7 (delta 6), pack-reused 0 (from 0)[K
Unpacking objects: 100% (7/7), 623 bytes | 311.00 KiB/s, done.
From https://github.com/ankush357159/fusion-gpt
   27f1179..6ddcadc  main       -> origin/main
Updating 27f1179..6ddcadc
Fast-forward
 picogpt/data/raw/data.py    | 14 [32m+++++++[m[31m-------[m
 picogpt/test_generate.ipynb | 11 [32m+++++++++++[m
 2 files changed, 18 insertions(+), 7 deletions(-)


In [23]:
import sys

# Make the project's top-level packages (data, src, configs) importable from
# the notebook. The membership check keeps the cell idempotent: re-running it
# no longer stacks duplicate entries onto sys.path.
PICOGPT_ROOT = "/content/fusion-gpt/picogpt"
if PICOGPT_ROOT not in sys.path:
    sys.path.append(PICOGPT_ROOT)

In [24]:
from data.raw.data import download_books, combine_books

# 1. Download the raw .txt files from Project Gutenberg.
#    Per the recorded log, books already present on disk are skipped.
download_books()

# 2. Process, clean markers, and merge into 'combined_novels.txt'.
#    NOTE(review): the return type of combine_books() is not visible here; the
#    next cell re-reads the combined file from disk into the same name.
input_books = combine_books()

Downloading books...
- Skipping gatsby (already downloaded)
Download complete

Cleaning and combining texts...
Dataset ready!
Saved to: /content/fusion-gpt/picogpt/data/raw/combined_novels.txt
Total characters: 271,227
Approx size: 0.27 MB


In [25]:
from pathlib import Path

# Location of the merged corpus written by the combine step.
combined_path = Path("/content/fusion-gpt/picogpt/data/raw/combined_novels.txt")

# Read the whole corpus into memory as UTF-8 text.
with combined_path.open(mode="r", encoding="utf-8") as corpus_file:
    input_books = corpus_file.read()

# Report the size and show the opening of the text as a quick sanity check.
print("Loaded characters:", len(input_books))
print(input_books[:500])  # preview


Loaded characters: 271227
The Great Gatsby
                                  by
                          F. Scott Fitzgerald


                           Table of Contents

I
II
III
IV
V
VI
VII
VIII
IX


                              Once again
                                  to
                                 Zelda


  Then wear the gold hat, if that will move her;
  If you can bounce high, bounce for her too,
  Till she cry “Lover, gold-hatted, high-bouncing lover,
  I must have you!”

  Thomas Parke d’Invilliers




In [26]:
# Tokenize the combined corpus and split it 90% / 5% / 5% into train, val and
# test token arrays; the recorded output shows them written as
# train_tokens.npy, val_tokens.npy and test_tokens.npy under --output_dir.
!python -m src.training.prepare_splits \
  --input_text /content/fusion-gpt/picogpt/data/raw/combined_novels.txt \
  --train_split 0.9 --val_split 0.05 --test_split 0.05 \
  --output_dir /content/fusion-gpt/picogpt/data/processed

Split complete:
- Train tokens: /content/fusion-gpt/picogpt/data/processed/train_tokens.npy (70,217)
- Val tokens:   /content/fusion-gpt/picogpt/data/processed/val_tokens.npy (3,900)
- Test tokens:  /content/fusion-gpt/picogpt/data/processed/test_tokens.npy (3,902)


In [27]:
import numpy as np
import os

train_tokens_path = '/content/fusion-gpt/picogpt/data/processed/train_tokens.npy'

# Sanity-check the training token file before starting a training run:
# confirm it exists, report its size on disk, then try to load it.
if not os.path.exists(train_tokens_path):
    print(f"File '{train_tokens_path}' does not exist. This is unexpected.")
else:
    print(f"File '{train_tokens_path}' exists.")
    file_size = os.path.getsize(train_tokens_path)
    print(f"File size: {file_size} bytes")

    # Loading is the real test: a truncated or empty .npy surfaces here.
    try:
        loaded_tokens = np.load(train_tokens_path)
        print(f"Successfully loaded '{train_tokens_path}'.")
        print(f"Shape of loaded tokens: {loaded_tokens.shape}")
        print(f"First 10 tokens: {loaded_tokens[:10]}")
    except EOFError:
        print(f"EOFError: '{train_tokens_path}' appears to be empty or corrupted.")
    except Exception as err:
        print(f"An error occurred while loading '{train_tokens_path}': {err}")

File '/content/fusion-gpt/picogpt/data/processed/train_tokens.npy' exists.
File size: 561864 bytes
Successfully loaded '/content/fusion-gpt/picogpt/data/processed/train_tokens.npy'.
Shape of loaded tokens: (70217,)
First 10 tokens: [ 464 3878  402 1381 1525  198  220  220  220  220]


In [28]:
# Train PicoGPT from scratch for one epoch on the pre-tokenized train_tokens.npy.
# Per the recorded log, the trainer splits these 70,217 tokens again
# (63,195 train / 3,511 val / 3,511 test), so val_tokens.npy and
# test_tokens.npy from the previous step are not consumed by this command.
# NOTE(review): confirm whether this second split is intended.
!python -m src.training.train \
  --tokens_path /content/fusion-gpt/picogpt/data/processed/train_tokens.npy \
  --block_size 128 --batch_size 32 --epochs 1

Training PicoGPT from scratch
Total tokens: 70,217
Train tokens: 63,195
Val tokens:   3,511
Test tokens:  3,511
Epoch 1/1 [train]: 100% 1970/1970 [05:46<00:00,  5.68it/s, loss=2.8097]
Epoch 1/1 [val]: 100% 105/105 [00:06<00:00, 16.49it/s, loss=5.2361]
Epoch 1 | Train Loss: 3.6761 | Val Loss: 5.2074
Saved new best checkpoint.
Scratch training complete.


In [29]:
import torch

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [30]:
from configs.GPTConfig import GPTConfig
from src.tokenizer.tokenizer import TiktokenTokenizer

# Tokenizer using the "gpt2" encoding; its vocabulary size sizes the model.
tokenizer = TiktokenTokenizer(encoding_name="gpt2")

# Architecture hyperparameters for the model that will receive the checkpoint.
# block_size matches the --block_size 128 passed to the training command.
# NOTE(review): the remaining values presumably mirror the trainer's defaults;
# verify against src.training.train.
model_hparams = {
    "vocab_size": tokenizer.vocab_size,
    "block_size": 128,
    "embed_dim": 256,
    "num_heads": 8,
    "num_layers": 6,
    "use_rope": False,
    "gpt2_compatible": False,
}
config = GPTConfig(**model_hparams)
config

<configs.GPTConfig.GPTConfig at 0x7ae59d72f6b0>

In [31]:
from src.model.pico_gpt import PicoGPT

# Build an untrained model with the configured architecture on the chosen device.
model = PicoGPT(config).to(device)
model.block_size = config.block_size

# Path is relative to the current working directory set by the earlier %cd.
checkpoint_path = "checkpoints/picogpt_best.pt"
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
# The checkpoint is a plain state dict, so nothing else is needed from it.
state_dict = torch.load(checkpoint_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
# Switch to inference mode before generating text.
model.eval()
"Checkpoint loaded"

'Checkpoint loaded'

In [35]:
from src.inference.generate import generate

# Ask the trained model to continue a short prompt by up to 100 new tokens
# and print the resulting text.
prompt = "Please generate short poem"
output = generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    device=device,
)
print(output)

Please generate short poem,
and you oughtnervous: “I killed a girl so it from get half
had care what kind of him—”

“I’m talk to ask too.”

“You know.”

“You’re a great people of interesting fellow’s been going to endure within Tom.”

“Was to get some time.”

“Wake me.


In [34]:
import torch

checkpoint_path = "checkpoints/picogpt_best.pt"
# Load onto the CPU so the inspection works without a GPU. weights_only=True
# restricts unpickling to tensors and plain containers, which is all a state
# dict holds, and avoids executing arbitrary pickled code.
state_dict = torch.load(checkpoint_path, map_location='cpu', weights_only=True)

# List every parameter/buffer name stored in the checkpoint.
print("Keys in the state_dict:")
for key in state_dict.keys():
    print(key)

print("\nSample weights (first parameter found):")
# Get the first key and its corresponding tensor
first_param_key = next(iter(state_dict))
first_param_tensor = state_dict[first_param_key]

print(f"Parameter name: {first_param_key}")
print(f"Shape: {first_param_tensor.shape}")
print(f"Sample values (first 10 elements):\n{first_param_tensor.flatten()[:10]}\n")

# Also show one specific attention weight. This checkpoint names its layers
# `blocks.<i>.attn.qkv.*`; the previously used
# 'transformer.h.0.attn.c_attn.weight' key never appears in it, so that
# branch was silently skipped.
attn_key = 'blocks.0.attn.qkv.weight'
if attn_key in state_dict:
    attn_weight = state_dict[attn_key]
    print(f"Parameter name: {attn_key}")
    print(f"Shape: {attn_weight.shape}")
    print(f"Sample values (first 10 elements):\n{attn_weight.flatten()[:10]}")

Keys in the state_dict:
embeddings.token_emb.weight
embeddings.pos_emb.weight
blocks.0.ln1.weight
blocks.0.ln1.bias
blocks.0.ln2.weight
blocks.0.ln2.bias
blocks.0.attn.mask
blocks.0.attn.qkv.weight
blocks.0.attn.qkv.bias
blocks.0.attn.proj.weight
blocks.0.attn.proj.bias
blocks.0.ff.net.0.weight
blocks.0.ff.net.0.bias
blocks.0.ff.net.2.weight
blocks.0.ff.net.2.bias
blocks.1.ln1.weight
blocks.1.ln1.bias
blocks.1.ln2.weight
blocks.1.ln2.bias
blocks.1.attn.mask
blocks.1.attn.qkv.weight
blocks.1.attn.qkv.bias
blocks.1.attn.proj.weight
blocks.1.attn.proj.bias
blocks.1.ff.net.0.weight
blocks.1.ff.net.0.bias
blocks.1.ff.net.2.weight
blocks.1.ff.net.2.bias
blocks.2.ln1.weight
blocks.2.ln1.bias
blocks.2.ln2.weight
blocks.2.ln2.bias
blocks.2.attn.mask
blocks.2.attn.qkv.weight
blocks.2.attn.qkv.bias
blocks.2.attn.proj.weight
blocks.2.attn.proj.bias
blocks.2.ff.net.0.weight
blocks.2.ff.net.0.bias
blocks.2.ff.net.2.weight
blocks.2.ff.net.2.bias
blocks.3.ln1.weight
blocks.3.ln1.bias
blocks.3.ln2.weig

This code loads the `state_dict` from the checkpoint file. It then prints all the keys (names of the parameters) available in the state dictionary. Finally, it demonstrates how to access and print the shape and a few sample values from the first parameter in the dictionary, and also for a specific attention layer weight if it exists.