In [6]:
# COLAB / LOCAL: Install dependencies (run in a notebook cell or shell)
!pip install -q "transformers>=4.30.0" datasets tokenizers "accelerate>=0.20.0" bitsandbytes sentencepiece safetensors huggingface_hub


In [7]:
!pip install -q "datasets" "tokenizers" "transformers" "accelerate" "safetensors"


In [8]:
!pip install -q datasets tokenizers transformers accelerate safetensors


In [9]:
import torch
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
from transformers import (PreTrainedTokenizerFast,
                          GPT2Config, GPT2LMHeadModel,
                          DataCollatorForLanguageModeling,
                          TrainingArguments, Trainer)

# 1) Load WikiText-2 (works out-of-the-box)
#smaller dataset
ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
print("Dataset size (lines):", len(ds))

# 2) Train a BPE(Byte Pair Encoding) tokenizer on a subset of Wiki lines
def train_tokenizer(dataset, output_path="tokenizer.json", vocab_size=20000, sample_size=10000):
    def iterator():
        for i, ex in enumerate(dataset):
            text = ex.get("text") or ""
            if text.strip():
                yield text
            if i >= sample_size - 1:
                break

    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    trainer = trainers.BpeTrainer(vocab_size=vocab_size,
                                  special_tokens=["<pad>", "<bos>", "<eos>", "<unk>"])
    tokenizer.train_from_iterator(iterator(), trainer=trainer)
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.save(output_path)
    return output_path

tk_path = train_tokenizer(ds, output_path="tokenizer.json")
print(f"Tokenizer saved to {tk_path}")

# Wraps our custom tokenizer into a Transformers-compatible object
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    unk_token="<unk>", bos_token="<bos>",
    eos_token="<eos>", pad_token="<pad>"
)
print("Vocab size:", tokenizer.vocab_size)

# 3) Tokenize and group into blocks
block_size = 256

#Tokenizes dataset into token IDs
def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=False)
#Removes "text" column so only input_ids remain
tokenized = ds.map(tokenize_fn, batched=True, remove_columns=ds.column_names)

def group_texts(examples):
    all_ids = sum(examples["input_ids"], []) #Concatenates all tokens into one list
    total_len = (len(all_ids) // block_size) * block_size
    #Cuts them into non-overlapping chunks of block_size
    blocks = [all_ids[i : i + block_size] for i in range(0, total_len, block_size)]
    #Labels = same as inputs (because we’re predicting the next token)
    return {"input_ids": blocks, "labels": blocks.copy()}

lm_dataset = tokenized.map(group_texts, batched=True, remove_columns=tokenized.column_names) #Creates dataset ready for model training
print("Number of training blocks:", len(lm_dataset))

# 4) Creates a tiny GPT-2: Embedding size = 384 transformer layers attention heads Sequence length = 256 tokens
#Much smaller than real GPT-2 for faster training

config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=block_size,
    n_ctx=block_size,
    n_embd=384,
    n_layer=4,
    n_head=6,
)
model = GPT2LMHeadModel(config)

# 5) Training setup. Prepares batches for causal LM (not masked LM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) #mlm=False means predict next token, not fill in blanks

#Training hyperparameters: Batch size = 2 (very small to fit in memory) Accumulate gradients over 8 steps (effective batch = 16)
#Mixed precision (fp16) if GPU available
#Train for 5 epochs
#Save model after each epoch

training_args = TrainingArguments(
    output_dir="./mini_llama_wikitext2",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    fp16=torch.cuda.is_available(),
    num_train_epochs=5,
    logging_steps=100,
    save_strategy="epoch",
    push_to_hub=False,
)

#Trainer handles batching, optimization, checkpointing
#Actually trains your GPT-like model on WikiText-2
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

# 6) Generate sample. Tokenizes prompt and moves to GPU/CPU
#Uses sampling (top_k, top_p, temperature) to make text creative
#Decodes generated tokens to readable text

prompt = "In a distant future,"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
gen = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
)
print("Generated:", tokenizer.decode(gen[0], skip_special_tokens=True))


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset size (lines): 36718
Tokenizer saved to tokenizer.json
Vocab size: 20000


Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Number of training blocks: 9537


  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjaingaurav126[0m ([33mjaingaurav126-Shyam[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,8.4872
200,7.3567
300,7.0346
400,6.8784
500,6.7627
600,6.6634
700,6.5568
800,6.4826
900,6.4268
1000,6.3859


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated: I n Ġa Ġdistant Ġfuture , Ġin Ġthe Ġher Ġa Ġ" Ġ. ĠĊ Ġ= Ġ= ĠĊ ĠA ĠWar Ġ, Ġ" Ġ, Ġ" Ġthat Ġthe Ġ" Ġin Ġhim Ġ, Ġ" Ġfrom Ġthe Ġ" Ġ, Ġwith Ġ" Ġ, Ġ, Ġ" Ġ, Ġwhich Ġit Ġit Ġand Ġ" Ġ... Ġ. ĠĊ Ġ" Ġwas Ġthe Ġ. Ġ" Ġ, Ġand Ġ" Ġ, Ġon Ġtheir Ġsaid Ġthat Ġ" Ġ, Ġwas Ġfrom Ġthe Ġ, Ġand ĠI Ġ, Ġand Ġof Ġit Ġwith Ġthe Ġ" Ġ. Ġ" Ġ, Ġ, Ġas Ġthat Ġ" Ġ, Ġhe Ġit Ġthe Ġ" Ġ. Ġ, Ġ" Ġ" Ġ" Ġ, Ġ" Ġand Ġ: Ġ" Ġ' t Ġ' s Ġ. Ġ" Ġ, Ġis Ġ"


In [10]:
tokenizer.save_pretrained("./my_model")
model.save_pretrained("./my_model")


In [11]:
from huggingface_hub import login

# This will ask for your token (from https://huggingface.co/settings/tokens)
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
from huggingface_hub import login, Repository
import shutil

# 1️⃣ Login to Hugging Face
login(token="")  # Replace with your token

# 2️⃣ Clone your model repo
repo = Repository(
    local_dir="my_model_repo",
    clone_from="RadhaShyam/my-mini-llama"  # Replace with your repo ID
)

# 3️⃣ Copy model + tokenizer files to the repo folder
files_to_copy = [
    "config.json",
    "generation_config.json",
    "model.safetensors",          # <-- updated from pytorch_model.bin
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json"
]

for file_name in files_to_copy:
    shutil.copy(f"my_model/{file_name}", "my_model_repo/")

# 4️⃣ Push to Hugging Face Hub
repo.push_to_hub(commit_message="Initial model upload")


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/RadhaShyam/my-mini-llama into local empty directory.


Upload file model.safetensors:   0%|          | 1.00/56.8M [00:00<?, ?B/s]

To https://huggingface.co/RadhaShyam/my-mini-llama
   9791744..ef2e569  main -> main

   9791744..ef2e569  main -> main



'https://huggingface.co/RadhaShyam/my-mini-llama/commit/ef2e569afb35dc2c743d94a0cf7ff145c009513b'

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load model from HF Hub
tokenizer = AutoTokenizer.from_pretrained("RadhaShyam/my-mini-llama")
model = AutoModelForCausalLM.from_pretrained("RadhaShyam/my-mini-llama")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fix missing pad token if needed
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.eos_token_id

print("Type 'quit' to stop chatting.\n")

while True:
    user_input = input("You: ")
    if user_input.lower() == "quit":
        break

    # Chat prompt template
    prompt = f"User: {user_input}\nAssistant:"

    # Encode
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

    # Generate
    outputs = model.generate(
        **inputs,
        max_length=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    # Decode & clean
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    reply = text.split("Assistant:")[-1].strip()

    print(f"Bot: {reply}\n")


tokenizer_config.json:   0%|          | 0.00/973 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/764 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/59.5M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Type 'quit' to stop chatting.

You: Hi
Bot: U s er : ĠHi Ċ A ss istant : Ġ, Ġand Ġand Ġthat Ġthat Ġthat Ġthe Ġthe Ġof Ġthe Ġthe Ġ" Ġ, Ġ, Ġto Ġthe Ġthat Ġthe Ġ" Ġ. Ġ" Ġ" Ġ, Ġ" Ġthat Ġ" Ġ, Ġ" Ġ" Ġand Ġ" Ġ" Ġ, Ġ" Ġ. Ġ" Ġ, Ġ" Ġwas Ġ" Ġand Ġ" Ġ" Ġ, Ġ" Ġ, Ġand Ġ" Ġ" Ġ, Ġand Ġ" Ġ, Ġ" Ġ, Ġ, Ġ" Ġ, Ġand Ġthe Ġ" Ġand Ġ" Ġ, Ġ" Ġ, Ġ" Ġ, Ġ" Ġand Ġ" Ġ" Ġ" Ġ" Ġ, Ġthe Ġ" Ġand Ġ" Ġ, Ġ" Ġ" Ġ, Ġ" Ġ" Ġ" Ġ. Ġ" Ġ, Ġand Ġ" Ġ, Ġ" Ġ, Ġ" Ġ, Ġ" Ġ, Ġand Ġ" Ġ, Ġ" Ġ, Ġ" Ġ, Ġ" Ġ. Ġ" ĠĊ Ġ" Ġ, Ġ" Ġ. Ġ" ĠThe Ġ" Ġ, Ġ" Ġand Ġ" Ġ" Ġ, Ġ, Ġ" Ġ" Ġ, Ġ" Ġ, Ġ" Ġ, Ġ" Ġ" Ġ, Ġthat Ġ" Ġ, Ġ" Ġof Ġ" Ġ" Ġ, Ġand Ġ" Ġand Ġ" Ġ, Ġ" Ġ. Ġ" Ġ" Ġ" Ġ, Ġ" Ġand Ġ" Ġ, Ġ" Ġ. Ġ" Ġ" Ġ, Ġand Ġ" Ġ" Ġ, Ġ" Ġand Ġ" Ġ, Ġ" Ġ, Ġand Ġ" Ġ, Ġand Ġ" Ġand Ġ" Ġ, Ġ" Ġ. Ġ" Ġ, Ġ" Ġfor Ġthe Ġ" Ġ, Ġand Ġ"



KeyboardInterrupt: Interrupted by user

In [39]:
!rm -rf /root/.cache/huggingface

!rm -rf /content/*


In [None]:
#large datase (required 70 GB+ Space)

import torch
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
from transformers import (
    PreTrainedTokenizerFast, GPT2Config, GPT2LMHeadModel,
    DataCollatorForLanguageModeling, TrainingArguments, Trainer
)

# 1) Load a valid, available subset — e.g., Pile-CC
ds = load_dataset("ArmelR/the-pile-splitted", "Pile-CC", split="train")
print("Dataset size (lines):", len(ds))

# 2) Train tokenizer on a subset
def train_tokenizer(dataset, output_path="tokenizer.json", vocab_size=20000, sample_size=20000):
    def iterator():
        for i, ex in enumerate(dataset):
            text = ex.get("text") or ""
            if text.strip():
                yield text
            if i >= sample_size - 1:
                break

    tok = Tokenizer(models.BPE(unk_token="<unk>"))
    tok.pre_tokenizer = pre_tokenizers.ByteLevel()
    trainer = trainers.BpeTrainer(vocab_size=vocab_size,
                                  special_tokens=["<pad>", "<bos>", "<eos>", "<unk>"])
    tok.train_from_iterator(iterator(), trainer=trainer)
    tok.post_processor = processors.ByteLevel(trim_offsets=True)
    tok.save(output_path)
    return output_path

tk_path = train_tokenizer(ds, output_path="tokenizer.json")
print("Tokenizer saved to:", tk_path)

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    unk_token="<unk>", bos_token="<bos>",
    eos_token="<eos>", pad_token="<pad>"
)
print("Vocab Size:", tokenizer.vocab_size)

# 3) Tokenize and group into blocks
block_size = 256

def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=False)

tokenized = ds.map(tokenize_fn, batched=True, remove_columns=ds.column_names)

def group_texts(examples):
    all_ids = sum(examples["input_ids"], [])
    total_len = (len(all_ids) // block_size) * block_size
    blocks = [all_ids[i:i + block_size] for i in range(0, total_len, block_size)]
    return {"input_ids": blocks, "labels": blocks.copy()}

lm_dataset = tokenized.map(group_texts, batched=True, remove_columns=tokenized.column_names)
print("Number of training blocks:", len(lm_dataset))

# 4) Build a small GPT-style model
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=block_size, n_ctx=block_size,
    n_embd=384, n_layer=4, n_head=6
)
model = GPT2LMHeadModel(config)

# 5) Training setup
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
training_args = TrainingArguments(
    output_dir="./mini_llama_pilecc",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    fp16=torch.cuda.is_available(),
    num_train_epochs=3,
    logging_steps=100,
    save_strategy="epoch",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# 6) Generate a sample
prompt = "In a distant future,"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
gen = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_k=50, top_p=0.95, temperature=0.9,
)
print("Generated:", tokenizer.decode(gen[0], skip_special_tokens=True))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>