In [None]:
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers>=4.49.0
!pip install -q datasets>=2.14.0
!pip install -q accelerate>=0.25.0
!pip install -q peft>=0.17.0
!pip install -q bitsandbytes>=0.41.3
!pip install -q trl>=0.17.0
!pip install -q sentencepiece>=0.1.99
!pip install -q einops>=0.7.0

!pip install flash-attn --no-build-isolation

print("Packages installed!")

Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/8.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m6.3/8.4 MB[0m [31m190.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m124.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.8.3-cp312-cp312-linux_x86_64.whl size=256040057 sha256=f25da18657a87fc83dc1bfb8b7751b82246e9db355510226b674fd437c34b5fb
  Stored in directory: /root/.cache/pip/wheels/3d/59/46/f282c12c73dd4bb3c2e3fe199f1a0d0f8cec06df0cccfeee27
Successfully built flash-attn
Installing collected packages: flash-attn
Successfully installed flash-attn

## Mounting Google Drive to Access Data

In [None]:
# Mounting Google Drive to access the data

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    AutoConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    PeftModel,
    TaskType
)
from trl import SFTTrainer

## Training Config

In [None]:
# Training Config

# Model paths
MODEL_NAME = "google/gemma-3-12b-it"
MODEL_DIR = "/content/drive/MyDrive/298b/Gemma3_LoRA_FT"
DATA_PATH = "/content/drive/MyDrive/298b/data.json"

# LoRA settings (higher rank because i'm using A100)
LORA_R = 128
LORA_ALPHA = 256
LORA_DROPOUT = 0.5
TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj"
]

# Training settings
NUM_EPOCHS = 1
BATCH_SIZE = 4
GRADIENT_ACCUMULATION = 2
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 1024
WARMUP_RATIO = 0.03

USE_FLASH_ATTENTION = True  # Set False if not using flash attention
OPTIMIZER = "adamw_torch_fused"

## Loading the Q&A Pairs and Preparing It

In [None]:
# Loading and Preparing the data

def load_and_prepare_data(data_path):
    """Load Q&A data and format for training"""
    print(f"Loading data from {data_path}...")

    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Formats as Gemma's instruction template
    formatted_data = []
    for item in data:
        conversations = item['conversations']
        if len(conversations) >= 2:
            question = conversations[0]['value']
            answer = conversations[1]['value']
            text = f"""<start_of_turn>user
                  {question}<end_of_turn>
                  <start_of_turn>model
                  {answer}<end_of_turn>"""
            formatted_data.append({"text": text})

    print(f"Prepared {len(formatted_data)} training examples")

    # Creates a dataset and training/validation split
    dataset = Dataset.from_list(formatted_data)
    dataset = dataset.train_test_split(test_size=0.1, seed=42)

    print(f"  Train: {len(dataset['train'])} examples")
    print(f"  Validation: {len(dataset['test'])} examples")

    return dataset

# Load the data
dataset = load_and_prepare_data(DATA_PATH)

Loading data from /content/drive/MyDrive/298b/data.json...
Prepared 747 training examples
  Train: 672 examples
  Validation: 75 examples


## Setting up Gemma 3 and the Tokenizer

In [None]:
from transformers import AutoProcessor, AutoModelForCausalLM, AutoConfig

def setup_model_and_tokenizer(model_name, use_flash_attention=True):
    """Load Gemma 3 model and extract tokenizer from processor"""
    print(f"Loading model: {model_name}")

    # Loads the processor first
    processor = AutoProcessor.from_pretrained(model_name, use_fast=False)

    # Extracts the the tokenizer from the processor
    tokenizer = processor.tokenizer

    # Manually set eos_token and pad_token
    if not hasattr(tokenizer, 'eos_token') or tokenizer.eos_token is None:
        tokenizer.eos_token = "<eos>"

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    print(f"Extracted tokenizer with eos_token: {tokenizer.eos_token}")

    # Loading the model
    config = AutoConfig.from_pretrained(model_name)
    if use_flash_attention:
        try:
            config.attn_implementation = "flash_attention_2"
            print("Flash Attention 2 enabled")
        except:
            print("Flash Attention not available")

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )

    # Explicitly move to cuda
    model = model.to("cuda")

    model.gradient_checkpointing_enable()

    print("Model loaded successfully!")
    print(f"Model device: {next(model.parameters()).device}")

    return model, tokenizer

# Load model and tokenizer
model, tokenizer = setup_model_and_tokenizer(MODEL_NAME, USE_FLASH_ATTENTION)

Loading model: google/gemma-3-12b-it


processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

Extracted tokenizer with eos_token: <eos>


config.json:   0%|          | 0.00/916 [00:00<?, ?B/s]

Flash Attention 2 enabled


`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/109k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Model loaded successfully!
Model device: cuda:0


## Setting up LoRA's Parameters

In [None]:
# Applies LoRA

def apply_lora(model, r, alpha, dropout, target_modules):
    """Apply LoRA adapters to the model"""
    print("Applying LoRA configuration...")

    lora_config = LoraConfig(
        r=r,
        lora_alpha=alpha,
        target_modules=target_modules,
        lora_dropout=dropout,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )

    model = get_peft_model(model, lora_config)

    # Shows the trainable parameters
    model.print_trainable_parameters()

    return model

# Applies LoRA
model = apply_lora(model, LORA_R, LORA_ALPHA, LORA_DROPOUT, TARGET_MODULES)

Applying LoRA configuration...
trainable params: 547,651,584 || all params: 12,734,976,624 || trainable%: 4.3004


## Trainer Arguments for Training

In [None]:
# Trainer Arguments for Training

training_args = TrainingArguments(
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    optim=OPTIMIZER,  # Using adamw_torch_fused for A100
    save_strategy="no",  # Not saving checkpoints while training
    logging_steps=21,
    learning_rate=LEARNING_RATE,
    weight_decay=0.1,
    fp16=False,
    bf16=True,  # Use bfloat16 for A100
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=WARMUP_RATIO,
    group_by_length=True,  # Group sequences of similar length
    lr_scheduler_type="cosine",  # Constant learning rate (no decay)
    eval_strategy="steps",  # Evaluate every N steps
    eval_steps=21,  # Will eval at steps 21, 42, 63, 84
    load_best_model_at_end=False,  # No checkpoints to load from
    gradient_checkpointing=True,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    report_to="wandb",  # wandb for tracking
    push_to_hub=False
)

## Tokenizing the Dataset

In [None]:
from transformers import Trainer, DataCollatorForLanguageModeling


def tokenize_function(examples):
    """Tokenize the text data"""
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=1024,
        padding="max_length",
        return_tensors=None,  # Returns lists, not tensors
    )

print("Tokenizing datasets...")
tokenized_train = dataset["train"].map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)
tokenized_eval = dataset["test"].map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)
print("Datasets tokenized")


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal LM, not masked LM
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)

print(f"\nTotal training steps: ~{len(tokenized_train) // (BATCH_SIZE * GRADIENT_ACCUMULATION) * NUM_EPOCHS}")

Tokenizing datasets...


Map:   0%|          | 0/672 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Datasets tokenized

Total training steps: ~84


## Training loop using Trainer
Only one epoch

In [None]:
# The actual training

print("STARTING TRAINING")

trainer.train()

print("TRAINING COMPLETE!")

STARTING TRAINING


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mthomas-dvorochkin[0m ([33mthomas-dvorochkin-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
21,2.5377,2.162667
42,2.0759,2.109036
63,2.0474,2.055864
84,2.0304,2.059772


TRAINING COMPLETE!


## Saving weights

In [None]:
import os
import time

print(f"Saving to {MODEL_DIR}...")
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

time.sleep(2)  # Waits for Google Drive sync

files = os.listdir(MODEL_DIR)

Saving to /content/drive/MyDrive/298b/Gemma3_LoRA_FT...


## Testing Generation between base and fine-tuned models

In [None]:
import torch
import textwrap
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
import gc

# Config
MODEL_NAME = "google/gemma-3-12b-it"
MODEL_DIR = "/content/drive/MyDrive/298b/Gemma3_LoRA_FT"

# Load models
print("Loading fine-tuned model...")
finetuned_model = AutoPeftModelForCausalLM.from_pretrained(
    MODEL_DIR,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Models loaded\n")

# Comparison function
def compare(question, use_personality_prompt=False):
    wrapper = textwrap.TextWrapper(width=80, break_long_words=False, replace_whitespace=False)

    print(f"\nQUESTION: {question}")

    # Fine-tuned model
    print("\n" + "="*80)
    print("FINE-TUNED MODEL:")
    print("="*80)

    if use_personality_prompt:
     ft_prompt = f"""<start_of_turn>user
You are Neil deGrasse Tyson, astrophysicist and director of the Hayden Planetarium. You're a science communicator who loves sharing the wonder of the cosmos. Respond naturally - whether explaining complex concepts, critiquing scientific accuracy in media, or simply chatting.

{question}<end_of_turn>
<start_of_turn>model
"""
    else:
        ft_prompt = f"""<start_of_turn>user
    {question}<end_of_turn>
    <start_of_turn>model
    """

    finetuned_model.eval()
    inputs = tokenizer(ft_prompt, return_tensors="pt").to(finetuned_model.device)

    with torch.no_grad():
        outputs = finetuned_model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode and extract only the new tokens (skip the prompt)
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    response = response.replace("<end_of_turn>", "").strip()

    for line in response.split('\n'):
        if line.strip():
            print('\n'.join(wrapper.wrap(line)))

    # Base model
    print("\n" + "="*80)
    print("BASE MODEL:")
    print("="*80)

    base_prompt = f"""<start_of_turn>user
{question}<end_of_turn>
<start_of_turn>model
"""

    base_model.eval()
    base_inputs = tokenizer(base_prompt, return_tensors="pt").to(base_model.device)

    with torch.no_grad():
        outputs = base_model.generate(
            **base_inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode and extract only the new tokens (skip the prompt)
    base_response = tokenizer.decode(outputs[0][base_inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    base_response = base_response.replace("<end_of_turn>", "").strip()

    for line in base_response.split('\n'):
        if line.strip():
            print('\n'.join(wrapper.wrap(line)))

# Test questions
questions = [
   "What do you think about black holes?",
    "Can you tell me a bit about yourself and what you do?",
    "Why is space exploration important?",
    "What's scientifically wrong about Star Wars",
    "Can you critique the physicis in Marvel movies?",
    "Explain moons to me.",
    "Hey neil, how are you?",
    "Can you tell me about sports cars?",
    "Can I run to the moon?"
]


# Run comparisons
print("TESTING: Fine-tuned vs Base")

for i, q in enumerate(questions, 1):
    print(f"TEST {i}/{len(questions)}")
    compare(q, use_personality_prompt=True)

print("\nComplete")

# Clear memory
del finetuned_model
del base_model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

Loading fine-tuned model...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Loading base model...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Loading tokenizer...
Models loaded

TESTING: Fine-tuned vs Base
TEST 1/9

QUESTION: What do you think about black holes?

FINE-TUNED MODEL:
Black holes are awesome - the ultimate cosmic vacuum cleaners. They're not
really 'holes' in space, but regions where gravity is so strong that nothing,
not even light, can escape. The event horizon is the point of no return - once
you cross it, you're gone. But here's what's really cool: black holes don't suck
things in like a cosmic drain. If you were falling into one, you'd just keep
falling normally until you hit the event horizon. You wouldn't feel anything
weird until you got close to the singularity, where the laws of physics as we
know them break down. And don't worry about them 'sucking' Earth into the Sun -
that's not how gravity works.

BASE MODEL:
Okay, let's talk about black holes! They're absolutely fascinating and mind-
bending objects, and a huge source of wonder (and a bit of existential dread)
for scientists and the public alike. 

## Merging FT weights with the base model to upload to HF




In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import os
from huggingface_hub import login

# Force everything to Colab local storage
os.environ['HF_HOME'] = '/content/hf_cache'
os.environ['TRANSFORMERS_CACHE'] = '/content/hf_cache'


login()

# Loading base model
base_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-3-12b-it",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    cache_dir="/content/hf_cache"
)

# Load LoRA
model = PeftModel.from_pretrained(
    base_model,
    "/content/drive/MyDrive/298b/Gemma3_LoRA_FT"
)

# Merge
merged_model = model.merge_and_unload()

# Save
merged_model.save_pretrained("/content/Gemma3_NDT_Merged")
tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-3-12b-it",
    cache_dir="/content/hf_cache"
)
tokenizer.save_pretrained("/content/Gemma3_NDT_Merged")

print("Done! Saved to /content/Gemma3_NDT_Merged")

# Cleanup
del base_model
del model
del merged_model
import gc
gc.collect()
torch.cuda.empty_cache()

print("\nMerge complete! Ready to upload!")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Done! Saved to /content/Gemma3_NDT_Merged

Merge complete! Ready to upload!


## Upload

In [None]:
from huggingface_hub import HfApi

api = HfApi()

# Upload
api.upload_folder(
    folder_path="/content/Gemma3_NDT_Merged",
    repo_id="tdvoroch/gemma3-ndt-merged",
    repo_type="model"
)

print("Done! https://huggingface.co/tdvoroch/gemma3-ndt-merged")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...DT_Merged/tokenizer.model: 100%|##########| 4.69MB / 4.69MB            

  ...NDT_Merged/tokenizer.json:  75%|#######5  | 25.2MB / 33.4MB            

  ...0002-of-00005.safetensors:   0%|          |  602kB / 4.93GB            

  ...0005-of-00005.safetensors:   0%|          |  610kB / 4.60GB            

  ...0003-of-00005.safetensors:   0%|          |  602kB / 4.93GB            

  ...0004-of-00005.safetensors:   0%|          |  602kB / 4.93GB            

  ...0001-of-00005.safetensors:   1%|          | 33.5MB / 4.98GB            

Done! https://huggingface.co/tdvoroch/gemma3-ndt-merged


## Below is configuring issues with model upload.
Gemma 3 multimodal and we just need the text generation aspect.
Had to change the naming convention.

These fixes were done at different runtimes which explains the reimporting we have in each cell.



In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from safetensors.torch import load_file, save_file
from huggingface_hub import hf_hub_download
import os
from tqdm import tqdm

model_id = "tdvoroch/gemma3-ndt-merged"
output_dir = "/content/gemma3_fixed"
os.makedirs(output_dir, exist_ok=True)

# Get list of weight files
files = ["model-00001-of-00005.safetensors",
         "model-00002-of-00005.safetensors",
         "model-00003-of-00005.safetensors",
         "model-00004-of-00005.safetensors",
         "model-00005-of-00005.safetensors"]

print("Starting weight renaming...")

for file in tqdm(files):
    print(f"\nProcessing {file}...")

    # Download file
    file_path = hf_hub_download(repo_id=model_id, filename=file)

    # Load weights
    weights = load_file(file_path)

    # Rename keys: language_model.* -> model.*
    renamed_weights = {}
    for key, value in weights.items():
        if key.startswith("language_model."):
            new_key = key.replace("language_model.", "model.", 1)
            renamed_weights[new_key] = value
            print(f"  Renamed: {key} -> {new_key}")
        else:
            renamed_weights[key] = value

    # Save renamed weights
    output_path = os.path.join(output_dir, file)
    save_file(renamed_weights, output_path)
    print(f"  Saved to {output_path}")

    # Clean up
    del weights
    del renamed_weights
    torch.cuda.empty_cache()

print("\nAll weight files renamed!")

Starting weight renaming...


  0%|          | 0/5 [00:00<?, ?it/s]


Processing model-00001-of-00005.safetensors...


model-00001-of-00005.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

  Renamed: language_model.model.embed_tokens.weight -> model.model.embed_tokens.weight
  Renamed: language_model.model.layers.0.input_layernorm.weight -> model.model.layers.0.input_layernorm.weight
  Renamed: language_model.model.layers.0.mlp.down_proj.weight -> model.model.layers.0.mlp.down_proj.weight
  Renamed: language_model.model.layers.0.mlp.gate_proj.weight -> model.model.layers.0.mlp.gate_proj.weight
  Renamed: language_model.model.layers.0.mlp.up_proj.weight -> model.model.layers.0.mlp.up_proj.weight
  Renamed: language_model.model.layers.0.post_attention_layernorm.weight -> model.model.layers.0.post_attention_layernorm.weight
  Renamed: language_model.model.layers.0.post_feedforward_layernorm.weight -> model.model.layers.0.post_feedforward_layernorm.weight
  Renamed: language_model.model.layers.0.pre_feedforward_layernorm.weight -> model.model.layers.0.pre_feedforward_layernorm.weight
  Renamed: language_model.model.layers.0.self_attn.k_norm.weight -> model.model.layers.0.sel

 20%|██        | 1/5 [00:21<01:27, 21.98s/it]

  Saved to /content/gemma3_fixed/model-00001-of-00005.safetensors

Processing model-00002-of-00005.safetensors...


model-00002-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

  Renamed: language_model.model.layers.10.input_layernorm.weight -> model.model.layers.10.input_layernorm.weight
  Renamed: language_model.model.layers.10.mlp.down_proj.weight -> model.model.layers.10.mlp.down_proj.weight
  Renamed: language_model.model.layers.10.mlp.gate_proj.weight -> model.model.layers.10.mlp.gate_proj.weight
  Renamed: language_model.model.layers.10.mlp.up_proj.weight -> model.model.layers.10.mlp.up_proj.weight
  Renamed: language_model.model.layers.10.post_attention_layernorm.weight -> model.model.layers.10.post_attention_layernorm.weight
  Renamed: language_model.model.layers.10.post_feedforward_layernorm.weight -> model.model.layers.10.post_feedforward_layernorm.weight
  Renamed: language_model.model.layers.10.pre_feedforward_layernorm.weight -> model.model.layers.10.pre_feedforward_layernorm.weight
  Renamed: language_model.model.layers.10.self_attn.k_norm.weight -> model.model.layers.10.self_attn.k_norm.weight
  Renamed: language_model.model.layers.10.self_att

 40%|████      | 2/5 [00:48<01:13, 24.65s/it]

  Saved to /content/gemma3_fixed/model-00002-of-00005.safetensors

Processing model-00003-of-00005.safetensors...


model-00003-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

  Renamed: language_model.model.layers.15.input_layernorm.weight -> model.model.layers.15.input_layernorm.weight
  Renamed: language_model.model.layers.15.mlp.down_proj.weight -> model.model.layers.15.mlp.down_proj.weight
  Renamed: language_model.model.layers.15.post_attention_layernorm.weight -> model.model.layers.15.post_attention_layernorm.weight
  Renamed: language_model.model.layers.15.post_feedforward_layernorm.weight -> model.model.layers.15.post_feedforward_layernorm.weight
  Renamed: language_model.model.layers.15.pre_feedforward_layernorm.weight -> model.model.layers.15.pre_feedforward_layernorm.weight
  Renamed: language_model.model.layers.16.input_layernorm.weight -> model.model.layers.16.input_layernorm.weight
  Renamed: language_model.model.layers.16.mlp.down_proj.weight -> model.model.layers.16.mlp.down_proj.weight
  Renamed: language_model.model.layers.16.mlp.gate_proj.weight -> model.model.layers.16.mlp.gate_proj.weight
  Renamed: language_model.model.layers.16.mlp.up

 60%|██████    | 3/5 [01:16<00:52, 26.32s/it]

  Saved to /content/gemma3_fixed/model-00003-of-00005.safetensors

Processing model-00004-of-00005.safetensors...


model-00004-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

  Renamed: language_model.model.layers.26.input_layernorm.weight -> model.model.layers.26.input_layernorm.weight
  Renamed: language_model.model.layers.26.mlp.down_proj.weight -> model.model.layers.26.mlp.down_proj.weight
  Renamed: language_model.model.layers.26.post_attention_layernorm.weight -> model.model.layers.26.post_attention_layernorm.weight
  Renamed: language_model.model.layers.26.post_feedforward_layernorm.weight -> model.model.layers.26.post_feedforward_layernorm.weight
  Renamed: language_model.model.layers.26.pre_feedforward_layernorm.weight -> model.model.layers.26.pre_feedforward_layernorm.weight
  Renamed: language_model.model.layers.27.input_layernorm.weight -> model.model.layers.27.input_layernorm.weight
  Renamed: language_model.model.layers.27.mlp.down_proj.weight -> model.model.layers.27.mlp.down_proj.weight
  Renamed: language_model.model.layers.27.mlp.gate_proj.weight -> model.model.layers.27.mlp.gate_proj.weight
  Renamed: language_model.model.layers.27.mlp.up

 80%|████████  | 4/5 [01:42<00:26, 26.26s/it]

  Saved to /content/gemma3_fixed/model-00004-of-00005.safetensors

Processing model-00005-of-00005.safetensors...


model-00005-of-00005.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

  Renamed: language_model.model.layers.37.input_layernorm.weight -> model.model.layers.37.input_layernorm.weight
  Renamed: language_model.model.layers.37.mlp.down_proj.weight -> model.model.layers.37.mlp.down_proj.weight
  Renamed: language_model.model.layers.37.post_attention_layernorm.weight -> model.model.layers.37.post_attention_layernorm.weight
  Renamed: language_model.model.layers.37.post_feedforward_layernorm.weight -> model.model.layers.37.post_feedforward_layernorm.weight
  Renamed: language_model.model.layers.37.pre_feedforward_layernorm.weight -> model.model.layers.37.pre_feedforward_layernorm.weight
  Renamed: language_model.model.layers.38.input_layernorm.weight -> model.model.layers.38.input_layernorm.weight
  Renamed: language_model.model.layers.38.mlp.down_proj.weight -> model.model.layers.38.mlp.down_proj.weight
  Renamed: language_model.model.layers.38.mlp.gate_proj.weight -> model.model.layers.38.mlp.gate_proj.weight
  Renamed: language_model.model.layers.38.mlp.up

100%|██████████| 5/5 [02:12<00:00, 26.41s/it]

  Saved to /content/gemma3_fixed/model-00005-of-00005.safetensors

All weight files renamed!





In [None]:
from huggingface_hub import HfApi

api = HfApi()

# Create new repo
api.create_repo(
    repo_id="tdvoroch/gemma3-ndt-merged-fixed",
    repo_type="model",
    private=False
)

print("Repo created")

Repo created


In [None]:
api.upload_folder(
       folder_path="/content/gemma3_fixed",
       repo_id="tdvoroch/gemma3-ndt-merged-fixed",
       repo_type="model"
   )

print("Upload complete!")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mma3_fixed/tokenizer.json:   1%|1         |  469kB / 33.4MB            

  ...ma3_fixed/tokenizer.model: 100%|##########| 4.69MB / 4.69MB            

  ...0002-of-00005.safetensors:   0%|          |  602kB / 4.93GB            

  ...0005-of-00005.safetensors:   0%|          |  610kB / 4.60GB            

  ...0001-of-00005.safetensors:   1%|          | 42.0MB / 4.98GB            

  ...0004-of-00005.safetensors:   1%|          | 33.5MB / 4.93GB            

  ...0003-of-00005.safetensors:   1%|          | 33.5MB / 4.93GB            

Upload complete!


In [None]:
from safetensors import safe_open

# Check first shard
file_path = "/content/gemma3_fixed/model-00001-of-00005.safetensors"

with safe_open(file_path, framework="pt") as f:
    keys = f.keys()
    # Print first 20 weight names
    for i, key in enumerate(list(keys)[:20]):
        print(key)

model.model.embed_tokens.weight
model.model.layers.0.input_layernorm.weight
model.model.layers.0.mlp.down_proj.weight
model.model.layers.0.mlp.gate_proj.weight
model.model.layers.0.mlp.up_proj.weight
model.model.layers.0.post_attention_layernorm.weight
model.model.layers.0.post_feedforward_layernorm.weight
model.model.layers.0.pre_feedforward_layernorm.weight
model.model.layers.0.self_attn.k_norm.weight
model.model.layers.0.self_attn.k_proj.weight
model.model.layers.0.self_attn.o_proj.weight
model.model.layers.0.self_attn.q_norm.weight
model.model.layers.0.self_attn.q_proj.weight
model.model.layers.0.self_attn.v_proj.weight
model.model.layers.1.input_layernorm.weight
model.model.layers.1.mlp.down_proj.weight
model.model.layers.1.mlp.gate_proj.weight
model.model.layers.1.mlp.up_proj.weight
model.model.layers.1.post_attention_layernorm.weight
model.model.layers.1.post_feedforward_layernorm.weight


In [None]:
import torch
from safetensors.torch import load_file, save_file
from huggingface_hub import hf_hub_download
import os
from tqdm import tqdm

output_dir = "/content/gemma3_fixed_v2"
os.makedirs(output_dir, exist_ok=True)

# Gets the list of weight files
files = ["model-00001-of-00005.safetensors",
         "model-00002-of-00005.safetensors",
         "model-00003-of-00005.safetensors",
         "model-00004-of-00005.safetensors",
         "model-00005-of-00005.safetensors"]

print("Starting weight renaming (removing extra 'model.' prefix)...")

for file in tqdm(files):
    print(f"\nProcessing {file}...")

    # Loading from your previous fixed version
    file_path = f"/content/gemma3_fixed/{file}"

    # Loading weights
    weights = load_file(file_path)

    # Rename keys: model.model.* -> model.*
    renamed_weights = {}
    for key, value in weights.items():
        if key.startswith("model.model."):
            # Remove the first "model." prefix
            new_key = key.replace("model.model.", "model.", 1)
            renamed_weights[new_key] = value
            print(f"  Renamed: {key} -> {new_key}")
        else:
            renamed_weights[key] = value

    # Saves the renamed weights
    output_path = os.path.join(output_dir, file)
    save_file(renamed_weights, output_path)
    print(f"  Saved to {output_path}")

    # Clean up
    del weights
    del renamed_weights
    torch.cuda.empty_cache()

print("\nAll weight files renamed!")

Starting weight renaming (removing extra 'model.' prefix)...


  0%|          | 0/5 [00:00<?, ?it/s]


Processing model-00001-of-00005.safetensors...
  Renamed: model.model.embed_tokens.weight -> model.embed_tokens.weight
  Renamed: model.model.layers.0.input_layernorm.weight -> model.layers.0.input_layernorm.weight
  Renamed: model.model.layers.0.mlp.down_proj.weight -> model.layers.0.mlp.down_proj.weight
  Renamed: model.model.layers.0.mlp.gate_proj.weight -> model.layers.0.mlp.gate_proj.weight
  Renamed: model.model.layers.0.mlp.up_proj.weight -> model.layers.0.mlp.up_proj.weight
  Renamed: model.model.layers.0.post_attention_layernorm.weight -> model.layers.0.post_attention_layernorm.weight
  Renamed: model.model.layers.0.post_feedforward_layernorm.weight -> model.layers.0.post_feedforward_layernorm.weight
  Renamed: model.model.layers.0.pre_feedforward_layernorm.weight -> model.layers.0.pre_feedforward_layernorm.weight
  Renamed: model.model.layers.0.self_attn.k_norm.weight -> model.layers.0.self_attn.k_norm.weight
  Renamed: model.model.layers.0.self_attn.k_proj.weight -> model.l

 20%|██        | 1/5 [00:10<00:41, 10.37s/it]

  Saved to /content/gemma3_fixed_v2/model-00001-of-00005.safetensors

Processing model-00002-of-00005.safetensors...
  Renamed: model.model.layers.10.input_layernorm.weight -> model.layers.10.input_layernorm.weight
  Renamed: model.model.layers.10.mlp.down_proj.weight -> model.layers.10.mlp.down_proj.weight
  Renamed: model.model.layers.10.mlp.gate_proj.weight -> model.layers.10.mlp.gate_proj.weight
  Renamed: model.model.layers.10.mlp.up_proj.weight -> model.layers.10.mlp.up_proj.weight
  Renamed: model.model.layers.10.post_attention_layernorm.weight -> model.layers.10.post_attention_layernorm.weight
  Renamed: model.model.layers.10.post_feedforward_layernorm.weight -> model.layers.10.post_feedforward_layernorm.weight
  Renamed: model.model.layers.10.pre_feedforward_layernorm.weight -> model.layers.10.pre_feedforward_layernorm.weight
  Renamed: model.model.layers.10.self_attn.k_norm.weight -> model.layers.10.self_attn.k_norm.weight
  Renamed: model.model.layers.10.self_attn.k_proj.wei

 40%|████      | 2/5 [00:23<00:35, 11.96s/it]

  Saved to /content/gemma3_fixed_v2/model-00002-of-00005.safetensors

Processing model-00003-of-00005.safetensors...
  Renamed: model.model.layers.15.input_layernorm.weight -> model.layers.15.input_layernorm.weight
  Renamed: model.model.layers.15.mlp.down_proj.weight -> model.layers.15.mlp.down_proj.weight
  Renamed: model.model.layers.15.post_attention_layernorm.weight -> model.layers.15.post_attention_layernorm.weight
  Renamed: model.model.layers.15.post_feedforward_layernorm.weight -> model.layers.15.post_feedforward_layernorm.weight
  Renamed: model.model.layers.15.pre_feedforward_layernorm.weight -> model.layers.15.pre_feedforward_layernorm.weight
  Renamed: model.model.layers.16.input_layernorm.weight -> model.layers.16.input_layernorm.weight
  Renamed: model.model.layers.16.mlp.down_proj.weight -> model.layers.16.mlp.down_proj.weight
  Renamed: model.model.layers.16.mlp.gate_proj.weight -> model.layers.16.mlp.gate_proj.weight
  Renamed: model.model.layers.16.mlp.up_proj.weight

 60%|██████    | 3/5 [00:43<00:31, 15.80s/it]

  Saved to /content/gemma3_fixed_v2/model-00003-of-00005.safetensors

Processing model-00004-of-00005.safetensors...
  Renamed: model.model.layers.26.input_layernorm.weight -> model.layers.26.input_layernorm.weight
  Renamed: model.model.layers.26.mlp.down_proj.weight -> model.layers.26.mlp.down_proj.weight
  Renamed: model.model.layers.26.post_attention_layernorm.weight -> model.layers.26.post_attention_layernorm.weight
  Renamed: model.model.layers.26.post_feedforward_layernorm.weight -> model.layers.26.post_feedforward_layernorm.weight
  Renamed: model.model.layers.26.pre_feedforward_layernorm.weight -> model.layers.26.pre_feedforward_layernorm.weight
  Renamed: model.model.layers.27.input_layernorm.weight -> model.layers.27.input_layernorm.weight
  Renamed: model.model.layers.27.mlp.down_proj.weight -> model.layers.27.mlp.down_proj.weight
  Renamed: model.model.layers.27.mlp.gate_proj.weight -> model.layers.27.mlp.gate_proj.weight
  Renamed: model.model.layers.27.mlp.up_proj.weight

 80%|████████  | 4/5 [01:01<00:16, 16.42s/it]

  Saved to /content/gemma3_fixed_v2/model-00004-of-00005.safetensors

Processing model-00005-of-00005.safetensors...
  Renamed: model.model.layers.37.input_layernorm.weight -> model.layers.37.input_layernorm.weight
  Renamed: model.model.layers.37.mlp.down_proj.weight -> model.layers.37.mlp.down_proj.weight
  Renamed: model.model.layers.37.post_attention_layernorm.weight -> model.layers.37.post_attention_layernorm.weight
  Renamed: model.model.layers.37.post_feedforward_layernorm.weight -> model.layers.37.post_feedforward_layernorm.weight
  Renamed: model.model.layers.37.pre_feedforward_layernorm.weight -> model.layers.37.pre_feedforward_layernorm.weight
  Renamed: model.model.layers.38.input_layernorm.weight -> model.layers.38.input_layernorm.weight
  Renamed: model.model.layers.38.mlp.down_proj.weight -> model.layers.38.mlp.down_proj.weight
  Renamed: model.model.layers.38.mlp.gate_proj.weight -> model.layers.38.mlp.gate_proj.weight
  Renamed: model.model.layers.38.mlp.up_proj.weight

100%|██████████| 5/5 [01:10<00:00, 14.02s/it]

  Saved to /content/gemma3_fixed_v2/model-00005-of-00005.safetensors

All weight files renamed!





In [None]:
import shutil
from huggingface_hub import HfApi

# Copy config files
files_to_copy = [
    "config.json",
    "generation_config.json",
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "tokenizer.model"
]

for file in files_to_copy:
    try:
        src = f"/content/gemma3_fixed/{file}"
        shutil.copy(src, os.path.join(output_dir, file))
        print(f"Copied {file}")
    except Exception as e:
        print(f"Skipped {file}: {e}")

Copied config.json
Copied generation_config.json
Copied tokenizer.json
Copied tokenizer_config.json
Copied special_tokens_map.json
Copied tokenizer.model


In [None]:
from huggingface_hub import HfApi

api = HfApi()

# Create new repo
api.create_repo(
    repo_id="tdvoroch/gemma3-ndt-merged-fixedv2",
    repo_type="model",
    private=False
)

print("Repo created")

api.upload_folder(
       folder_path="/content/gemma3_fixed",
       repo_id="tdvoroch/gemma3-ndt-merged-fixedv2",
       repo_type="model"
   )

print("Upload complete!")

Repo created


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ma3_fixed/tokenizer.model: 100%|##########| 4.69MB / 4.69MB            

  ...0001-of-00005.safetensors:   1%|          | 33.5MB / 4.98GB            

  ...mma3_fixed/tokenizer.json: 100%|##########| 33.4MB / 33.4MB            

  ...0004-of-00005.safetensors:   1%|          | 25.1MB / 4.93GB            

  ...0002-of-00005.safetensors:   1%|          | 25.1MB / 4.93GB            

  ...0003-of-00005.safetensors:   1%|          | 25.1MB / 4.93GB            

  ...0005-of-00005.safetensors:   1%|          | 25.1MB / 4.60GB            

Upload complete!


In [None]:
from safetensors import safe_open

# Check first shard
file_path = "/content/gemma3_fixed_v2/model-00001-of-00005.safetensors"

with safe_open(file_path, framework="pt") as f:
    keys = f.keys()
    # Print first 20 weight names
    for i, key in enumerate(list(keys)[:20]):
        print(key)

model.embed_tokens.weight
model.layers.0.input_layernorm.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.post_attention_layernorm.weight
model.layers.0.post_feedforward_layernorm.weight
model.layers.0.pre_feedforward_layernorm.weight
model.layers.0.self_attn.k_norm.weight
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.o_proj.weight
model.layers.0.self_attn.q_norm.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.v_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.post_attention_layernorm.weight
model.layers.1.post_feedforward_layernorm.weight


In [None]:
from huggingface_hub import hf_hub_download
from safetensors import safe_open
import json
import os

repo_id = "tdvoroch/gemma3-ndt-merged-fixedv2"
output_dir = "/content/gemma3_index_gen"
os.makedirs(output_dir, exist_ok=True)

print("Downloading and processing shards from HuggingFace...")

weight_map = {}
total_size = 0

files = ["model-00001-of-00005.safetensors",
         "model-00002-of-00005.safetensors",
         "model-00003-of-00005.safetensors",
         "model-00004-of-00005.safetensors",
         "model-00005-of-00005.safetensors"]

for file in files:
    print(f"Downloading {file}...")
    file_path = hf_hub_download(
        repo_id=repo_id,
        filename=file,
        cache_dir=output_dir
    )

    print(f"  Processing {file}...")
    with safe_open(file_path, framework="pt", device="cpu") as f:
        for key in f.keys():
            weight_map[key] = file

    total_size += os.path.getsize(file_path)
    print(f"Done")

print(f"\nTotal weights: {len(weight_map)}")
print(f"Total size: {total_size / (1024**3):.2f} GB")

# Create index
index_data = {
    "metadata": {
        "total_size": total_size
    },
    "weight_map": weight_map
}

# Save
index_path = os.path.join(output_dir, "model.safetensors.index.json")
with open(index_path, "w") as f:
    json.dump(index_data, f, indent=2)

print(f"\nSaved to: {index_path}")

Downloading and processing shards from HuggingFace...
Downloading model-00001-of-00005.safetensors...


model-00001-of-00005.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

  Processing model-00001-of-00005.safetensors...
Done
Downloading model-00002-of-00005.safetensors...


model-00002-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

  Processing model-00002-of-00005.safetensors...
Done
Downloading model-00003-of-00005.safetensors...


model-00003-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

  Processing model-00003-of-00005.safetensors...
Done
Downloading model-00004-of-00005.safetensors...


model-00004-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

  Processing model-00004-of-00005.safetensors...
Done
Downloading model-00005-of-00005.safetensors...


model-00005-of-00005.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

  Processing model-00005-of-00005.safetensors...
Done

Total weights: 1065
Total size: 22.70 GB

Saved to: /content/gemma3_index_gen/model.safetensors.index.json


In [None]:
from huggingface_hub import hf_hub_download
from safetensors import safe_open
import json
import os

repo_id = "tdvoroch/gemma3-ndt-merged-fixedv2"
output_dir = "/content/index_only"
os.makedirs(output_dir, exist_ok=True)

print("Generating corrected index file...")

weight_map = {}
total_size = 0

files = ["model-00001-of-00005.safetensors",
         "model-00002-of-00005.safetensors",
         "model-00003-of-00005.safetensors",
         "model-00004-of-00005.safetensors",
         "model-00005-of-00005.safetensors"]

for file in files:
    print(f"Processing {file}...")
    file_path = hf_hub_download(repo_id=repo_id, filename=file)

    with safe_open(file_path, framework="pt", device="cpu") as f:
        for key in f.keys():
            # Fix the name in the index
            if key.startswith("model.model."):
                fixed_key = key.replace("model.model.", "model.", 1)
            else:
                fixed_key = key

            weight_map[fixed_key] = file

    total_size += os.path.getsize(file_path)

# Saving the index with CORRECTED names
index_data = {
    "metadata": {"total_size": total_size},
    "weight_map": weight_map
}

index_path = os.path.join(output_dir, "model.safetensors.index.json")
with open(index_path, "w") as f:
    json.dump(index_data, f, indent=2)

print(f"\nSaved corrected index to: {index_path}")

Generating corrected index file...
Processing model-00001-of-00005.safetensors...


model-00001-of-00005.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Processing model-00002-of-00005.safetensors...


model-00002-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Processing model-00003-of-00005.safetensors...


model-00003-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Processing model-00004-of-00005.safetensors...


model-00004-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Processing model-00005-of-00005.safetensors...


model-00005-of-00005.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]


Saved corrected index to: /content/index_only/model.safetensors.index.json


In [None]:
from huggingface_hub import hf_hub_download
from safetensors import safe_open

repo_id = "tdvoroch/gemma3-ndt-merged-fixedv2"

print("Checking actual weight names in first shard...")
file_path = hf_hub_download(repo_id=repo_id, filename="model-00001-of-00005.safetensors")

with safe_open(file_path, framework="pt", device="cpu") as f:
    keys = list(f.keys())
    print(f"\nFirst 10 weight names:")
    for key in keys[:10]:
        print(f"  {key}")

Checking actual weight names in first shard...

First 10 weight names:
  model.model.embed_tokens.weight
  model.model.layers.0.input_layernorm.weight
  model.model.layers.0.mlp.down_proj.weight
  model.model.layers.0.mlp.gate_proj.weight
  model.model.layers.0.mlp.up_proj.weight
  model.model.layers.0.post_attention_layernorm.weight
  model.model.layers.0.post_feedforward_layernorm.weight
  model.model.layers.0.pre_feedforward_layernorm.weight
  model.model.layers.0.self_attn.k_norm.weight
  model.model.layers.0.self_attn.k_proj.weight


In [None]:
from safetensors import safe_open
from safetensors.torch import save_file
from huggingface_hub import hf_hub_download
import json
import os

source_repo = "tdvoroch/gemma3-ndt-merged-fixedv2"
work_dir = "/content/gemma3_v3"
os.makedirs(work_dir, exist_ok=True)

files = ["model-00001-of-00005.safetensors",
         "model-00002-of-00005.safetensors",
         "model-00003-of-00005.safetensors",
         "model-00004-of-00005.safetensors",
         "model-00005-of-00005.safetensors"]

weight_map = {}
total_size = 0

for file in files:
    print(f"Processing {file}")
    downloaded_path = hf_hub_download(repo_id=source_repo, filename=file)

    tensors = {}
    with safe_open(downloaded_path, framework="pt", device="cpu") as f:
        for key in f.keys():
            tensors[key] = f.get_tensor(key)

    renamed = {}
    for old_key, tensor in tensors.items():
        if old_key.startswith("model.model."):
            new_key = old_key.replace("model.model.", "model.", 1)
            renamed[new_key] = tensor
            weight_map[new_key] = file
        else:
            renamed[old_key] = tensor
            weight_map[old_key] = file

    output_path = os.path.join(work_dir, file)
    save_file(renamed, output_path)
    total_size += os.path.getsize(output_path)

index_data = {
    "metadata": {"total_size": total_size},
    "weight_map": weight_map
}

with open(os.path.join(work_dir, "model.safetensors.index.json"), "w") as f:
    json.dump(index_data, f, indent=2)

config_files = ["config.json", "generation_config.json", "tokenizer.json",
                "tokenizer_config.json", "special_tokens_map.json", "tokenizer.model",
                "added_tokens.json", "chat_template.jinja", "preprocessor_config.json",
                ".gitattributes", "README.md"]

for file in config_files:
    try:
        path = hf_hub_download(repo_id=source_repo, filename=file)
        output_path = os.path.join(work_dir, file)
        with open(path, 'rb') as src, open(output_path, 'wb') as dst:
            dst.write(src.read())
        print(f"Copied {file}")
    except:
        print(f"Skipped {file}")

print("All files prepared in /content/gemma3_final")

Processing model-00001-of-00005.safetensors
Processing model-00002-of-00005.safetensors
Processing model-00003-of-00005.safetensors
Processing model-00004-of-00005.safetensors
Processing model-00005-of-00005.safetensors


config.json: 0.00B [00:00, ?B/s]

Copied config.json


generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

Copied generation_config.json


tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

Copied tokenizer.json


tokenizer_config.json: 0.00B [00:00, ?B/s]

Copied tokenizer_config.json


special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Copied special_tokens_map.json


tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

Copied tokenizer.model


added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

Copied added_tokens.json


chat_template.jinja: 0.00B [00:00, ?B/s]

Copied chat_template.jinja


preprocessor_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

Copied preprocessor_config.json


.gitattributes: 0.00B [00:00, ?B/s]

Copied .gitattributes


README.md: 0.00B [00:00, ?B/s]

Copied README.md
All files prepared in /content/gemma3_final


In [None]:
from safetensors import safe_open
from safetensors.torch import save_file
from huggingface_hub import hf_hub_download
import json
import os

source_repo = "tdvoroch/gemma3-ndt-merged-fixedv2"
work_dir = "/content/gemma3_final"
os.makedirs(work_dir, exist_ok=True)

files = ["model-00001-of-00005.safetensors",
         "model-00002-of-00005.safetensors",
         "model-00003-of-00005.safetensors",
         "model-00004-of-00005.safetensors",
         "model-00005-of-00005.safetensors"]

weight_map = {}
total_size = 0

for file in files:
    print(f"Processing {file}")
    downloaded_path = hf_hub_download(repo_id=source_repo, filename=file)

    tensors = {}
    with safe_open(downloaded_path, framework="pt", device="cpu") as f:
        for key in f.keys():
            tensors[key] = f.get_tensor(key)

    renamed = {}
    for old_key, tensor in tensors.items():
        # Skip vision weights
        if old_key.startswith("vision_tower") or old_key.startswith("multi_modal_projector"):
            continue

        if old_key.startswith("model.model."):
            new_key = old_key.replace("model.model.", "model.", 1)
            renamed[new_key] = tensor
            weight_map[new_key] = file
        else:
            renamed[old_key] = tensor
            weight_map[old_key] = file

    output_path = os.path.join(work_dir, file)
    save_file(renamed, output_path)
    total_size += os.path.getsize(output_path)

index_data = {
    "metadata": {"total_size": total_size},
    "weight_map": weight_map
}

with open(os.path.join(work_dir, "model.safetensors.index.json"), "w") as f:
    json.dump(index_data, f, indent=2)

config_files = ["config.json", "generation_config.json", "tokenizer.json",
                "tokenizer_config.json", "special_tokens_map.json", "tokenizer.model",
                "added_tokens.json", "chat_template.jinja", "preprocessor_config.json",
                ".gitattributes", "README.md"]

for file in config_files:
    try:
        path = hf_hub_download(repo_id=source_repo, filename=file)
        output_path = os.path.join(work_dir, file)
        with open(path, 'rb') as src, open(output_path, 'wb') as dst:
            dst.write(src.read())
        print(f"Copied {file}")
    except:
        print(f"Skipped {file}")

print("Done - vision weights removed")

Processing model-00001-of-00005.safetensors
Processing model-00002-of-00005.safetensors
Processing model-00003-of-00005.safetensors
Processing model-00004-of-00005.safetensors
Processing model-00005-of-00005.safetensors
Copied config.json
Copied generation_config.json
Copied tokenizer.json
Copied tokenizer_config.json
Copied special_tokens_map.json
Copied tokenizer.model
Copied added_tokens.json
Copied chat_template.jinja
Copied preprocessor_config.json
Copied .gitattributes
Copied README.md
Done - vision weights removed


In [None]:
from huggingface_hub import HfApi

new_repo = "tdvoroch/gemma3-ndtv3"
work_dir = "/content/gemma3_final"

api = HfApi()
api.create_repo(repo_id=new_repo, repo_type="model", exist_ok=True)
api.upload_folder(folder_path=work_dir, repo_id=new_repo, repo_type="model")

print("Upload complete")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ma3_final/tokenizer.model: 100%|##########| 4.69MB / 4.69MB            

  ...0003-of-00005.safetensors:   1%|          | 33.5MB / 4.93GB            

  ...mma3_final/tokenizer.json:  75%|#######5  | 25.2MB / 33.4MB            

  ...0002-of-00005.safetensors:   1%|          | 25.1MB / 4.93GB            

  ...0005-of-00005.safetensors:   1%|          | 25.1MB / 4.60GB            

  ...0004-of-00005.safetensors:   1%|          | 25.1MB / 4.93GB            

  ...0001-of-00005.safetensors:   1%|          | 33.4MB / 4.14GB            

Upload complete
