In [None]:
import os

import wandb

# SECURITY: never commit API keys to a notebook. The key is read from the
# WANDB_API_KEY environment variable; any key that was previously hard-coded
# in this cell should be treated as leaked and rotated in the W&B settings.
# When the variable is unset, `key=None` lets wandb fall back to its own
# credential lookup (stored login or interactive prompt).
wandb_api_key = os.environ.get("WANDB_API_KEY")

# Login to wandb
wandb.login(key=wandb_api_key)


In [None]:
class KnowledgePreservationLoss(nn.Module):
    """Distillation loss that gives extra weight to masked ("math") positions.

    The loss is ``alpha * KL(math positions) + (1 - alpha) * KL(all positions)``,
    where KL is the divergence of the student distribution from the teacher
    distribution over the last dimension of the logits.

    Args:
        alpha: Weight of the masked-position term; ``1 - alpha`` weights the
            term computed over every position.
    """

    def __init__(self, alpha=0.7):
        super().__init__()
        self.alpha = alpha
        self.kl_div = nn.KLDivLoss(reduction='batchmean')

    def forward(self, student_logits, teacher_logits, math_mask):
        """Return the weighted KL distillation loss.

        Args:
            student_logits: Raw (unnormalised) student scores; the last
                dimension is normalised with softmax.
            teacher_logits: Raw teacher scores with the same shape.
            math_mask: Index/boolean mask applied to the leading dimension(s)
                of both logit tensors to select mathematical tokens.

        Returns:
            Scalar tensor with the combined loss.

        Note:
            ``reduction='batchmean'`` divides by the size of the first
            dimension only. NOTE(review): for (batch, seq, vocab) inputs this
            is a per-sequence rather than per-token mean; confirm that is
            intended.
        """
        # nn.KLDivLoss expects log-probabilities as input and probabilities as
        # target. Feeding raw logits (as before) does not compute a KL
        # divergence and is non-zero even for identical student and teacher.
        student_log_probs = nn.functional.log_softmax(student_logits, dim=-1)
        teacher_probs = nn.functional.softmax(teacher_logits, dim=-1)

        base_loss = self.kl_div(student_log_probs, teacher_probs)

        math_student = student_log_probs[math_mask]
        if math_student.numel() == 0:
            # No math tokens selected: 'batchmean' would divide by zero and
            # poison the whole loss with NaN, so the term contributes nothing.
            math_loss = base_loss.new_zeros(())
        else:
            math_loss = self.kl_div(math_student, teacher_probs[math_mask])

        return self.alpha * math_loss + (1 - self.alpha) * base_loss


In [None]:
class MobileLongformer(Module):
    """Runs a base model over a long input in overlapping windows.

    The input is split into windows of ``window_size`` positions. Each window
    is extended by up to ``overlap`` positions of look-ahead before it is
    passed to the base model; the look-ahead part of the output is discarded
    so that the concatenated result covers each input position exactly once.

    Args:
        base_model: Callable invoked as ``base_model(chunk, kv_cache=...)``.
    """

    def __init__(self, base_model):
        # Module state must be initialised before any attribute assignment,
        # otherwise sub-modules cannot be registered.
        super().__init__()
        # forward() reads self.base_model, so it has to be stored here.
        self.base_model = base_model
        self.window_size = 65536
        self.overlap = 512
        self.kv_cache = CompressedCache(num_bits=4, ecc_bits=2)

    def forward(self, input_ids):
        """Process ``input_ids`` window by window and concatenate the outputs.

        NOTE(review): slicing and ``len`` operate on the first dimension, so
        this assumes ``input_ids`` is indexed by sequence position first and
        that the base model's output is aligned with its input along that
        dimension — confirm against the caller.
        """
        # Process in chunks with overlapping regions
        outputs = []
        for start in range(0, len(input_ids), self.window_size):
            chunk = input_ids[start:start + self.window_size + self.overlap]
            chunk_out = self.base_model(chunk, kv_cache=self.kv_cache)
            # Keep only this window's own positions. Slicing with
            # [:-self.overlap] also cut real tokens from the final chunk,
            # which has little or no trailing overlap.
            outputs.append(chunk_out[:self.window_size])
        return torch.cat(outputs)


In [None]:
# `concatenate_datasets` is used below to merge the two corpora, so it has to
# be imported alongside `load_dataset`.
from datasets import concatenate_datasets, load_dataset

# Training splits of the two source corpora.
math_data = load_dataset("qwen/math-corpus-v2", split="train")
blockchain_data = load_dataset("web3/technical-docs", split="train")

def filter_non_technical(example):
    """Return True when the example's ``"category"`` is math or blockchain.

    Despite the name, this is a *keep* predicate: passed to ``Dataset.filter``
    it retains technical examples and drops everything else. Raises
    ``KeyError`` if the example has no ``"category"`` field.
    """
    technical_categories = ("math", "blockchain")
    category = example["category"]
    return category in technical_categories

# Merge both corpora, then keep only rows accepted by filter_non_technical.
train_data = (
    concatenate_datasets([math_data, blockchain_data])
    .filter(filter_non_technical)
)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import wandb

# Start the Weights & Biases run and record the experiment settings.
# NOTE(review): the logged base_model (1.5B) differs from the 7B teacher
# loaded below — confirm which one the run should report.
wandb.init(
    project="qwen-distill",
    config=dict(
        base_model="qwen/Qwen2.5-1.5B",
        target_size="600MB",
        max_length=200_000,
    ),
)

# Teacher: the full Qwen 2.5 7B checkpoint, placed on devices automatically.
teacher = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen2.5-7B",
    device_map="auto",
)

# Student: pruned from the teacher, keeping layers 4 through 12 inclusive.
student = create_pruned_model(teacher, keep_layers=list(range(4, 13)))

# Configure the trainer.
# `transformers` does not export a `MobileTrainingArguments` class and
# `TrainingArguments` has no `mobile_optimizations` option, so the previous
# import failed with ImportError. The standard `TrainingArguments` is used
# with the same hyperparameters; on-device (e.g. ARM) optimisation belongs to
# the export/deployment step, not to training configuration.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="qwen-math-distilled",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,  # effective batch of 32 sequences per device
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=True,
    optim="adafactor",
    report_to="wandb",
)


In [None]:
import os

from huggingface_hub import HfApi

# SECURITY: never commit access tokens to a notebook. The token is read from
# the HF_TOKEN environment variable; any token that was previously hard-coded
# in this cell should be treated as leaked and revoked. When the variable is
# unset, `token=None` lets huggingface_hub fall back to the locally stored
# login.
hf_token = os.environ.get("HF_TOKEN")

# Single source of truth for the target repository. Previously the repo was
# created under one namespace but the weights and README were pushed to a
# placeholder "username/..." repo.
REPO_ID = "Ayansk11/qwen-math-distilled"

api = HfApi(token=hf_token)
# exist_ok keeps the cell re-runnable after the repo has been created once.
api.create_repo(repo_id=REPO_ID, private=True, exist_ok=True)

student.push_to_hub(
    REPO_ID,
    commit_message="Initial distilled release",
    max_shard_size="200MB",
    token=hf_token,  # use the same credentials as the HfApi client
)

# Generate model card
with open("README.md", "w", encoding="utf-8") as f:
    f.write("""---
license: apache-2.0
base_model: qwen/Qwen2.5-1.5B
tags:
- mathematics
- blockchain
- mobile
---

# Distilled Qwen 2.5 Math/Blockchain Specialist
""")

api.upload_file(
    repo_id=REPO_ID,
    path_in_repo="README.md",
    path_or_fileobj="README.md"
)
