<a href="https://colab.research.google.com/github/arvi1999/Authenticate/blob/master/CTM_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print("--- Installing the latest compatible fine-tuning libraries... ---")

!pip install -q -U \
    "peft==0.17.1" \
    "bitsandbytes==0.48.2" \
    "trl==0.24.0" \
    "datasets==4.3.0" \
    "transformers_stream_generator==0.0.5" \
    "wandb==0.22.3"

print("\n\n✅ Installation Complete.")
print("🔴🔴🔴 CRITICAL: The session will now restart to load the new libraries. 🔴🔴🔴")

# # This command automatically restarts the Colab runtime.
# import os
# os.kill(os.getpid(), 9)

--- Installing the latest compatible fine-tuning libraries... ---
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers_stream_generator (setup.py) ... [?25l[?25hdone


✅ Installation Complete.
🔴🔴🔴 CRITICAL: The session will now restart to load the new libraries. 🔴🔴🔴


In [2]:
from google.colab import drive
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

import torch
import transformers
import peft
import bitsandbytes
import accelerate
import numpy
import sklearn
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer
from transformers.trainer_utils import get_last_checkpoint
import os
import wandb

os.environ['WANDB_API_KEY'] = '403c23dfbc2a40c0a5e57f8e583a96c747bf1e1d'

from peft import LoraConfig

print("--- Verifying Environment ---")
print("✅✅✅ Environment is set up correctly! ✅✅✅")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version used by PyTorch: {torch.version.cuda}")

print("\n--- Library Versions ---")
print(f"  - Transformers: {transformers.__version__}")
print(f"  - PEFT:         {peft.__version__}")
print(f"  - accelerate:   {accelerate.__version__}")
print(f"  - bitsandbytes: {bitsandbytes.__version__}")
print(f"  - numpy:        {numpy.__version__}")
print(f"  - scikit-learn: {sklearn.__version__}")
print(f"  - wandb:        {wandb.__version__}")
print("---------------------------\n")


# --- 2. Login wandb
wandb.login()


# --- 3. Configuration ---
model_id = "microsoft/Phi-3-mini-4k-instruct"
# output_dir = "/content/phi3_finetuned_job_category"
## Mounting on the google drive for persistent storage
output_dir = "/content/drive/MyDrive/model/phi3-mini-test-14k-v1"
# dataset_path = "/content/drive/MyDrive/model/training/title_category_training_data.json"
dataset_path = "/content/drive/MyDrive/model/training/ctm-training-jh-v1.json"

# --- 4. Load Dataset ---
print(f"--- Loading dataset from: {dataset_path} ---")
dataset = load_dataset('json', data_files=dataset_path, split='train')
print("Dataset loaded successfully.")

# --- 5. Load Tokenizer and Model ---
print("\n--- Loading Tokenizer and 4-bit Quantized Model ---")
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model.config.use_cache = False
print("Model loaded successfully.")


# --- 6. Format Dataset with Chat Template ---
def format_chat_template(example):
    system_prompt = """You are an expert job categorizer. Your task is to identify the most appropriate primary category for a given job title. Respond only with the category name. Do not include any other text or explanation. Available categories: Technology, Healthcare, Finance, Marketing, Sales, Management, Customer Service, Human Resources, Business, Creative, Legal, Engineering, Education, Architecture."""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Job Title: {example['instruction']}"},
        {"role": "assistant", "content": example['output']}
    ]
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)
    return example

print("\n--- Formatting dataset with chat template... ---")
formatted_dataset = dataset.map(format_chat_template)


# --- 7. Configure LoRA and Training Arguments ---
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    logging_steps=5,
    save_strategy="steps",
    save_steps=5,
    save_total_limit=3,
    bf16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    report_to="wandb",  # Tell the trainer to log to wandb
    run_name=f"phi3-job-classifier-14k-v1",
    # max_seq_length=512,
)

# --- 8. Initialize and Start Training ---
print("\n--- Initializing SFTTrainer ---")
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,
    peft_config=lora_config,
    # dataset_text_field="text",
    # max_seq_length=512,
    # tokenizer=tokenizer,
    # packing=True,
)

print("\n--- 🚀 Starting Fine-Tuning! 🚀 ---")
# trainer.train()
# Check if a checkpoint exists
last_checkpoint = get_last_checkpoint(training_args.output_dir)

# The trainer will now resume from the last checkpoint if it exists,
# or start from scratch if it does not.
trainer.train(resume_from_checkpoint=last_checkpoint)
# trainer.train(resume_from_checkpoint=True)
print("\n--- ✅ Fine-Tuning Complete! ---")

# --- 8. Save the Final LoRA Adapters ---
print(f"\n--- Saving LoRA adapters to {output_dir} ---")
trainer.save_model(output_dir)
print("Adapters saved successfully. You can download them from the 'Files' sidebar.")

Mounted at /content/drive
Google Drive mounted successfully.
--- Verifying Environment ---
✅✅✅ Environment is set up correctly! ✅✅✅
PyTorch version: 2.8.0+cu126
CUDA available: True
CUDA version used by PyTorch: 12.6

--- Library Versions ---
  - Transformers: 4.57.1
  - PEFT:         0.17.1
  - accelerate:   1.11.0
  - bitsandbytes: 0.48.2
  - numpy:        2.0.2
  - scikit-learn: 1.6.1
  - wandb:        0.22.3
---------------------------



  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Currently logged in as: [33mlovejerry[0m ([33mjob-hai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


--- Loading dataset from: /content/drive/MyDrive/model/training/ctm-training-jh-v1.json ---


Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded successfully.

--- Loading Tokenizer and 4-bit Quantized Model ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


modeling_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Model loaded successfully.

--- Formatting dataset with chat template... ---


Map:   0%|          | 0/14017 [00:00<?, ? examples/s]


--- Initializing SFTTrainer ---


Adding EOS to train dataset:   0%|          | 0/14017 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/14017 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/14017 [00:00<?, ? examples/s]


--- 🚀 Starting Fine-Tuning! 🚀 ---




Step,Training Loss
1965,0.149
1970,0.1574
1975,0.1355
1980,0.1436
1985,0.1374
1990,0.157
1995,0.1419
2000,0.152
2005,0.1379
2010,0.1555



--- ✅ Fine-Tuning Complete! ---

--- Saving LoRA adapters to /content/drive/MyDrive/model/phi3-mini-test-14k-v1 ---
Adapters saved successfully. You can download them from the 'Files' sidebar.
