<a href="https://colab.research.google.com/github/ashwathdnd/Age-Recognition-Model/blob/main/SEC_Compliance_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- 1. Install necessary libraries ---
# We use ! to run shell commands in Colab
print("Installing necessary libraries...")
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U bitsandbytes
!pip install -q -U datasets

print("Libraries installed successfully!")

# --- 2. Log in to Hugging Face ---
# To download the Llama 3 model, you need a Hugging Face account and a token.
# 1. Go to https://huggingface.co/settings/tokens to get your token.
# 2. When you run this cell, a box will appear. Paste your token and press Enter.
# login() is called without arguments, so the token is collected interactively
# rather than being hardcoded in the notebook.
from huggingface_hub import login

print("Please log in to Hugging Face.")
login()

# NOTE(review): this message is printed as soon as login() returns; nothing here
# checks that a token was actually accepted — confirm whether a check is needed.
print("Login successful!")

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

# --- Configure Model Quantization (for efficiency) ---
# 4-bit NF4 weights with float16 compute; double quantization is left disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# --- Load the Llama 3 Model & Tokenizer ---
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
print(f"\nLoading base model: {model_id}")

# Load the model with our 4-bit configuration.
# device_map is intentionally not passed here.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
)

# Load the tokenizer. trust_remote_code is deliberately NOT enabled: it allows
# code shipped in the model repository to be executed, and the other tokenizer
# loads in this notebook (the inference cells) work without it.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

print("Model and tokenizer loaded successfully!")

In [None]:
# Install the 'trl' library which contains the SFTTrainer
!pip install -q trl

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

# --- 1. Load the Dataset ---
# Reads a JSON-lines file as the "train" split. Each record must provide the
# 'text' and 'explanation' fields that preprocess_function reads.
# NOTE(review): absolute /content path — presumably the Colab working directory;
# the file has to exist there before this cell runs.
dataset = load_dataset("json", data_files="/content/training_data.jsonl", split="train")

# --- 2. Preprocess the Dataset ---
def preprocess_function(examples):
    """Render one record as an instruction-style prompt and tokenize it.

    Args:
        examples: A mapping with 'text' (the passage to analyze) and
            'explanation' (the target analysis) entries. It is applied per
            record, since dataset.map is called without batching.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the rendered
        prompt, with truncation enabled and a 512-token maximum length.
    """
    # The three sections are separated by one blank line each.
    prompt_sections = (
        "### INSTRUCTION:\nAnalyze the following text for SEC Marketing Rule compliance.",
        f"### TEXT:\n{examples['text']}",
        f"### ANALYSIS:\n{examples['explanation']}",
    )
    text = "\n\n".join(prompt_sections)
    return tokenizer(text, truncation=True, max_length=512)

# Apply preprocess_function to every record and drop all original columns, so
# processed_dataset keeps only the fields returned by the tokenizer.
processed_dataset = dataset.map(preprocess_function, remove_columns=dataset.column_names)

# --- 3. Configure LoRA and Prepare the Model ---
# Imported here so this step carries its own dependency.
from peft import prepare_model_for_kbit_training

# Rank-16 adapters on the attention projections only; biases stay untouched.
lora_config = LoraConfig(
    r=16, lora_alpha=32, target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
)

# The base model was loaded with 4-bit quantization, so it is passed through
# PEFT's k-bit preparation step before the adapters are attached. With its
# default arguments this also turns on gradient checkpointing.
model = prepare_model_for_kbit_training(model)

# NOTE: `model` is rebound to the adapter-wrapped model, so re-running this cell
# without reloading the base model would wrap an already-wrapped model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- 4. Set Up the Training Arguments ---
# Mixed precision is off (fp16/bf16 both False); checkpoints are written to
# ./results every 50 steps and metrics are logged every 10 steps.
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=50,
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # no step cap; training length comes from num_train_epochs
    warmup_ratio=0.03,
    group_by_length=True,
    # The plain "constant" schedule has no warmup phase, which left warmup_ratio
    # above without effect; "constant_with_warmup" applies the warmup and then
    # holds the learning rate constant.
    lr_scheduler_type="constant_with_warmup",
    # Disable all external logging integrations, including wandb.
    report_to="none",
)

# --- 5. Create the Trainer and Start Training! ---
# mlm=False: the collator is used for causal language modelling, not masked LM.
# NOTE(review): pad_token was set to eos_token when the tokenizer was loaded;
# this collator is understood to exclude pad-token positions from the loss, which
# would also exclude genuine EOS tokens — confirm against the collator docs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=processed_dataset,
    data_collator=data_collator,
)

print("Starting the fine-tuning process...")
trainer.train()
print("Fine-tuning complete!")

# --- 6. Save the Final Model ---
# Saves trainer.model to ./final_model, the path the inference cells later load
# the adapter from. The tokenizer is not saved here.
final_model_path = "./final_model"
trainer.model.save_pretrained(final_model_path)
print(f"Final fine-tuned model saved to {final_model_path}")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

# Rebuild the quantized base model and tokenizer, then attach the saved LoRA
# adapter. Every class used below is imported in this cell, so it does not
# depend on imports made by earlier cells.
# NOTE(review): if the training cell's `model` is still alive in the kernel, this
# loads an additional copy of the base model — confirm the GPU has the headroom.
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Path the training cell saved the adapter to.
final_model_path = "./final_model"
fine_tuned_model = PeftModel.from_pretrained(base_model, final_model_path)

# --- Run a test generation with the fine-tuned model ---

# Test case with non-compliant language
test_text = "Our new 'Guaranteed Income Fund' is the safest investment on the market, providing certain returns year after year."

# Same section layout as the training prompt (INSTRUCTION / TEXT / ANALYSIS), but
# the instruction carries one extra sentence asking for raw JSON output, which
# the training template does not contain. The ANALYSIS section is left empty for
# the model to complete.
prompt = f"""### INSTRUCTION:
Analyze the following text for SEC Marketing Rule compliance. Output your final analysis as a single, raw JSON object only.

### TEXT:
{test_text}

### ANALYSIS:
"""

# Text-generation pipeline around the adapter-wrapped model.
# NOTE(review): max_length is understood to bound prompt plus generated tokens;
# max_new_tokens may be the clearer knob — confirm against the generation docs.
pipe = pipeline(
    task="text-generation",
    model=fine_tuned_model,
    tokenizer=tokenizer,
    max_length=512  # raised from an earlier value of 200
)

result = pipe(prompt)

# Print the 'generated_text' field of the first returned candidate.
print("\n--- Model Analysis ---")
print(result[0]['generated_text'])

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

# --- 1. Load the Base Model and Tokenizer ---
# The stock Llama 3 checkpoint is loaded with 4-bit (NF4) quantized weights.
print("Loading base model and tokenizer...")
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
final_model_path = "./final_model"

bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Tokenizer, with the end-of-sequence token doubling as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # leave device placement to accelerate
    quantization_config=bnb_config,
)

# --- 2. Load Your Fine-Tuned Adapter ---
# The adapter saved under final_model_path is attached on top of the base model.
print("Loading fine-tuned LoRA adapter...")
fine_tuned_model = PeftModel.from_pretrained(base_model, final_model_path)

print("Model ready for inference!")


# --- 3. Run a Test ---
print("\nRunning test inference...")
# Sample marketing sentence with absolute claims ("Guaranteed", "safest", "certain returns").
test_text = "Our new 'Guaranteed Income Fund' is the safest investment on the market, providing certain returns year after year."

# Same INSTRUCTION / TEXT / ANALYSIS layout as the training prompt. Note that the
# instruction adds a sentence requesting raw JSON output that the training
# template does not contain. The ANALYSIS section is left empty for the model.
prompt = f"""### INSTRUCTION:
Analyze the following text for SEC Marketing Rule compliance. Output your final analysis as a single, raw JSON object only.

### TEXT:
{test_text}

### ANALYSIS:
"""

# Use the transformers pipeline for easy text generation
# NOTE(review): max_length is understood to bound prompt plus generated tokens;
# max_new_tokens may be the clearer knob — confirm against the generation docs.
pipe = pipeline(
    task="text-generation",
    model=fine_tuned_model,
    tokenizer=tokenizer,
    max_length=512
)

result = pipe(prompt)

# Print the 'generated_text' field of the first returned candidate.
print("\n--- Model Analysis ---")
print(result[0]['generated_text'])

In [None]:
import nbformat

# Replace with the name of your notebook file
notebook_path = '/SEC Compliance Finetuning.ipynb'

# Parse the notebook file as an nbformat v4 document.
with open(notebook_path, 'r', encoding='utf-8') as src:
    nb = nbformat.read(src, as_version=4)

# Drop the top-level 'widgets' metadata entry; with a default supplied, pop is a
# no-op when the key is absent.
nb.metadata.pop('widgets', None)

# Overwrite the same file with the cleaned notebook.
with open(notebook_path, 'w', encoding='utf-8') as dst:
    nbformat.write(nb, dst)

print(f"Corrected notebook saved to {notebook_path}")