# 🔒 SCA Package Model Training - VS Code/Local

Train CodeLlama 7B to detect package vulnerabilities using 2024-2025 CVE data

**Requirements:**
- Python 3.10+
- 32GB+ RAM (CPU training loads the 7B model in float32, roughly twice the ~14 GB float16 footprint) OR GPU with 15GB+ VRAM
- Dataset already in: `/workspaces/ai_sec/datasets/sca_training_2024_2025.json`

**Note:** Training on CPU will take 24-48+ hours. Use Google Colab for faster GPU training.

## Step 1: Check Hardware

In [2]:
import torch

# Report whether a CUDA device is visible and, if so, which one and how much
# memory it has. The availability probe is done once and reused.
has_gpu = torch.cuda.is_available()
print(f"üñ•Ô∏è  GPU Available: {has_gpu}")

if not has_gpu:
    # No accelerator: warn about the expected CPU training time.
    print("‚ö†Ô∏è  No GPU detected - training will be VERY slow (24-48+ hours)")
    print("üí° Recommendation: Use Google Colab for free GPU training")
else:
    # Only device 0 is inspected.
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"üìä GPU Name: {gpu_name}")
    print(f"üíæ GPU Memory: {gpu_memory_gib:.2f} GB")

üñ•Ô∏è  GPU Available: False
‚ö†Ô∏è  No GPU detected - training will be VERY slow (24-48+ hours)
üí° Recommendation: Use Google Colab for free GPU training


## Step 2: Install Dependencies

In [None]:
import subprocess
import sys

# Packages required by the rest of the notebook. Everything except
# sentencepiece is pinned to an exact version.
required_packages = [
    "transformers==4.37.0",
    "datasets==2.16.0",
    "peft==0.8.0",
    "accelerate==0.26.0",
    "sentencepiece",
]

# Invoke pip through the running interpreter so the packages land in the
# same environment this notebook is executing in.
pip_command = [sys.executable, "-m", "pip", "install", "-q", *required_packages]

print("üì¶ Installing dependencies...")
subprocess.check_call(pip_command)
print("‚úÖ Dependencies installed!")

## Step 3: Load Dataset

In [None]:
from datasets import load_dataset
import os

DATASET_PATH = "/workspaces/ai_sec/datasets/sca_training_2024_2025.json"

print(f"üìÇ Loading dataset from: {DATASET_PATH}")

# Fail fast, with a hint on how to generate the file, when the dataset is absent.
dataset_is_missing = not os.path.exists(DATASET_PATH)
if dataset_is_missing:
    print(f"‚ùå Dataset not found at {DATASET_PATH}")
    print("Run: python scripts/convert_cve_to_training.py")
    raise FileNotFoundError(f"Dataset not found: {DATASET_PATH}")

# Read the JSON file, then carve a reproducible (seed=42) 10% validation
# split out of the loaded 'train' data.
raw_dataset = load_dataset('json', data_files=DATASET_PATH)
dataset = raw_dataset['train'].train_test_split(test_size=0.1, seed=42)

train_split = dataset['train']
validation_split = dataset['test']

print(f"\nüìä Dataset Statistics:")
print(f"  Training samples: {len(train_split)}")
print(f"  Validation samples: {len(validation_split)}")

# Preview the first 600 characters of the first training example.
first_example_text = train_split[0]['text']
print(f"\nüìù Sample training example:")
print(first_example_text[:600] + "...")

## Step 4: Load Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Single source of truth for the checkpoint so the model and tokenizer can
# never drift apart.
MODEL_ID = "codellama/CodeLlama-7b-Instruct-hf"

print("üì• Loading CodeLlama-7b-Instruct...")
print("  (This takes 2-3 minutes)")

# trust_remote_code is deliberately NOT passed: it allows Python code shipped
# in the model repository to run locally, and it is not needed here.
# NOTE(review): this assumes the checkpoint loads with the model classes
# bundled in transformers; re-evaluate if MODEL_ID is changed to a repository
# that ships custom modeling code.
if torch.cuda.is_available():
    # GPU path: half precision, with layer placement left to device_map="auto".
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    print("  ‚úÖ Model loaded in float16 (~14 GB)")
else:
    # CPU path: full precision, with low_cpu_mem_usage set for the load.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
    )
    print("  ‚úÖ Model loaded on CPU")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# No dedicated padding token is configured here, so EOS is reused for padding
# and sequences are padded on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("\n‚úÖ Model and tokenizer ready!")

## Step 5: Configure LoRA (Train only 0.5% of parameters)

In [None]:
from peft import LoraConfig, get_peft_model

# Modules that receive LoRA adapters: the four attention projections.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]

# Rank-16 adapters with alpha 32 and 5% dropout; bias terms are not trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_projections,
)

print("‚ÑπÔ∏è  Applying LoRA (training only 0.5% of model parameters)")

# Wrap the base model with the adapters, then report the trainable share.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## Step 6: Tokenize Dataset

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated to, and padded up to, 2048 tokens.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 2048,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **tokenizer_options)

print("üîÑ Tokenizing dataset...")

# Tokenize in batches and drop every original column, so only the
# tokenizer's outputs remain in each split.
original_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=original_columns,
)

print("‚úÖ Dataset tokenized!")

## Step 7: Configure Training

In [None]:
import inspect

from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Mixed-precision training is only switched on when a CUDA device is present.
use_fp16 = torch.cuda.is_available()

# The evaluation-schedule keyword is spelled `evaluation_strategy` in older
# transformers releases (including the 4.37.0 pinned by the install step) and
# `eval_strategy` in newer ones. Pick whichever the installed version accepts
# instead of failing with an unexpected-keyword TypeError.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="/workspaces/ai_sec/models/sca-package-checkpoints",
    num_train_epochs=3,
    # Batch size 1 with 16 accumulation steps gives an effective batch of 16.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    fp16=use_fp16,
    # Checkpoint and evaluate on the same 100-step cadence.
    save_strategy="steps",
    save_steps=100,
    logging_steps=10,
    warmup_steps=50,
    optim="adamw_torch",
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    report_to="none",
    eval_steps=100,
    **{eval_strategy_key: "steps"},
)

# Gradient checkpointing is enabled above while the base weights are frozen
# by LoRA. The embedding outputs must require grad, otherwise the
# checkpointed segments have nothing to backpropagate through.
model.enable_input_require_grads()

# NOTE(review): with fp16 AMP the gradient scaler refuses to unscale float16
# gradients. The model is loaded in float16 on GPU, so the trainable adapter
# weights may be float16 too, depending on the installed peft version. Casting
# only the trainable parameters to float32 is a no-op when they already are.
if use_fp16:
    for param in model.parameters():
        if param.requires_grad:
            param.data = param.data.to(torch.float32)

# The tokenized dataset holds only tokenizer outputs (no `labels` column), so
# without a collator the model has nothing to compute a loss against. With
# mlm=False the collator copies input_ids into labels and ignores padding
# positions in the loss.
# NOTE: the pad token is the EOS token in this notebook, so EOS positions are
# ignored in the loss as well.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

print("‚úÖ Trainer configured!")
if use_fp16:
    print(f"\n‚è±Ô∏è  Estimated training time: 4-6 hours on GPU")
else:
    print(f"\n‚è±Ô∏è  Estimated training time: 24+ hours on CPU")
print(f"üíæ Checkpoints: /workspaces/ai_sec/models/sca-package-checkpoints")

## Step 8: START TRAINING! 🚀

**Warning:** This will take hours! Make sure your machine stays on.

In [None]:
# Horizontal rule printed around the training log.
separator = "=" * 60

print("üöÄ Starting training...")
print("‚è∞ This will take several hours")
print("\n" + separator)

# Run the training loop configured on the trainer.
trainer.train()

print("\n" + separator)
print("‚úÖ Training complete!")
print(separator)

## Step 9: Save Final Model

In [None]:
output_dir = "/workspaces/ai_sec/models/sca-package-final"

print(f"üíæ Saving final model to: {output_dir}")

# Write the model first, then the tokenizer, into the same directory so the
# result can be loaded from a single path.
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(output_dir)

print("‚úÖ Model saved successfully!")
print(f"\nüìÅ Model location: {output_dir}")

## Step 10: Test the Model! 🧪

In [None]:
print("üß™ Testing the trained model...\n")

# Prompt in the [INST] ... [/INST] instruction format, asking the model to
# analyze a small package.json.
test_input = """[INST] Analyze this package.json for known vulnerabilities

```json
{
  "name": "my-app",
  "dependencies": {
    "express": "4.16.0",
    "lodash": "4.17.4",
    "axios": "0.18.0"
  }
}
``` [/INST]"""

# Training leaves the model in train mode. Switch to eval mode so the LoRA
# dropout configured earlier is not applied while generating.
model.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
inputs = tokenizer(test_input, return_tensors="pt").to(device)

# Low-temperature nucleus sampling; up to 512 new tokens are generated.
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    temperature=0.1,
    do_sample=True,
    top_p=0.95
)

# The decoded text contains the prompt followed by the model's continuation.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("="*60)
print("ü§ñ MODEL OUTPUT:")
print("="*60)
print(result)
print("="*60)

## 🎉 Training Complete!

Your SCA model is now trained and saved!

**Model Location:** `/workspaces/ai_sec/models/sca-package-final`

**Next Steps:**
1. Deploy the model using vLLM or Ollama
2. Integrate into your CI/CD pipeline
3. Train more models for SAST, IaC, Container scanning