In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    average_precision_score
)
import matplotlib.pyplot as plt
import seaborn as sns
import os

# ---------------------------
# Load and preprocess data
# ---------------------------
def load_data(filepath, text_column=None, label_column=None):
    """Load a CSV of code snippets and return a frame with binary labels.

    The file is read with ``pd.read_csv`` defaults, so its first row is
    treated as the header row.

    Args:
        filepath: Path of the CSV file to read.
        text_column: Name of the column holding the code snippets. When
            ``None`` the first column is used.
        label_column: Name of the column holding the security score. When
            ``None`` the first column after column 0 that has between 1 and
            10 distinct scalar values is used, falling back to the second
            column.

    Returns:
        A DataFrame with exactly two columns, ``text`` and ``label``, with
        rows containing missing values removed. ``label`` is 1 (secure) for
        scores below 3 and 0 (vulnerable) otherwise.
    """
    df = pd.read_csv(filepath)
    print(f"Loaded {len(df)} samples")
    print(f"Columns ({len(df.columns)} total):")
    for i, col in enumerate(df.columns):
        # str() keeps this working for non-string column labels.
        print(f"  [{i}]: {str(col)[:100]}...")  # Show first 100 chars of column name

    # Auto-detect columns if not specified
    if text_column is None:
        text_column = df.columns[0]
        print(f"\nAuto-detected text column: {text_column}")

    if label_column is None:
        # Series.unique() yields NumPy scalars; np.int64 is not a subclass of
        # the built-in int, so the NumPy scalar types must be listed too or
        # integer score columns are never recognised.
        scalar_types = (int, float, str, np.integer, np.floating)
        for col in df.columns[1:]:
            unique_vals = df[col].dropna().unique()
            # An all-NaN column has zero distinct values and is not a label.
            if 0 < len(unique_vals) <= 10 and all(
                isinstance(v, scalar_types) for v in unique_vals
            ):
                label_column = col
                print(f"Auto-detected label column: {label_column}")
                break
        if label_column is None:
            label_column = df.columns[1]
            print(f"Using column {label_column} as label column")

    # Keep only the two relevant columns and drop incomplete rows
    df_clean = df[[text_column, label_column]].copy()
    df_clean = df_clean.dropna()

    # Rename columns
    df_clean.columns = ['text', 'label']

    # Convert label to int; non-numeric labels are mapped to their rank in
    # sorted order instead.
    try:
        df_clean['label'] = df_clean['label'].astype(int)
    except (ValueError, TypeError):
        unique_labels = df_clean['label'].unique()
        print(f"\nFound non-numeric labels: {unique_labels}")
        label_map = {label: idx for idx, label in enumerate(sorted(unique_labels))}
        print(f"Mapping: {label_map}")
        df_clean['label'] = df_clean['label'].map(label_map)

    # Enforce binary labels (0 = vulnerable, 1 = secure):
    # scores < 3 are secure, scores >= 3 are vulnerable.
    df_clean['label'] = (df_clean['label'] < 3).astype(int)

    print(f"\nClass distribution after binary mapping:")
    print(df_clean['label'].value_counts())

    return df_clean

# ---------------------------
# Compute metrics
# ---------------------------
def compute_metrics(pred):
    """Compute evaluation metrics for the Hugging Face ``Trainer``.

    Args:
        pred: Prediction object exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class logits); class 1 is treated as the
            positive class.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1`` (binary
        averaging), ``roc_auc`` and ``pr_auc``. The two AUC values fall back
        to 0.0 when scikit-learn rejects the inputs with a ``ValueError``.
    """
    labels = pred.label_ids
    logits = pred.predictions
    preds = logits.argmax(-1)
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
    positive_scores = probs[:, 1]

    # zero_division=0 yields the same 0.0 as the default but without the
    # "precision is ill-defined" warning when class 1 is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)

    # Only ValueError is treated as "metric undefined"; anything else
    # (including KeyboardInterrupt) is allowed to propagate.
    try:
        roc_auc = roc_auc_score(labels, positive_scores)
    except ValueError:
        roc_auc = 0.0

    try:
        pr_auc = average_precision_score(labels, positive_scores)
    except ValueError:
        pr_auc = 0.0

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc
    }

# ---------------------------
# Detailed evaluation
# ---------------------------
def detailed_evaluation(trainer, test_dataset, tokenizer, output_dir='./results'):
    """Evaluate a trained model on ``test_dataset`` and save diagnostic plots.

    Prints a classification report, the confusion matrix and summary
    metrics, then writes ``confusion_matrix.png``, ``roc_curve.png`` and
    ``precision_recall_curve.png`` into ``output_dir``.

    Args:
        trainer: Object whose ``predict(test_dataset)`` returns per-class
            logits in ``predictions`` and true labels in ``label_ids``.
        test_dataset: Dataset handed to ``trainer.predict``.
        tokenizer: Unused; kept so existing callers keep working.
        output_dir: Directory for the plots; created if missing.

    Returns:
        Dict with ``accuracy``, weighted ``precision`` / ``recall`` / ``f1``,
        ``roc_auc``, ``pr_auc`` and the 2x2 ``confusion_matrix``.
    """
    os.makedirs(output_dir, exist_ok=True)

    print("\n" + "="*70)
    print("DETAILED MODEL EVALUATION")
    print("="*70)

    predictions = trainer.predict(test_dataset)
    preds = predictions.predictions.argmax(-1)
    labels = predictions.label_ids
    probs = torch.softmax(torch.tensor(predictions.predictions), dim=-1).numpy()
    positive_scores = probs[:, 1]

    # Classification report
    print("\n1. CLASSIFICATION REPORT:")
    print("-" * 70)
    print(classification_report(labels, preds, digits=4))

    # Confusion matrix
    print("\n2. CONFUSION MATRIX:")
    print("-" * 70)
    # labels=[0, 1] forces a 2x2 matrix even when one class is absent from
    # both the true and predicted labels, so the ravel() below always
    # unpacks into four values and the heatmap tick labels always match.
    cm = confusion_matrix(labels, preds, labels=[0, 1])
    print(cm)

    # Metrics
    accuracy = accuracy_score(labels, preds)
    _, _, f1_macro, _ = precision_recall_fscore_support(labels, preds, average='macro')
    precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(labels, preds, average='weighted')

    # Only ValueError is treated as "metric undefined"; other errors surface.
    try:
        roc_auc = roc_auc_score(labels, positive_scores)
    except ValueError:
        roc_auc = 0.0
    try:
        pr_auc = average_precision_score(labels, positive_scores)
    except ValueError:
        pr_auc = 0.0

    # Specificity & Sensitivity (class 1 is the positive class)
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Macro F1: {f1_macro:.4f}, Weighted F1: {f1_weighted:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}, PR-AUC: {pr_auc:.4f}")
    print(f"Specificity: {specificity:.4f}, Sensitivity: {sensitivity:.4f}")

    # Visualizations
    # NOTE(review): the figures are left open after saving; add plt.close()
    # if this is ever called repeatedly in a long-lived process.
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Vulnerable', 'Secure'], yticklabels=['Vulnerable', 'Secure'])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(f'{output_dir}/confusion_matrix.png', dpi=300, bbox_inches='tight')

    fpr, tpr, _ = roc_curve(labels, positive_scores)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.savefig(f'{output_dir}/roc_curve.png', dpi=300, bbox_inches='tight')

    precision_curve, recall_curve, _ = precision_recall_curve(labels, positive_scores)
    plt.figure(figsize=(8, 6))
    plt.plot(recall_curve, precision_curve, label=f'PR Curve (AUC = {pr_auc:.4f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.savefig(f'{output_dir}/precision_recall_curve.png', dpi=300, bbox_inches='tight')

    return {
        'accuracy': accuracy,
        'precision': precision_weighted,
        'recall': recall_weighted,
        'f1': f1_weighted,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'confusion_matrix': cm
    }

# ---------------------------
# Train BERT model
# ---------------------------
def train_bert_model(df, output_dir='./results', model_name='./local_codebert_model'):
    """Fine-tune a local sequence-classification checkpoint on ``df``.

    Args:
        df: DataFrame with a ``text`` column (code snippets) and a ``label``
            column (0 or 1).
        output_dir: Directory for checkpoints, logs, plots and the final
            model (``<output_dir>/final_model``); created if missing.
        model_name: Path of a local directory holding the pretrained model
            and tokenizer.

    Returns:
        Tuple ``(model, tokenizer, trainer, metrics)`` where ``metrics`` is
        the dict returned by ``detailed_evaluation``.

    Raises:
        FileNotFoundError: If ``model_name`` is not an existing directory.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Ensure the local model path exists
    if not os.path.isdir(model_name):
        raise FileNotFoundError(f"Local model directory not found: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    def tokenize_function(examples):
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

    dataset = Dataset.from_pandas(df[['text', 'label']])
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset = train_test['train']
    test_dataset = train_test['test']

    batch_size = 8
    num_epochs = 3

    # A fixed 500-step warmup can exceed the whole run on a small dataset
    # (e.g. ~800 samples / 8 per batch * 3 epochs is ~300 steps), in which
    # case the learning rate never reaches its target value. Cap the warmup
    # at 10% of the planned optimizer steps, keeping 500 as the upper bound.
    steps_per_epoch = max(1, (len(train_dataset) + batch_size - 1) // batch_size)
    total_steps = steps_per_epoch * num_epochs
    warmup_steps = min(500, int(0.1 * total_steps))

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        logging_dir=f'{output_dir}/logs',
        logging_strategy="no", # Disable logging to potentially avoid progress update errors
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        greater_is_better=True,
        save_total_limit=2
    )

    # NOTE(review): the held-out split is used both for per-epoch model
    # selection / early stopping and for the final report below, so the
    # reported metrics are optimistic. Consider a separate validation split.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    print("\nStarting training...")
    trainer.train()

    metrics = detailed_evaluation(trainer, test_dataset, tokenizer, output_dir)

    print(f"\nSaving model to {output_dir}/final_model")
    model.save_pretrained(f'{output_dir}/final_model')
    tokenizer.save_pretrained(f'{output_dir}/final_model')

    return model, tokenizer, trainer, metrics

# ---------------------------
# Prediction function (CORRECTED for GPU)
# ---------------------------
def predict_code_security(model, tokenizer, code_snippet):
    """Classify a single code snippet as secure or vulnerable.

    Args:
        model: Classification model whose forward pass returns an object
            with a ``logits`` attribute; class 0 is vulnerable, class 1 is
            secure.
        tokenizer: Callable that turns ``code_snippet`` into a mapping of
            input tensors.
        code_snippet: Source code of one snippet, as a string.

    Returns:
        Tuple ``(result, confidence, probs)``: ``result`` is ``"SECURE"`` or
        ``"VULNERABLE"``, ``confidence`` is the predicted class probability
        in percent, and ``probs`` is the list ``[p_vulnerable, p_secure]``.
    """
    # Get the device the model is currently on
    device = next(model.parameters()).device

    # Tokenize the input and move the tensors to the model's device
    inputs = tokenizer(code_snippet, return_tensors='pt', truncation=True, max_length=512, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Inference must run in eval mode; otherwise dropout stays active and
    # the same snippet can yield different predictions. The caller's mode is
    # restored afterwards so an in-progress training run is not disturbed.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)
            prediction = torch.argmax(probs, dim=-1).item()
            confidence = probs[0][prediction].item()
    finally:
        if was_training:
            model.train()

    result = "SECURE" if prediction == 1 else "VULNERABLE"
    return result, confidence * 100, probs[0].tolist()

# ---------------------------
# Generate Predictions for Question Snippets
# ---------------------------
def generate_predictions_for_questions(model, tokenizer, question_file, output_file):
    """Predict a security label for every snippet in ``question_file``.

    The snippets are read from the first column of the CSV. Four columns are
    appended (``prediction``, ``confidence``, ``prob_vulnerable``,
    ``prob_secure``) and the result is written to ``output_file``.

    Rows whose snippet cell is missing are not sent to the model: their
    ``prediction`` is ``None`` and their numeric columns are NaN, so row
    alignment with the input file is preserved.

    Args:
        model: Model passed through to ``predict_code_security``.
        tokenizer: Tokenizer passed through to ``predict_code_security``.
        question_file: Path of the input CSV.
        output_file: Path of the CSV to write.

    Returns:
        The input DataFrame with the four prediction columns added.
    """
    print(f"\nGenerating predictions for {question_file}...")

    # Load question snippets
    df_questions = pd.read_csv(question_file)
    print(f"Loaded {len(df_questions)} question snippets.")

    predictions = []
    confidences = []
    prob_vul = []
    prob_sec = []
    skipped = 0

    # Assuming the text column is the first one
    text_column = df_questions.columns[0]

    for snippet in df_questions[text_column]:
        # Empty CSV cells are read as NaN (a float), which the tokenizer
        # cannot handle; record a placeholder row instead of crashing.
        if pd.isna(snippet):
            skipped += 1
            predictions.append(None)
            confidences.append(float('nan'))
            prob_vul.append(float('nan'))
            prob_sec.append(float('nan'))
            continue

        # str() covers cells pandas parsed as numbers rather than text.
        result, confidence, probs = predict_code_security(model, tokenizer, str(snippet))
        predictions.append(result)
        confidences.append(confidence)
        prob_vul.append(probs[0]) # Probability of Vulnerable (class 0)
        prob_sec.append(probs[1]) # Probability of Secure (class 1)

    if skipped:
        print(f"Skipped {skipped} rows with a missing snippet.")

    # Add predictions to the dataframe
    df_questions['prediction'] = predictions
    df_questions['confidence'] = confidences
    df_questions['prob_vulnerable'] = prob_vul
    df_questions['prob_secure'] = prob_sec

    # Save to file
    df_questions.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

    return df_questions

# ---------------------------
# Main function
# ---------------------------
def main():
    """Run the full pipeline: load data, train, predict, print samples.

    Reads the training annotations, fine-tunes the classifier, writes
    predictions for the question snippets to a CSV and prints the first few
    predictions.
    """
    TRAINING_FILE = 'answer_snippets.annotations.csv'
    QUESTION_FILE = 'question_snippets.csv'
    OUTPUT_DIR = './code_security_results'
    PREDICTION_OUTPUT_FILE = 'question_snippets.predicts.csv'
    # Define device explicitly
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device for inference: {DEVICE}")

    print("="*70)
    print("CODE SECURITY DEEP LEARNING PIPELINE")
    print("="*70)

    # Load and prepare training data
    df_train = load_data(TRAINING_FILE)

    # Train the model
    model, tokenizer, trainer, metrics = train_bert_model(df_train, OUTPUT_DIR)

    # Make sure the model sits on the inference device before predicting
    model = model.to(DEVICE)
    print(f"Model moved to device: {DEVICE}")

    # Generate predictions for question snippets
    df_predictions = generate_predictions_for_questions(model, tokenizer, QUESTION_FILE, PREDICTION_OUTPUT_FILE)

    # Show some sample predictions
    print("\n" + "="*70)
    print("SAMPLE PREDICTIONS ON QUESTION SNIPPETS")
    print("="*70)
    text_column = df_predictions.columns[0]
    sample_size = min(5, len(df_predictions))
    for i in range(sample_size):
        row = df_predictions.iloc[i]
        # str() guards against non-string cells (e.g. NaN), which have no len().
        snippet = str(row[text_column])
        pred = row['prediction']
        conf = row['confidence']
        prob_vul = row['prob_vulnerable']
        prob_sec = row['prob_secure']
        # Build the preview first: a conditional placed directly inside
        # print() would apply to the whole f-string and drop the "Text: "
        # prefix for short snippets.
        preview = f"{snippet[:100]}..." if len(snippet) > 100 else snippet
        print(f"\nSnippet {i+1}:")
        print(f"Text: {preview}")
        print(f"Prediction: {pred} (Confidence: {conf:.2f}%)")
        print(f"Probabilities: Vulnerable={prob_vul:.4f}, Secure={prob_sec:.4f}")

    print("\n" + "="*70)
    print("PIPELINE COMPLETE!")
    print("="*70)

if __name__ == "__main__":
    main()

Using device for inference: cuda
CODE SECURITY DEEP LEARNING PIPELINE
Loaded 1359 samples
Columns (4 total):
  [0]:      import java.security.SecureRandom;
     import javax.crypto.Cipher;
     import javax.crypto.Ke...
  [1]: 4...
  [2]: 0...
  [3]: 262b6bdf80dd616de599361c5bc2d1a3547a736969ccf6d9f2ff5b94a609c0e2...

Auto-detected text column:      import java.security.SecureRandom;
     import javax.crypto.Cipher;
     import javax.crypto.KeyGenerator;
     import javax.crypto.SecretKey;
     import javax.crypto.spec.SecretKeySpec;

     public class SimpleCrypto {

public  String encrypt(String seed, String cleartext) throws Exception {
        byte[] rawKey = getRawKey(seed.getBytes());
        byte[] result = encrypt(rawKey, cleartext.getBytes());
        return toHex(result);
}

public  String decrypt(String seed, String encrypted) throws Exception {
        byte[] rawKey = getRawKey(seed.getBytes());
        byte[] enc = toByte(encrypted);
        byte[] result = decrypt(rawKey,

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./local_codebert_model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1017 [00:00<?, ? examples/s]


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Pr Auc
1,No log,0.418644,0.828431,0.0,0.0,0.0,0.727473,0.397745
2,No log,0.40918,0.833333,0.6,0.085714,0.15,0.74235,0.375441
3,No log,0.405736,0.789216,0.346154,0.257143,0.295082,0.768385,0.383455


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



DETAILED MODEL EVALUATION



1. CLASSIFICATION REPORT:
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0     0.8539    0.8994    0.8761       169
           1     0.3462    0.2571    0.2951        35

    accuracy                         0.7892       204
   macro avg     0.6000    0.5783    0.5856       204
weighted avg     0.7668    0.7892    0.7764       204


2. CONFUSION MATRIX:
----------------------------------------------------------------------
[[152  17]
 [ 26   9]]

Accuracy: 0.7892
Macro F1: 0.5856, Weighted F1: 0.7764
ROC-AUC: 0.7684, PR-AUC: 0.3835
Specificity: 0.8994, Sensitivity: 0.2571

Saving model to ./code_security_results/final_model
Model moved to device: cuda

Generating predictions for question_snippets.csv...
Loaded 2503 question snippets.
