## 1. Setup and Installations
Explanation: This first section prepares the environment. It installs all the necessary libraries for the project, including ipywidgets for the user interface, transformers for the AI models, radon for code metrics, and seaborn for plotting.



In [1]:
# --- Section 1: Setup & Installations ---
print("Installing required libraries...")
!pip install transformers torch accelerate bitsandbytes pandas huggingface_hub radon ipywidgets matplotlib seaborn -q

# Import necessary modules
import ipywidgets as widgets
from IPython.display import display, HTML
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import notebook_login
import ast
import re
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from radon.complexity import cc_visit
from radon.metrics import mi_visit
from radon.raw import analyze

# Reached only if every import above succeeded (a failed import raises first).
print("\nSetup Complete! ✅")

Installing required libraries...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[?25h
Setup Complete! ✅


## 2. Secure Hugging Face Login
Explanation: This section handles authentication, which is required for any "gated" models you may use. In a code cell, call `notebook_login()` (imported in Section 1) and enter your Hugging Face access token when prompted.



## 3. Configuration & Backend Engine
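Explanation: This section defines the models to test and all the backend helper functions, including the calculate_advanced_metrics function, which uses radon to compute cyclomatic complexity, the maintainability index, and lines of code for each generated snippet. (Halstead metrics are not reported as separate values.)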



In [2]:
# --- Section 3: Configuration & Backend Engine ---
# --- Model Configuration ---
# Display name -> Hugging Face Hub repository id. Insertion order is preserved,
# so models are iterated in the order listed here.
MODELS_TO_TEST = dict(
    [
        ("DeepSeek-Coder-1.3B", "deepseek-ai/deepseek-coder-1.3b-instruct"),
        ("Phi-2-2.7B", "microsoft/phi-2"),
        ("Gemma-2B-IT", "google/gemma-2b-it"),
    ]
)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- Helper & Generation Functions ---
def clean_generated_code(text, model_path):
    """Strip the echoed prompt template from decoded model output and extract the code.

    Args:
        text: Decoded generation, possibly starting with the prompt template.
        model_path: Model identifier; selects the template to strip
            ("gemma", "phi-2", otherwise the "### Instruction" format).

    Returns:
        The contents of the first ```python fenced block when one is present
        (an unterminated fence is read up to the end of the text), otherwise
        the remaining text. Surrounding whitespace is removed in both cases.
    """
    model_path = model_path.lower()
    # Non-greedy match with count=1 removes only the first (echoed) prompt.
    # A greedy ".*" would run up to the LAST template marker and silently
    # delete generated code whenever the model repeats the template itself.
    if "gemma" in model_path:
        text = re.sub(
            r"<start_of_turn>user\n.*?<end_of_turn>\n<start_of_turn>model\n",
            "",
            text,
            count=1,
            flags=re.DOTALL,
        )
        text = text.replace("<end_of_turn>", "")
    elif "phi-2" in model_path:
        text = re.sub(r"Instruct:.*?\nOutput:", "", text, count=1, flags=re.DOTALL)
    else:
        text = re.sub(r"### Instruction:\n.*?\n\n### Response:", "", text, count=1, flags=re.DOTALL)

    # Take the first fenced block. "\Z" lets a fence that was never closed
    # (e.g. generation stopped at the token limit) still yield its contents.
    match = re.search(r"```python\n(.*?)(?:\n```|\Z)", text, re.DOTALL)
    if match:
        text = match.group(1)
    return text.strip()

def is_syntactically_valid(code_string: str) -> bool:
    """Return True when ``code_string`` is non-empty and parses as Python source.

    Args:
        code_string: Candidate Python source text.

    Returns:
        False for empty input or input ``ast.parse`` rejects, True otherwise.
    """
    if not code_string:
        return False
    try:
        ast.parse(code_string)
    except (SyntaxError, ValueError):
        # ValueError: some Python versions reject source containing null bytes
        # with ValueError rather than SyntaxError; treat both as "invalid".
        return False
    return True

def calculate_advanced_metrics(code_string):
    """Compute radon metrics for a snippet of generated code.

    Args:
        code_string: Python source to analyse.

    Returns:
        A dict with keys "complexity" (sum of the complexity of every block
        reported by ``cc_visit``), "maintainability" (``mi_visit`` result
        rounded to 2 decimals) and "loc" (``analyze(...).loc``). All three
        values are None when the code does not parse or the analysis fails.
    """
    if not is_syntactically_valid(code_string):
        return {"complexity": None, "maintainability": None, "loc": None}
    try:
        # Analyse once; summing an empty block list already gives 0.
        complexity = sum(block.complexity for block in cc_visit(code_string))
        maintainability = mi_visit(code_string, multi=True)
        loc = analyze(code_string).loc
        return {"complexity": complexity, "maintainability": round(float(maintainability), 2), "loc": loc}
    except Exception:
        # Best effort: a radon failure yields empty metrics instead of aborting
        # the run. Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return {"complexity": None, "maintainability": None, "loc": None}

def generate_code(model, tokenizer, prompt):
    """Generate code for ``prompt`` with one model and time the generation.

    Args:
        model: A loaded causal language model; ``model.name_or_path`` selects
            the prompt template.
        tokenizer: The tokenizer belonging to ``model``. Its ``pad_token`` is
            set to ``eos_token`` when it has none.
        prompt: The user's instruction text.

    Returns:
        A dict with "code" (the cleaned generation) and "gen_time" (seconds
        spent inside ``model.generate``).
    """
    model_path = model.name_or_path.lower()
    if "gemma" in model_path:
        formatted_prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
    elif "phi-2" in model_path:
        formatted_prompt = f"Instruct: {prompt}\nOutput:"
    else:
        formatted_prompt = f"### Instruction:\n{prompt}\n\n### Response:"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    inputs = tokenizer(formatted_prompt, return_tensors="pt", return_attention_mask=True).to(device)
    start_time = time.time()
    output_ids = model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=512, temperature=0.1, do_sample=True, pad_token_id=tokenizer.pad_token_id)
    end_time = time.time()

    # A causal LM returns the prompt tokens followed by the new tokens. Decode
    # only the new ones and drop special tokens, so BOS/EOS markers and the
    # echoed prompt never end up inside the "code" that is parsed and measured.
    prompt_length = inputs.input_ids.shape[1]
    completion_ids = output_ids[0][prompt_length:]
    raw_output = tokenizer.decode(completion_ids, skip_special_tokens=True)
    cleaned_code = clean_generated_code(raw_output, model_path)

    return {"code": cleaned_code, "gen_time": end_time - start_time}

# Printed once the configuration and helper definitions above have executed.
print("Backend engine with advanced metrics is ready.")

Backend engine with advanced metrics is ready.


## 4. Pre-Loading All AI Models
Explanation: This section pre-loads all models into memory. This will take several minutes but will result in a much faster UI experience. Warning: This will use a significant amount of your GPU memory.

In [3]:
# --- Section 4: Pre-Loading All Models ---
def _load_model_components(model_path):
    """Load the tokenizer and model for one Hugging Face repository id.

    Loading inside a function keeps the tokenizer/model references local.
    With a top-level loop the last loaded model would stay referenced by
    global loop variables, so removing it from ``loaded_models`` later could
    not release its memory.

    Args:
        model_path: Repository id passed to ``from_pretrained``.

    Returns:
        A dict with keys "model" and "tokenizer".
    """
    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally - only use repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # NOTE(review): the recorded output of this cell shows a warning that
    # `torch_dtype` is deprecated in favour of `dtype`; the old name is kept so
    # older transformers releases still work - confirm before switching.
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
    return {"model": model, "tokenizer": tokenizer}


# Display name -> {"model": ..., "tokenizer": ...} for every model that loaded.
loaded_models = {}
print("Starting to pre-load all models...")
for model_name, model_path in MODELS_TO_TEST.items():
    print(f"\n--- Loading {model_name}... ---")
    try:
        loaded_models[model_name] = _load_model_components(model_path)
        print(f"✅ {model_name} loaded successfully.")
    except Exception as e:
        # A model that fails to load is reported and skipped; the rest still load.
        print(f"✗ FAILED to load {model_name}. Error: {e}")
print("\n" + "="*50 + "\nAll available models are pre-loaded.\n" + "="*50)

Starting to pre-load all models...

--- Loading DeepSeek-Coder-1.3B... ---


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

✅ DeepSeek-Coder-1.3B loaded successfully.

--- Loading Phi-2-2.7B... ---


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ Phi-2-2.7B loaded successfully.

--- Loading Gemma-2B-IT... ---


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

✅ Gemma-2B-IT loaded successfully.

All available models are pre-loaded.


## 5. UI #1: Benchmark All Models
Explanation: This first interface is for broad, comparative benchmarking. Enter a single prompt, and it will run against every pre-loaded model, displaying the results and metrics in a table.

In [4]:
# --- Section 5: UI #1 - Run All Models ---
print("--- UI #1: Benchmark All Models ---")

# Session-wide record of every generation; both UIs append to it and the
# final report reads from it.
all_results_log = list()

# Prompt box, trigger button, and the panel that captures the run's output.
prompt_input_all = widgets.Textarea(
    placeholder='Enter a prompt to benchmark all models...',
    layout=dict(width='95%'),
)
run_all_button = widgets.Button(
    description='Run Benchmark',
    button_style='danger',
    icon='rocket',
)
output_all = widgets.Output(
    layout=dict(border='1px solid black', padding='10px', overflow='scroll'),
)

def on_run_all_clicked(b):
    """Run the entered prompt on every pre-loaded model and show a results table.

    Each result is also appended to ``all_results_log``.

    Args:
        b: The button instance passed by ``on_click``; unused.
    """
    with output_all:
        # Start from a clean panel on every click, as the other UI callbacks do,
        # so results from earlier runs do not pile up above the new table.
        output_all.clear_output(wait=True)
        prompt = prompt_input_all.value.strip()
        if not prompt:
            print("Please enter a prompt.")
            return

        print(f"Running prompt on {len(loaded_models)} models...")
        results_this_run = []
        for model_name, components in loaded_models.items():
            print(f"  - Generating with {model_name}...")
            result = generate_code(components['model'], components['tokenizer'], prompt)
            metrics = calculate_advanced_metrics(result['code'])

            entry = {'Model': model_name, 'Prompt': prompt, **result, **metrics}
            results_this_run.append(entry)
            all_results_log.append(entry)

        print("\n--- Benchmark Complete ---")
        results_df = pd.DataFrame(results_this_run).round(2)
        # to_html() honours display.max_colwidth and would cut the generated
        # code down to a short prefix; lift the limit for this rendering only.
        with pd.option_context("display.max_colwidth", None):
            table_html = results_df.to_html()
        display(HTML(table_html.replace('\\n', '<br>')))

# Register the click callback, then render the prompt box, button and output
# panel stacked vertically.
run_all_button.on_click(on_run_all_clicked)
display(widgets.VBox([prompt_input_all, run_all_button, output_all]))

--- UI #1: Benchmark All Models ---


VBox(children=(Textarea(value='', layout=Layout(width='95%'), placeholder='Enter a prompt to benchmark all mod…

## 6. UI #2: Inspect Models with Checkboxes
Explanation: This second interface provides more control. It creates a checkbox for each model, allowing you to run one or more prompts (up to 10, one per line) on a specific subset of the pre-loaded models.


In [5]:
print("\n\n--- UI #2: Inspect Selected Models (Multi-Prompt Version) ---")

# Multi-line prompt box: one prompt per line.
prompt_input_selected = widgets.Textarea(placeholder='Enter up to 10 prompts, each on a new line...', layout=dict(width='95%', height='150px'))

# Trigger button and the panel that captures the run's output.
run_selected_button = widgets.Button(description='Run Selected', button_style='success', icon='play')
output_selected = widgets.Output(layout=dict(border='1px solid black', padding='10px', overflow='scroll'))

# One pre-ticked checkbox per loaded model, keyed by the model's display name.
model_checkboxes = {
    name: widgets.Checkbox(description=name, value=True)
    for name in loaded_models
}
checkbox_container = widgets.VBox([*model_checkboxes.values()])

# Button click event
def on_run_selected_clicked(b):
    """Run every entered prompt on each ticked model and show a results table.

    Prompts are read one per non-blank line; only the first 10 are used.
    Each result is also appended to ``all_results_log``.

    Args:
        b: The button instance passed by ``on_click``; unused.
    """
    with output_selected:
        output_selected.clear_output(wait=True)

        # One prompt per non-blank line. An empty list covers both "nothing
        # entered" and "only whitespace entered", so a single check suffices.
        prompts = [line.strip() for line in prompt_input_selected.value.splitlines() if line.strip()]
        if not prompts:
            print("Please enter one or more prompts.")
            return
        if len(prompts) > 10:
            print("Warning: Only the first 10 prompts will be used.")
            prompts = prompts[:10]

        models_to_run = [name for name, cb in model_checkboxes.items() if cb.value]
        if not models_to_run:
            print("Please select at least one model.")
            return

        print(f"Running {len(prompts)} prompts on {len(models_to_run)} selected models...\n")

        results_this_run = []

        # Run each prompt through each selected model.
        for idx, prompt in enumerate(prompts, 1):
            print(f"\nPrompt {idx}: {prompt}\n")
            for model_name in models_to_run:
                print(f"  - Generating with {model_name}...")
                components = loaded_models[model_name]
                result = generate_code(components['model'], components['tokenizer'], prompt)
                metrics = calculate_advanced_metrics(result['code'])

                entry = {'Model': model_name, 'Prompt': prompt, **result, **metrics}
                results_this_run.append(entry)
                all_results_log.append(entry)

        print("\n--- Multi-Prompt Run Complete ---")
        results_df = pd.DataFrame(results_this_run).round(2)
        # to_html() honours display.max_colwidth and would cut the generated
        # code down to a short prefix; lift the limit for this rendering only.
        with pd.option_context("display.max_colwidth", None):
            table_html = results_df.to_html()
        display(HTML(table_html.replace('\\n', '<br>')))

# Register the click callback on the "Run Selected" button.
run_selected_button.on_click(on_run_selected_clicked)

# Stack the widgets vertically: prompt heading and box, model heading and
# checkboxes, the run button, then the output panel.
ui_selected_models = widgets.VBox([
    widgets.HTML("<h4>Enter up to 10 prompts (one per line):</h4>"),
    prompt_input_selected,
    widgets.HTML("<h4>Select models to run:</h4>"),
    checkbox_container,
    run_selected_button,
    output_selected
])

display(ui_selected_models)




--- UI #2: Inspect Selected Models (Multi-Prompt Version) ---


VBox(children=(HTML(value='<h4>Enter up to 10 prompts (one per line):</h4>'), Textarea(value='', layout=Layout…

## 7. Final Analysis and Visualization Report
Explanation: After using either UI to generate results, run this section. A button will appear that generates a comprehensive report, including a full data table and comparative plots for the key metrics across all tests run in your session.

In [10]:
# --- Section 7: Final Analysis and Visualization Report ---
# Button that triggers the report, and the output area the report is rendered into.
report_button = widgets.Button(description="Generate Full Report & Plots", button_style='info')
report_output = widgets.Output()

def on_report_button_clicked(b):
    """Render the session's results table and three comparative bar plots.

    Reads ``all_results_log``; rows without complexity or maintainability
    values are left out of the plots.

    Args:
        b: The button instance passed by ``on_click``; unused.
    """
    with report_output:
        report_output.clear_output(wait=True)
        if not all_results_log:
            print("No results logged. Use one of the UIs above to generate code.")
            return

        # Build the table with display-friendly column names in one chain.
        column_labels = {'complexity': 'Complexity', 'maintainability': 'Maintainability', 'gen_time': 'Gen Time (s)'}
        df = pd.DataFrame(all_results_log).round(2).rename(columns=column_labels)

        print("--- Full Session Data ---")
        display(df)

        print("\n--- Comparative Plots ---")
        sns.set_theme(style="whitegrid")

        # Only rows with both code metrics present can be plotted.
        plot_df = df.dropna(subset=['Complexity', 'Maintainability'])
        if plot_df.empty:
            print("Not enough valid data to generate plots.")
            return

        fig, axes = plt.subplots(1, 3, figsize=(20, 6))
        fig.suptitle('Comparative Analysis of Code Metrics', fontsize=16)

        # (column to plot, colour palette, panel title) for each subplot, left to right.
        panels = [
            ('Gen Time (s)', 'viridis', 'Generation Time (Lower is Faster)'),
            ('Complexity', 'magma', 'Cyclomatic Complexity (Lower is Simpler)'),
            ('Maintainability', 'plasma', 'Maintainability Index (Higher is Better)'),
        ]
        for panel_ax, (metric, palette_name, panel_title) in zip(axes, panels):
            sns.barplot(ax=panel_ax, data=plot_df, x='Model', y=metric, palette=palette_name)
            panel_ax.set_title(panel_title)
            panel_ax.tick_params(axis='x', rotation=45)

        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.show()

# Register the click callback, then render the button above its output area.
report_button.on_click(on_report_button_clicked)
print("\nUse the button below to generate the final report for the session.")
display(widgets.VBox([report_button, report_output]))


Use the button below to generate the final report for the session.


VBox(children=(Button(button_style='info', description='Generate Full Report & Plots', style=ButtonStyle()), O…

## 8. (Optional) Manual Cleanup
Explanation: Run this cell to manually clear all pre-loaded models from memory and free up your GPU resources.



In [None]:
# --- Section 8: Optional Manual Cleanup ---
def clear_all_models():
    """Drop every pre-loaded model and tokenizer and release cached GPU memory.

    Empties ``loaded_models`` in place, runs a garbage-collection pass, then
    asks PyTorch to release its cached CUDA memory. Any other variable that
    still references a model keeps that model alive.
    """
    import gc

    print(f"Clearing {len(loaded_models)} models from memory...")
    for model_name in list(loaded_models.keys()):
        components = loaded_models.pop(model_name)
        # Drop the model/tokenizer references held by the per-model dict.
        components.clear()
    # Objects caught in reference cycles are only freed by a collector pass;
    # without it empty_cache() cannot return their memory.
    gc.collect()
    torch.cuda.empty_cache()
    print("\n✅ All models have been cleared from GPU memory.")

# The cleanup runs as soon as this cell is executed. Comment out the call
# below if the models should stay loaded.
clear_all_models()

Clearing 3 models from memory...

✅ All models have been cleared from GPU memory.
