In [None]:
# 1. Setup - install required packages, imports, token input
!pip install transformers torch radon matplotlib seaborn ipywidgets --quiet
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import radon.complexity as radon_cc
import radon.metrics as radon_metrics
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd

# Hugging Face token: use the HF_TOKEN environment variable when it is set;
# otherwise prompt with getpass so the secret is not echoed into the saved
# notebook output (input() prints whatever is typed back into the cell output).
import os
from getpass import getpass

HFTOKEN = os.environ.get("HF_TOKEN") or getpass("Enter your Hugging Face token: ")

# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Utility functions for metrics
def evaluate_code_metrics(code):
    """Score a source string with radon and count its lines.

    Args:
        code: Source text to analyse.

    Returns:
        dict with three keys:
          'Cyclomatic Complexity' - sum of ``complexity`` over every block
              returned by ``radon_cc.cc_visit``, or None if that step raised.
          'Maintainability Index' - result of ``radon_metrics.mi_visit(code, True)``,
              or None if that step raised.
          'LOC' - number of newline-separated lines in ``code``.
    """
    def _or_none(compute):
        # Each metric is best-effort: any failure is reported as None.
        try:
            return compute()
        except Exception:
            return None

    return {
        'Cyclomatic Complexity': _or_none(
            lambda: sum(block.complexity for block in radon_cc.cc_visit(code))
        ),
        'Maintainability Index': _or_none(
            lambda: radon_metrics.mi_visit(code, True)
        ),
        'LOC': len(code.split('\n')),
    }

# Function to generate python code given model & tokenizer & prompt
def generate_code(model, tokenizer, prompt, max_length=256):
    """Run ``prompt`` through ``model`` and return the decoded first sequence.

    Args:
        model: Object exposing ``generate(**inputs, max_length=...)``.
        tokenizer: Callable tokenizer that also provides ``decode``.
        prompt: Text to tokenize and feed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The text decoded from ``outputs[0]`` with special tokens skipped.
    """
    # Tokenized inputs are moved to the notebook-level `device`.
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    generated = model.generate(**encoded, max_length=max_length)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Empty results table; the generation helpers below append one row per
# generated sample via pd.concat.
metrics_df = pd.DataFrame(
    columns=[
        'Model',
        'Prompt',
        'Code',
        'Cyclomatic Complexity',
        'Maintainability Index',
        'LOC',
    ]
)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hEnter your Hugging Face token: HF_TOKEN
Using device: cpu


In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import ipywidgets as widgets
from IPython.display import display, clear_output

import pandas as pd

# 🔹 Define your Hugging Face token and device
# Resolve the token without clobbering one that was already provided:
#   1. the HF_TOKEN environment variable,
#   2. an HFTOKEN value already defined in this kernel (e.g. by the setup cell),
#   3. the placeholder below as a last resort.
# Avoid pasting a real token into this cell - it would be saved with the notebook.
import os

HFTOKEN = (
    os.environ.get("HF_TOKEN")
    or globals().get("HFTOKEN")
    or "<your_huggingface_token_here>"
)
device = "cuda" if torch.cuda.is_available() else "cpu"

# 🔹 Global dataframe for collecting metrics
metrics_df = pd.DataFrame(columns=['Model', 'Prompt', 'Code', 'Cyclomatic Complexity', 'Maintainability Index', 'LOC'])

def evaluate_code_metrics(code):
    """Compute code quality metrics for a source string using Radon.

    Returns a dict with 'Cyclomatic Complexity' (sum of the complexity of all
    blocks from ``radon_cc.cc_visit``), 'Maintainability Index' (from
    ``radon_metrics.mi_visit(code, True)``) and 'LOC' (newline-separated line
    count). A metric whose computation raises is stored as None.
    """
    try:
        cyclomatic = sum(entry.complexity for entry in radon_cc.cc_visit(code))
    except Exception:
        cyclomatic = None

    try:
        maintainability = radon_metrics.mi_visit(code, True)
    except Exception:
        maintainability = None

    report = {}
    report['Cyclomatic Complexity'] = cyclomatic
    report['Maintainability Index'] = maintainability
    # One more line than there are newline characters.
    report['LOC'] = code.count('\n') + 1
    return report

def generate_code(model, tokenizer, prompt, max_length=256):
    """Generate a completion for ``prompt`` and return only the new text.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``; must be callable and provide
            ``decode``.
        prompt: Text prompt to complete.
        max_length: Maximum number of *new* tokens to generate (forwarded as
            ``max_new_tokens``).

    Returns:
        The decoded completion, stripped of surrounding whitespace, with the
        prompt removed.

    Note:
        Sampling is enabled (``do_sample=True``, ``temperature=0.7``), so the
        output varies between calls unless the caller seeds the RNG.
    """
    # Put the inputs where the model lives; fall back to the notebook-level
    # `device` only when the model does not expose a `.device` attribute.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = device
    inputs = tokenizer(prompt, return_tensors="pt").to(target_device)
    prompt_token_count = inputs["input_ids"].shape[-1]

    outputs = model.generate(**inputs, max_new_tokens=max_length, do_sample=True, temperature=0.7)

    # Drop the prompt by token position rather than by string replacement:
    # str.replace removes every occurrence of the prompt text (including inside
    # the generated code) and does nothing when decoding does not reproduce the
    # prompt character-for-character.
    completion_ids = outputs[0][prompt_token_count:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

def model_cell(model_name, hf_model_id):
    """Load a model and display an interactive generate-and-score widget.

    Args:
        model_name: Label used in log messages, on the button, and in the
            'Model' column of ``metrics_df``.
        hf_model_id: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Side effects:
        Loads the tokenizer and model, displays a prompt box, a button and an
        output area, and appends one row to the global ``metrics_df`` each
        time the button is clicked.
    """
    print(f"Loading {model_name}...")
    auth_kwargs = {"token": HFTOKEN}
    tokenizer = AutoTokenizer.from_pretrained(hf_model_id, **auth_kwargs)
    model = AutoModelForCausalLM.from_pretrained(
        hf_model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        **auth_kwargs,
    )
    print(f"{model_name} loaded and ready!")

    # --- Widgets ---
    box_layout = widgets.Layout(width="100%", height="80px")
    prompt_input = widgets.Textarea(
        value="# Write a Python function to compute factorial of a number.",
        placeholder="Enter your prompt here",
        description="Prompt:",
        layout=box_layout,
    )
    generate_button = widgets.Button(description=f"Generate with {model_name}")
    output_area = widgets.Output()

    # (label printed to the user, key in the metrics dict)
    metric_labels = (
        ("Cyclomatic Complexity", 'Cyclomatic Complexity'),
        ("Maintainability Index", 'Maintainability Index'),
        ("Lines of Code (LOC)", 'LOC'),
    )

    def on_generate(_button):
        """Button callback: generate, print, score, and record one sample."""
        global metrics_df
        with output_area:
            clear_output(wait=True)
            prompt = prompt_input.value
            print(f"Prompt:\n{prompt}\n")

            generated_code = generate_code(model, tokenizer, prompt)
            print(f"Generated Code:\n{generated_code}\n")

            metrics = evaluate_code_metrics(generated_code)
            print("Metrics:")
            for label, key in metric_labels:
                print(f"  {label}: {metrics[key]}")

            # Append this sample to the shared results table.
            record = {
                'Model': model_name,
                'Prompt': prompt,
                'Code': generated_code,
                **metrics,
            }
            metrics_df = pd.concat(
                [metrics_df, pd.DataFrame([record])],
                ignore_index=True,
            )

    generate_button.on_click(on_generate)
    display(prompt_input, generate_button, output_area)


In [3]:
# 3. Batch testing script (can be in another cell)
def batch_test(models_dict, prompts_list):
    """Run every prompt through every model and record metrics in ``metrics_df``.

    Args:
        models_dict: Mapping of display name -> Hugging Face model id.
        prompts_list: Iterable of prompt strings to run against each model.

    Side effects:
        Loads each model in turn, prints progress, and appends one row per
        (model, prompt) pair to the global ``metrics_df``.
    """
    global metrics_df
    for model_name, hf_model in models_dict.items():
        print(f"Batch testing {model_name}...", flush=True)
        # `token=` replaces the deprecated `use_auth_token=` and matches model_cell.
        tokenizer = AutoTokenizer.from_pretrained(hf_model, token=HFTOKEN)
        # device_map="auto" already places the weights, so no extra .to(device):
        # moving a dispatched model afterwards is unsupported.
        model = AutoModelForCausalLM.from_pretrained(
            hf_model,
            token=HFTOKEN,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )

        rows = []
        try:
            for prompt in prompts_list:
                generated_code = generate_code(model, tokenizer, prompt)
                metrics = evaluate_code_metrics(generated_code)
                rows.append({
                    'Model': model_name,
                    'Prompt': prompt,
                    'Code': generated_code,
                    'Cyclomatic Complexity': metrics['Cyclomatic Complexity'],
                    'Maintainability Index': metrics['Maintainability Index'],
                    'LOC': metrics['LOC'],
                })
        finally:
            # One concat per model instead of one per prompt; running it in
            # `finally` keeps the rows collected so far if a prompt fails.
            if rows:
                metrics_df = pd.concat([metrics_df, pd.DataFrame(rows)], ignore_index=True)
            # Drop references before the next model is loaded so two models
            # are not held in memory at once.
            del model, tokenizer
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        print(f"Completed batch testing for {model_name}.", flush=True)

In [1]:
# 4. Visualization cell (use after batch testing)
def visualize_metrics():
    """Draw one bar chart per metric, grouped by model, from ``metrics_df``.

    Reads the global ``metrics_df`` and shows three side-by-side bar plots
    (cyclomatic complexity, maintainability index, lines of code) with
    'Model' on the x axis. Prints a message and returns without plotting when
    there is no numeric metric data yet.
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns

    panels = [
        ("Cyclomatic Complexity", "Cyclomatic Complexity by Model"),
        ("Maintainability Index", "Maintainability Index by Model"),
        ("LOC", "Lines of Code by Model"),
    ]
    metric_cols = [column for column, _ in panels]

    # metrics_df starts as an empty frame, so its metric columns can end up
    # with object dtype; coerce them to numbers (None -> NaN) before plotting.
    df = metrics_df.copy()
    df[metric_cols] = df[metric_cols].apply(pd.to_numeric, errors="coerce")

    if not df[metric_cols].notna().any().any():
        print("No metrics to plot yet.")
        return

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for ax, (column, title) in zip(axes, panels):
        # Drop missing values per metric so a failed complexity computation
        # does not also remove that sample from the other two charts.
        available = df.dropna(subset=[column])
        if not available.empty:
            sns.barplot(data=available, x="Model", y=column, ax=ax)
        ax.set_title(title)
    fig.tight_layout()
    plt.show()

# Example prompts for batch testing; kept as a list so more can be appended.
sample_prompts = list((
    "Write a Python function to compute factorial of a number.",
    "Implement bubble sort algorithm in Python.",
    "Create a Python program to read and write files.",
    "Generate Python code for a simple REST API using Flask.",
    "Write Python code to connect to a MySQL database.",
    "Create a Python class for a linked list.",
    "Write Python code to parse JSON files.",
    "Write a Python script to scrape data from a website.",
    "Generate Python code to visualize data using matplotlib.",
    "Write a Python function to compute Fibonacci numbers.",
))

# Display name -> Hugging Face model id for every model under comparison.
models = dict([
    ("DeepSeek-Coder-1.3B", "deepseek-ai/deepseek-coder-1.3b-instruct"),
    ("Phi-2-2.7B", "microsoft/phi-2"),
    ("Gemma-2B-IT", "google/gemma-2b-it"),
    ("Stable-Code-3B", "stabilityai/stable-code-3b"),
    ("Replit-Code-3B", "replit/replit-code-v1-3b"),
])

# To run batch testing in a cell:
# batch_test(models, sample_prompts)

# To visualize after testing:
# visualize_metrics()

# You can run each model cell independently like:
# model_cell("DeepSeek-Coder-1.3B", "deepseek-ai/deepseek-coder-1.3b-instruct")