## Logging into my Hugging Face account to use StarCoder, as it is a gated repository

In [None]:
import os

from huggingface_hub import login

# Access tokens must never be committed in source. The token is read from the
# HF_TOKEN environment variable; when it is unset, `token` is None and
# `login` falls back to asking for the token interactively.
# Any token that was ever stored in this file should be treated as leaked and
# revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

## Reading the test data used for evaluation

In [None]:
import pandas as pd
# Load the evaluation set from CSV; its `question` column is read further down.
QA_CSV_PATH = "/home/baskar/CALIX_LLM/CodeGenEvalPipeline/experiments/python_questions_answers.csv"
data = pd.read_csv(QA_CSV_PATH)

# `df` is the frame the rest of the notebook works with.
df = pd.DataFrame(data=data)
df

## Downloading the tokenizer and the generation model (here: StarCoder)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from langchain.llms import HuggingFaceHub
import accelerate
import bitsandbytes

# Checkpoint identifier of the StarCoder model to load.
model_name = 'bigcode/starcoder'

# 4-bit NF4 quantization settings (double quantization, bfloat16 compute).
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

# Load the tokenizer first, then the causal LM with the quantization config.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

## Generating code for the questions in the DataFrame created from the test data

In [None]:
# Define function to generate code
def generate_code_from_question(question: str, tokenizer, model, max_length: int = 150) -> str:
    """Generate text for one question and return it decoded.

    Args:
        question: Prompt text passed to the tokenizer.
        tokenizer: Callable tokenizer that supports ``return_tensors="pt"``
            and provides ``eos_token_id`` and ``decode``.
        model: Model exposing ``device`` and ``generate``.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first returned sequence, decoded with special tokens skipped.
    """
    # return_tensors="pt" is required: without it the tokenizer hands back
    # plain Python lists, which have no `.to()` method.
    encoded = tokenizer(question, truncation=True, return_tensors="pt")
    input_ids = encoded["input_ids"].to(model.device)
    # The pad token is set to the EOS token below, so the mask is passed
    # explicitly instead of leaving it to be inferred from the ids.
    attention_mask = encoded["attention_mask"].to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Apply code generation to every question.
# The result is stored on `df` (not `data`): `df` is the frame that the
# following cells read the `generated_code` column from.
df['generated_code'] = df['question'].apply(lambda q: generate_code_from_question(q, tokenizer, model))

# Display the DataFrame with generated code
df

In [None]:
# Show the generated text stored at entry 0 of the `generated_code` column.
first_generated = df["generated_code"][0]
print(first_generated)

## Evaluating Cyclomatic Complexity

Cyclomatic complexity is calculated based on the control flow graph of a program. The graph consists of nodes (representing blocks of code) and edges (representing control flow between these blocks).

The cyclomatic complexity is $V(G) = E - N + 2P$, where $E$ is the number of edges in the control flow graph, $N$ is the number of nodes in the control flow graph, and $P$ is the number of connected components in the graph (usually 1 for a single program).

In [None]:
from radon.complexity import cc_visit, cc_rank
def cyclomatic_complexity(generated_code):
    """Print radon's cyclomatic-complexity report for a piece of Python source.

    Args:
        generated_code: Python source text to analyse.

    Returns:
        None. For each item returned by ``cc_visit`` the name, complexity and
        rank are printed; if the text cannot be parsed as Python, a single
        diagnostic line is printed instead.
    """
    try:
        complexity_info = cc_visit(generated_code)
    except SyntaxError as err:
        # Model output is not guaranteed to be valid Python. Report the
        # problem and return so that one bad sample does not abort a whole
        # DataFrame.apply run.
        print(f"Skipping complexity analysis, code is not valid Python: {err}")
        return

    # Display the results
    for item in complexity_info:
        print(f"Function Name: {item.name}")
        print(f"Cyclomatic Complexity: {item.complexity}")
        print(f"Complexity Rank: {cc_rank(item.complexity)}")
# Report the cyclomatic complexity of every generated sample.
df['generated_code'].apply(cyclomatic_complexity)

## ROUGE score
The ROUGE (Recall-Oriented Understudy for Gisting Evaluation) score is a set of metrics that evaluates the quality of a summary or other generated text by comparing it to a reference text.

In [None]:
from rouge_score import rouge_scorer

# Reference text that the generated code is scored against.
# NOTE(review): `ground_truth` and `generated_code` are not assigned in any
# cell visible in this notebook - confirm where they are meant to come from.
ground_truth_code = ground_truth

# Scorer for the ROUGE-1, ROUGE-2 and ROUGE-L variants, with stemming enabled.
rouge_variants = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)

# The reference is the target, the generated code is the prediction.
scores = scorer.score(target=ground_truth_code, prediction=generated_code)

# One line per ROUGE variant: precision, recall and F1.
for key in scores:
    print(f"{key}: Precision: {scores[key].precision:.4f}, Recall: {scores[key].recall:.4f}, F1-Score: {scores[key].fmeasure:.4f}")

## Linting
Linting is the process of analyzing source code to identify and report potential errors, stylistic issues, and other inconsistencies

In [None]:
import pylint
import subprocess

# Source text to lint.
# NOTE(review): `generated_code` is not assigned in any cell visible in this
# notebook - confirm where it is meant to come from.
code = generated_code

# Write the text to lint_test.py in the current working directory.
with open('lint_test.py', mode='w') as lint_file:
    lint_file.write(code)

def lint_with_pylint(file_path):
    """Invoke the ``pylint`` command on *file_path* and return its stdout as text.

    Both stdout and stderr of the child process are captured; only stdout is
    returned.
    """
    command = ['pylint', file_path]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    return completed.stdout

# Lint 'lint_test.py' - the file the code was written to earlier in this
# cell - and print pylint's report.
pylint_output = lint_with_pylint('lint_test.py')
print("pylint Linting Results:")
print(pylint_output)

## Embedding Similarity
Embedding similarity is a technique used to measure the similarity between data points (e.g., words, sentences, or documents) by comparing their vector representations. Embeddings are dense vector representations of data points that capture semantic meaning, and similarity measures help determine how closely related or similar two embeddings are.

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained model for code embeddings.
# It is bound to its own name so that the global `model` (the StarCoder model
# passed to generate_code_from_question) is not overwritten when this cell
# runs; otherwise re-running the generation cell afterwards would fail.
embedding_model = SentenceTransformer('microsoft/codebert-base')

# Reference code to compare the generated code against.
# NOTE(review): `ground_truth` and `generated_code` are not assigned in any
# cell visible in this notebook - confirm where they are meant to come from.
ground_truth_code = ground_truth

# Compute embeddings
generated_embedding = embedding_model.encode(generated_code)
ground_truth_embedding = embedding_model.encode(ground_truth_code)

# Compute cosine similarity
similarity = util.cos_sim(generated_embedding, ground_truth_embedding)

print(f"Code Similarity: {similarity.item():.4f}")