In [None]:
from datasets import load_from_disk, load_dataset
from datasets.dataset_dict import DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
#login into huggingface account
"""login(token="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")"""

In [None]:
data = pd.read_csv("/home/baskar/CALIX_LLM/CodeGenEvalPipeline/experiments/python_questions_and_code.csv")
question = pd.DataFrame(data["Question"])
code = pd.DataFrame(data["Code"])
question, code

In [None]:
temp_dataset = dataset["compile"]
temp_dataset = dataset["compile"].remove_columns(["most_similar", "avg_similarity_score"])

In [None]:
# Load a tokenizer from Hugging Face
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder")

In [None]:
def tokenize_function(dataset):
    
    # Tokenize the code snippetsataset
    
    return tokenizer(data["Question"], truncation=True)
                    
def filter_function(dataset):
    
    #filter dataset and keep only the code containing code of length 20 to 50 length
    
    return len(data['Question']) > 20 and len(data['Question']) < 50
    
tokenized_dataset = temp_dataset.map(tokenize_function, batched=True)
filtered_dataset = tokenized_dataset.filter(filter_function)
filtered_dataset

In [None]:
from langchain.llms import HuggingFaceHub
import os
from getpass import getpass

os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("HF Token:")

In [None]:
# Initialize the Hugging Face Hub with StarCoder
starcoder = HuggingFaceHub(repo_id="bigcode/starcoder", model_kwargs={"temperature": 0.5, "max_tokens":100})

# Test the model with a simple input
prompt = "write a python function for sum of first 10 natural numbers"
generated_code = starcoder(prompt)
generated_code = "#" + generated_code
generated_code

In [None]:
from pydantic import BaseModel, ValidationError
import re

# Define a Pydantic model for the structure you expect
class FunctionModel(BaseModel):
    function_name: str
    arguments: list[str]
    return_type: str
    code: str

In [None]:
# Function to parse the generated code
def parse_function_code(generated_code: str):
    # Extract function name
    function_name_match = re.search(r"writedef (\w+)\(import#", generated_code)
    function_name = function_name_match.group(1) if function_name_match else None

    # Extract arguments
    arguments_match = re.search(r"write def \w+\((.*?)\)import#", generated_code)
    arguments = arguments_match.group(1).split(", ") if arguments_match else []

    # Extract return type (if any)
    return_type_match = re.search(r"-> (\w+):", generated_code)
    return_type = return_type_match.group(1) if return_type_match else "None"

    # Strip the code to include just the function body
    code_body = "\n".join(generated_code.splitlines()[1:]).strip()

    # Create a FunctionModel instance
    try:
        function_model = FunctionModel(
            function_name=function_name,
            arguments=arguments,
            return_type=return_type,
            code=code_body
        )
        return function_model
    except ValidationError as e:
        print("Validation Error:", e)
        return None

# Parse and validate the generated code
structured_code = parse_function_code(generated_code)

print("\nStructured Output:")
print(structured_code)

In [None]:
from radon.complexity import cc_visit, cc_rank

# Define the code you want to analyze
code = generated_code

# Compute the cyclomatic complexity
complexity_info = cc_visit(code)

# Display the results
for item in complexity_info:
    print(f"Function Name: {item.name}")
    print(f"Cyclomatic Complexity: {item.complexity}")
    print(f"Complexity Rank: {cc_rank(item.complexity)}")

In [None]:
import pylint
import subprocess

code = generated_code

with open('example.py', 'w') as f:
    f.write(code)

def lint_with_pylint(file_path):
    """Run pylint on the specified file and return the output."""
    result = subprocess.run(['pylint', file_path], capture_output=True, text=True)
    return result.stdout

# Check the code and print results
pylint_output = lint_with_pylint('example.py')
print("pylint Linting Results:")
print(pylint_output)

In [None]:
ground_truth = """def Fibonacci(n):

    # Check if input is 0 then it will
    # print incorrect input
    if n < 0:
        print("Incorrect input")

    # Check if n is 0
    # then it will return 0
    elif n == 0:
        return 0

    # Check if n is 1,2
    # it will return 1
    elif n == 1 or n == 2:
        return 1

    else:
        return Fibonacci(n-1) + Fibonacci(n-2)"""

import nltk
from nltk.translate.bleu_score import corpus_bleu

# Ensure you have the necessary NLTK data
nltk.download('punkt')

# Example ground truth and generated code
reference_code = [
    [generated_code]
]

generated_code = [
    [ground_truth]
]

# Convert references and generated code to the required format
# `references` needs to be a list of lists of lists
references = [[reference_code]]  # List of lists, where each sublist is a list of references
hypotheses = [generated_code]    # List of hypotheses

# Calculate BLEU score
bleu_score = corpus_bleu([reference_code], generated_code)
print(f"BLEU score: {bleu_score:.4f}")

In [None]:
from rouge_score import rouge_scorer

# Example ground truth and generated code
ground_truth_code = ground_truth

# Create a ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE scores
scores = scorer.score(ground_truth_code, generated_code)

# Print the ROUGE scores
for key in scores:
    print(f"{key}: Precision: {scores[key].precision:.4f}, Recall: {scores[key].recall:.4f}, F1-Score: {scores[key].fmeasure:.4f}")

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained model for code embeddings
model = SentenceTransformer('microsoft/codebert-base')

# Example generated code and ground truth code
generated_code = generated_code
ground_truth_code = ground_truth

# Compute embeddings
generated_embedding = model.encode(generated_code)
ground_truth_embedding = model.encode(ground_truth_code)

# Compute cosine similarity
similarity = util.cos_sim(generated_embedding, ground_truth_embedding)

print(f"Code Similarity: {similarity.item():.4f}")

In [1]:
from langchain.llms import HuggingFaceHub
import os
from getpass import getpass

os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("HF Token:")

HF Token: ········


In [2]:
from huggingface_hub import login
login(token="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")

  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/baskar/.cache/huggingface/token
Login successful


In [22]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from langchain.llms import HuggingFaceHub

# Define the questions and corresponding code snippets
# Define the questions and corresponding code snippets
data = {
    'question': [
        'Write a Python function to reverse a string.',
        'Create a function to check if a number is even.',
        'Write a function to find the maximum of three numbers.',
        'Create a function to calculate the factorial of a number.',
        'Write a Python function to count the number of vowels in a string.',
        'Create a function to check if a string is a palindrome.',
        'Write a function to sort a list of integers in ascending order.',
        'Create a function to generate a Fibonacci sequence up to n numbers.',
        'Write a Python function to merge two dictionaries.',
        'Create a function to read a file and count the number of lines in it.'
    ],
    'code': [
        '''def reverse_string(s):
    return s[::-1]''',
        
        '''def is_even(num):
    return num % 2 == 0''',
        
        '''def max_of_three(a, b, c):
    return max(a, b, c)''',
        
        '''def factorial(n):
    if n == 0:
        return 1
    else:
        return n * factorial(n - 1)''',
        
        '''def count_vowels(s):
    vowels = 'aeiou'
    count = sum(1 for char in s.lower() if char in vowels)
    return count''',
        
        '''def is_palindrome(s):
    s = s.lower()
    return s == s[::-1]''',
        
        '''def sort_list(lst):
    return sorted(lst)''',
        
        '''def fibonacci(n):
    sequence = []
    a, b = 0, 1
    while a < n:
        sequence.append(a)
        a, b = b, a + b
    return sequence''',
        
        '''def merge_dicts(dict1, dict2):
    result = dict1.copy()
    result.update(dict2)
    return result''',
        
        '''def count_lines_in_file(file_path):
    with open(file_path, 'r') as file:
        return len(file.readlines())'''
    ]
}

# Create the DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

# Create the DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)
df = pd.DataFrame(data)

# Initialize Starcoder model
model_name = 'bigcode/starcoder2-3b'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = HuggingFaceHub(repo_id=model_name, model_kwargs={"temperature": 0.5, "max_tokens":100})

                                            question  \
0       Write a Python function to reverse a string.   
1    Create a function to check if a number is even.   
2  Write a function to find the maximum of three ...   
3  Create a function to calculate the factorial o...   
4  Write a Python function to count the number of...   
5  Create a function to check if a string is a pa...   
6  Write a function to sort a list of integers in...   
7  Create a function to generate a Fibonacci sequ...   
8  Write a Python function to merge two dictionar...   
9  Create a function to read a file and count the...   

                                                code  
0         def reverse_string(s):\n    return s[::-1]  
1         def is_even(num):\n    return num % 2 == 0  
2  def max_of_three(a, b, c):\n    return max(a, ...  
3  def factorial(n):\n    if n == 0:\n        ret...  
4  def count_vowels(s):\n    vowels = 'aeiou'\n  ...  
5  def is_palindrome(s):\n    s = s.lower()\n    ... 



In [25]:
# Define function to generate code
def generate_code_from_question(question: str, tokenizer, model) -> str:
    inputs = tokenizer(question, truncation=True)
    outputs = model(question)
    outputs = "#" + outputs
    return outputs

# Apply code generation
df['generated_code'] = df['question'].apply(lambda q: generate_code_from_question(q, tokenizer, model))

# Display the DataFrame with generated code
df

Unnamed: 0,question,code,generated_code
0,Write a Python function to reverse a string.,def reverse_string(s):\n return s[::-1],#Write a Python function to reverse a string. ...
1,Create a function to check if a number is even.,def is_even(num):\n return num % 2 == 0,#Create a function to check if a number is eve...
2,Write a function to find the maximum of three ...,"def max_of_three(a, b, c):\n return max(a, ...",#Write a function to find the maximum of three...
3,Create a function to calculate the factorial o...,def factorial(n):\n if n == 0:\n ret...,#Create a function to calculate the factorial ...
4,Write a Python function to count the number of...,def count_vowels(s):\n vowels = 'aeiou'\n ...,#Write a Python function to count the number o...
5,Create a function to check if a string is a pa...,def is_palindrome(s):\n s = s.lower()\n ...,#Create a function to check if a string is a p...
6,Write a function to sort a list of integers in...,def sort_list(lst):\n return sorted(lst),#Write a function to sort a list of integers i...
7,Create a function to generate a Fibonacci sequ...,"def fibonacci(n):\n sequence = []\n a, b...",#Create a function to generate a Fibonacci seq...
8,Write a Python function to merge two dictionar...,"def merge_dicts(dict1, dict2):\n result = d...",#Write a Python function to merge two dictiona...
9,Create a function to read a file and count the...,def count_lines_in_file(file_path):\n with ...,#Create a function to read a file and count th...


In [27]:
print(df["generated_code"][0])

#Write a Python function to reverse a string. 
# -

# Write your code here
def rever(str):
    return str[::-1]
print(rever("Hello World"))


# + [markdown] button=false new_sheet=false run_control={"read_only": false}
# Double-click __here__ for the solution.
# <!-- 
# def reverse(string):
#     length = len(string)
#     reverse = ''
#    


In [28]:
from radon.complexity import cc_visit, cc_rank
def cyclomatic_complexity(generated_code):
    # Define the code you want to analyze
    code = generated_code
    
    # Compute the cyclomatic complexity
    complexity_info = cc_visit(code)
    
    # Display the results
    for item in complexity_info:
        print(f"Function Name: {item.name}")
        print(f"Cyclomatic Complexity: {item.complexity}")
        print(f"Complexity Rank: {cc_rank(item.complexity)}")
df['generated_code'].apply(lambda q: cyclomatic_complexity(q))

Function Name: rever
Cyclomatic Complexity: 1
Complexity Rank: A


SyntaxError: invalid syntax (<unknown>, line 3)