In [12]:
from numpy.random import choice
from math import sqrt
from transformers import RobertaTokenizer, RobertaModel, GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
import numpy as np

# KMeans clustering class definition
class KMeans():
    def __init__(self, num_centroids) -> None:
        self.num_centroids = num_centroids
        self.centroids = []

    def select_centroids(self, data):
        indices = [i for i in range(data.shape[0])]
        centroid_indices = choice(indices, self.num_centroids, replace=False)
        self.centroids = [data[i] for i in centroid_indices]

    def calculate_distance(self, point1, point2):
        squared_diff = [(point1[0, i] - point2[0, i]) ** 2 for i in range(point1.shape[1])]
        distance = sqrt(sum(squared_diff))
        return distance

    def find_closest_centroid(self, point):
        min_distance = self.calculate_distance(point, self.centroids[0])
        closest_centroid_idx = 0

        for i in range(1, len(self.centroids)):
            distance = self.calculate_distance(point, self.centroids[i])
            if distance < min_distance:
                min_distance = distance
                closest_centroid_idx = i

        return closest_centroid_idx

    def calculate_centroid_sum(self, data, assignments, centroid_idx):
        return sum([data[i] for i in range(data.shape[0]) if assignments[i] == centroid_idx])

    def count_points_assigned_to_centroid(self, assignments, centroid_idx):
        return assignments.count(centroid_idx)

    def train(self, data):
        self.select_centroids(data)
        converged = False

        while not converged:
            assignments = [self.find_closest_centroid(data[i]) for i in range(data.shape[0])]
            max_centroid_shift = -1

            for centroid_idx in range(self.num_centroids):
                if self.count_points_assigned_to_centroid(assignments, centroid_idx) != 0:
                    new_centroid = self.calculate_centroid_sum(data, assignments, centroid_idx) / self.count_points_assigned_to_centroid(assignments, centroid_idx)
                else:
                    new_centroid = self.centroids[centroid_idx]
                distance = self.calculate_distance(self.centroids[centroid_idx], new_centroid)

                if distance > max_centroid_shift:
                    max_centroid_shift = distance
                self.centroids[centroid_idx] = new_centroid

            if max_centroid_shift < 0.05:
                converged = True

    def predict(self, data):
        return [self.find_closest_centroid(point) for point in data]

# Functions to get embeddings using CodeBERT
def get_embedding_for_code_snippet(code_snippet) -> np.array:
    tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
    model = RobertaModel.from_pretrained("microsoft/codebert-base")

    inputs = tokenizer(code_snippet, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    embedding = outputs.last_hidden_state.mean(dim=1).numpy()
    return embedding

# Function to read code snippet from a file
def read_code_snippet(file_path):
    with open(file_path, "r") as file:
        code = file.read()
    return code

# Function to read CSV file
def read_from_file(file_path):
    return pd.read_csv(file_path)

# Function to get the scope from a CSV file based on index
def get_scope(file_path, scope_index):
    dataframe = read_from_file(file_path)
    scopes = list(set(dataframe["scop"]))
    return scopes[scope_index]

# Function to generate and train the KMeans classifier
def generate_classifier(file_path):
    dataframe = read_from_file(file_path)
    code_paths = dataframe["code_snippet"]
    scopes = dataframe["scop"]
    num_centroids = len(set(scopes))

    code_snippets = [read_code_snippet(file) for file in code_paths]
    embeddings = [get_embedding_for_code_snippet(code) for code in code_snippets]

    classifier = KMeans(num_centroids)
    classifier.train(np.array(embeddings))
    return classifier

# Function to generate the scope for a given code
def generate_code_scope(code, classifier):
    embedding = get_embedding_for_code_snippet(code)
    return classifier.predict(np.array([embedding]))[0]

# Example usage
classifier = generate_classifier("codes.csv")

code = """def factorial(n):
    if n == 0:
        return 1
    else:
        return n * factorial(n-1)
"""
scope_index = generate_code_scope(code, classifier)
code_scope = get_scope("codes.csv", scope_index)
print(code_scope)

code = """def fibonacci(n):
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a
"""
scope_index = generate_code_scope(code, classifier)
code_scope = get_scope("codes.csv", scope_index)
print(code_scope)

Algoritmi pe siruri de caractere
Algoritmi pe siruri de caractere


In [13]:
# Function to generate comments using GPT-2
def generate_comments_with_model(code):
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    prompt = "Generate a detailed comment for the following Python function:\n\n" + code + "\n\nComment:\n"
    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

    outputs = model.generate(inputs, max_length=150, num_return_sequences=1, temperature=0.8, do_sample=True, top_k=50)
    generated_comment = tokenizer.decode(outputs[0], skip_special_tokens=True)

    comment_start = generated_comment.find("Comment:") + len("Comment:")
    if comment_start == -1:
        return "No comment generated"
    comment = generated_comment[comment_start:].strip()

    return comment

python_code_without_comments = """
def find_maximum(numbers):
    max_number = numbers[0]
    for number in numbers:
        if number > max_number:
            max_number = number
    return max_number
"""

generated_comment = generate_comments_with_model(python_code_without_comments)
print(generated_comment)

python_code_without_comments = """
def reverse_string(s):
    reversed_s = ''
    for char in s:
        reversed_s = char + reversed_s
    return reversed_s
"""

generated_comment = generate_comments_with_model(python_code_without_comments)
print(generated_comment)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Python's max_number is a very important property of Python. Not only does it determine the value of a number, but also whether it is actually a valid number.

The above method looks up the minimum and maximum number to get a list of all possible integers. Here's an example:


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


http://www.python.org/reference/python_string_types.html#reverse_string and http://www.python.org/reference/python_string_types.html#reverse_string is a variant of reverse_tuple with the same semantics as described in http://support.python.org/download/index.html and http://support.python.org/download


In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_model():
    """
    Load the tokenizer and model for code generation.
    """
    tokenizer = AutoTokenizer.from_pretrained("Daoguang/PyCodeGPT")
    model = AutoModelForCausalLM.from_pretrained("Daoguang/PyCodeGPT")
    return tokenizer, model

def generate_code(description, tokenizer, model, max_length=100):
    """
    Generate code from a given description using the provided tokenizer and model.

    Args:
        description (str): The text description to generate code from.
        tokenizer (AutoTokenizer): The tokenizer for encoding the input.
        model (AutoModelForCausalLM): The model for generating the code.
        max_length (int): The maximum length of the generated code. Default is 100.

    Returns:
        str: The generated code.
    """
    # Encode the input description
    inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True)

    # Generate code from the description
    outputs = model.generate(
        inputs['input_ids'], 
        attention_mask=inputs['attention_mask'], 
        max_length=max_length, 
        num_return_sequences=1
    )

    # Decode and return the generated code
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_code

# Load the model and tokenizer
tokenizer, model = load_model()

# Example 1: Generate a function to add two numbers
description1 = "Write a Python function to verify if a number is prime"
generated_code1 = generate_code(description1, tokenizer, model)
print("Example 1 - Generated Code:")
print(generated_code1)

# Example 2: Generate a function to find the factorial of a number
description2 = "Write a Python function to find the factorial of a number"
generated_code2 = generate_code(description2, tokenizer, model)
print("\nExample 2 - Generated Code:")
print(generated_code2)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Example 1 - Generated Code:
Write a Python function to verify if a number is prime.

def isPrime(n):
    if n < 2:
        return False
    for i in range(2, int(math.sqrt(n))+1):
        if n % i == 0:
            return False
    return True

print(isPrime(10))

# This is a test function.
# It will print the number of primes below 10.

# This is a test function.

Example 2 - Generated Code:
Write a Python function to find the factorial of a number.

def factorial(n):
    if n == 0:
        return 1
    return n * factorial(n-1)

print(factorial(10))

# This is a Python function that takes a number and returns the factorial of it.

def factorial(n):
    if n == 0:
        return 1
    return n * factorial(n-1)

print(factorial(10))

