In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import pickle
import numpy as np

Define functions

In [None]:
def language_tokenize(code):
    """Split a piece of source code into a flat list of token strings.

    A token is either a run of word characters (``\\w+``) or a run of the
    punctuation/operator characters listed in the character class below.
    Text matching neither alternative (e.g. whitespace) produces no token.
    """
    token_pattern = r'\w+|[\@\{\}\[\]\(\)\;\.\,\<\>\=\+\-\*\/\!\&\|]+'
    return RegexpTokenizer(token_pattern).tokenize(code)

In [None]:
def compute_token_accuracy(predicted_code, expected_code):
    """Return the share of position-wise matching tokens between two snippets.

    Both inputs are tokenized with ``language_tokenize`` and compared index
    by index. Only the overlapping prefix is scored: the denominator is the
    length of the shorter token list, so surplus tokens on either side are
    ignored. Returns 0 when either side yields no tokens.
    """
    predicted = language_tokenize(predicted_code)
    expected = language_tokenize(expected_code)

    overlap = min(len(expected), len(predicted))
    if overlap <= 0:
        return 0

    matches = 0
    for predicted_token, expected_token in zip(predicted, expected):
        if predicted_token == expected_token:
            matches += 1

    return matches / overlap

In [None]:
def generate_prompts_and_completions(files, split_ratio=0.85):
    """Cut every item of ``files`` into a ``(prompt, expected_output)`` pair.

    The cut index is ``int(len(item) * split_ratio)``: everything before it
    becomes the prompt and the remainder becomes the expected completion.
    Pairs are returned as a list, in the same order as ``files``.
    """
    def _split(content):
        cut = int(len(content) * split_ratio)
        return content[:cut], content[cut:]

    return [_split(content) for content in files]

In [None]:
def evaluate_model(model, tokenizer, test_data, max_new_tokens=512, device=0):
    """Generate a completion for every prompt and score it against the reference.

    Args:
        model: Causal language model handed to the text-generation pipeline.
        tokenizer: Tokenizer handed to the same pipeline.
        test_data: Iterable of ``(prompt, expected)`` pairs.
        max_new_tokens: Upper bound on the number of tokens generated per
            prompt (prompt tokens are not counted).
        device: Device passed to ``pipeline``. The default ``0`` keeps the
            original behaviour (first GPU); pass ``-1`` to run on CPU.

    Returns:
        ``(results, accuracies)`` where ``results[i]`` is the prompt followed
        by the generated continuation and ``accuracies[i]`` is the
        ``compute_token_accuracy`` of the continuation alone against the
        expected completion.
    """
    generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=device)
    results = []
    accuracies = []

    for prompt, expected in test_data:
        # return_full_text=False: by default the pipeline echoes the prompt in
        # front of the completion, which would make the accuracy compare the
        # beginning of the prompt with the expected completion.
        # Only max_new_tokens is passed: max_length also counts prompt tokens
        # and conflicts with max_new_tokens when both are given.
        output = generator(prompt, max_new_tokens=max_new_tokens, return_full_text=False)
        continuation = output[0]['generated_text']

        # Keep the full text in `results`, as callers received before.
        results.append(prompt + continuation)
        accuracies.append(compute_token_accuracy(continuation, expected))

    return results, accuracies

In [None]:
def load_data(filename):
    """Read and return the object pickled in the file at ``filename``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(filename, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

In [None]:
def plot_boxplot(accuracies_kt, accuracies_py):
    """Show side-by-side box plots of the Kotlin and Python accuracy samples.

    Draws a new 8x6 figure with one filled box per sample and displays it.
    """
    plt.figure(figsize=(8, 6))
    plt.boxplot(
        [accuracies_kt, accuracies_py],
        patch_artist=True,
        labels=['Kotlin Files', 'Python Files'],
    )
    plt.ylabel('Accuracy')
    plt.title('Comparison of Model Accuracies on Kotlin vs. Python Files')
    plt.grid(True)
    plt.show()

In [None]:
def load_model_and_tokenizer(model_path="model", tokenizer_path="tokenizer"):
    """Load a causal LM and a tokenizer via ``from_pretrained``.

    Returns a ``(model, tokenizer)`` tuple; the model is loaded from
    ``model_path`` first, then the tokenizer from ``tokenizer_path``.
    """
    return (
        AutoModelForCausalLM.from_pretrained(model_path),
        AutoTokenizer.from_pretrained(tokenizer_path),
    )

Load saved files

In [None]:
# Load the two pickled evaluation sets (paths are relative to the working directory).
kotlin_files = load_data(filename='kotlin_files.pkl')
python_files = load_data(filename='python_files.pkl')

In [None]:
# Load the model and tokenizer from the default local "model" / "tokenizer" paths.
model, tokenizer = load_model_and_tokenizer(model_path="model", tokenizer_path="tokenizer")

Generate prompts and completions

In [None]:
# Split every file into (prompt, expected completion) using the default split ratio.
# `test_files_splited` holds the Kotlin pairs, `python_files_splited` the Python pairs.
test_files_splited = generate_prompts_and_completions(files=kotlin_files)
python_files_splited = generate_prompts_and_completions(files=python_files)

In [None]:
# Run the same model over both test sets: Kotlin first, then Python.
results_kt, accuracies_kt = evaluate_model(model=model, tokenizer=tokenizer, test_data=test_files_splited)
results_py, accuracies_py = evaluate_model(model=model, tokenizer=tokenizer, test_data=python_files_splited)

In [None]:
# Compare the per-file accuracy distributions of the two languages.
plot_boxplot(accuracies_kt=accuracies_kt, accuracies_py=accuracies_py)

In [None]:
# Mean and median per-file accuracy for each language, one figure per line.
print(
    f"Kotlin files mean {np.mean(accuracies_kt)}",
    f"Kotlin files median {np.median(accuracies_kt)}",
    f"Python files mean {np.mean(accuracies_py)}",
    f"Python files median {np.median(accuracies_py)}",
    sep="\n",
)

We can see that the Kotlin files have a better mean and median accuracy than the Python files.