In [1]:
from transformers import PhiForCausalLM, AutoTokenizer

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import numpy as np

import utils
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier shared by the tokenizer and the causal language model.
model_name = "microsoft/phi-1_5"

# Load the tokenizer and reuse its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained weights first, then move the model to the selected device.
model = PhiForCausalLM.from_pretrained(model_name)
model = model.to(device)

In [None]:
# Fixed seed so that Restart & Run All reproduces the same train/test partition.
SPLIT_SEED = 42

# `kotlin_code` is unpacked into train_test_split, which returns a train/test
# pair for each array passed in (here: prompts, then answers).
kotlin_code = utils.extract_kotlin_code()
train_kotlin_prompts, test_kotlin_prompts, train_kotlin_answers, test_kotlin_answers = train_test_split(
    *kotlin_code, test_size=0.2, random_state=SPLIT_SEED
)

train_texts = utils.CodeCompletionDataset(train_kotlin_prompts, train_kotlin_answers, train=True)
train_dataset = utils.TrainDataset(train_texts, tokenizer)

test_kotlin_dataset = utils.CodeCompletionDataset(test_kotlin_prompts, test_kotlin_answers, train=False)

# Later cells read `test_codexglue_texts` (unpacked into prompts / ground truths
# and indexed with [0] and [1]), so bind it here instead of relying on leftover
# kernel state.
# NOTE(review): assumes utils.read_codexglue_test_data() returns that
# (prompts, ground_truths) pair -- confirm against utils.
test_codexglue_texts = utils.read_codexglue_test_data()
# NOTE(review): unlike the Kotlin datasets above, this constructor call receives a
# single positional argument; the call is kept as it was -- verify that
# CodeCompletionDataset accepts the pair in this form.
test_codexglue_dataset = utils.CodeCompletionDataset(test_codexglue_texts, train=False)

In [None]:
# Presumably fine-tunes the model on the training dataset (utils.train_model is
# not visible here -- confirm). The name `model` is rebound to the returned
# object, so every later cell uses whatever this call returns.
# NOTE(review): because the input and output share the name `model`, re-running
# this cell passes the already-processed model back in.
model = utils.train_model(model, tokenizer, train_dataset)

In [None]:
# Split the CodeXGLUE test data into its two parts: prompts and ground truths.
# `test_codexglue_texts` must already be bound by an earlier cell and must
# unpack into exactly two items.
test_prompts, test_ground_truths = test_codexglue_texts
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge

def calculate_bleu_score(predictions, references):
    """Compute the corpus-level BLEU score of predictions against references.

    Both inputs are iterables of strings. Every string is tokenized by
    whitespace, and each prediction is scored against exactly one reference
    (the one at the same position).

    Returns:
        The value returned by ``corpus_bleu``.
    """
    # corpus_bleu expects a *list of references* per hypothesis, hence the
    # extra level of nesting around each tokenized reference.
    reference_sets = []
    for reference in references:
        reference_sets.append([reference.split()])

    hypotheses = [prediction.split() for prediction in predictions]
    return corpus_bleu(reference_sets, hypotheses)

def calculate_rouge_score(predictions, references):
    """Compute mean ROUGE-1, ROUGE-2 and ROUGE-L F-scores over aligned pairs.

    Args:
        predictions: Iterable of predicted strings.
        references: Iterable of reference strings, aligned with ``predictions``.

    Returns:
        tuple[float, float, float]: Mean F-scores for ``rouge-1``, ``rouge-2``
        and ``rouge-l``, averaged over *all* pairs. A pair whose prediction or
        reference has no scorable content counts as 0.0 instead of aborting the
        whole evaluation. Empty input yields ``(0.0, 0.0, 0.0)``.

    Raises:
        ValueError: If ``predictions`` and ``references`` differ in length.
    """
    predictions = list(predictions)
    references = list(references)
    if len(predictions) != len(references):
        raise ValueError(
            "predictions and references must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )

    total = len(predictions)
    if total == 0:
        return 0.0, 0.0, 0.0

    def _is_scorable(text):
        # Text made up only of whitespace and periods has no tokens to match.
        # The `rouge` package is expected to reject such input with an error
        # (TODO confirm exact condition), so these pairs are kept out of the
        # scorer and counted as zero overlap.
        return bool(text.replace(".", "").strip())

    scorable_pairs = [
        (prediction, reference)
        for prediction, reference in zip(predictions, references)
        if _is_scorable(prediction) and _is_scorable(reference)
    ]
    if not scorable_pairs:
        return 0.0, 0.0, 0.0

    hypotheses, targets = map(list, zip(*scorable_pairs))
    per_pair_scores = Rouge().get_scores(hypotheses, targets, avg=False)

    def _mean_f(metric):
        # Divide by the full pair count so skipped pairs contribute 0.0.
        return sum(score[metric]['f'] for score in per_pair_scores) / total

    return _mean_f('rouge-1'), _mean_f('rouge-2'), _mean_f('rouge-l')

def generate_code_completions(prompts, max_length=100):
    """Generate one completion per prompt with the notebook-level ``model``.

    Args:
        prompts: Iterable of prompt strings.
        max_length: Maximum number of *new* tokens generated per prompt
            (forwarded to ``generate`` as ``max_new_tokens``).

    Returns:
        list[str]: The decoded continuation for each prompt, in input order.
        Only the newly generated tokens are decoded, so the prompt text is not
        repeated in the result; an entry may be an empty string if nothing but
        special tokens was generated.
    """
    # Imported locally because the notebook's import cell never brings `tqdm`
    # into scope, which made the loop below fail with NameError.
    from tqdm.auto import tqdm

    generated_completions = []
    model.eval()
    with torch.no_grad():
        for prompt in tqdm(prompts):
            encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
            input_ids = encoded["input_ids"]
            prompt_length = input_ids.shape[-1]
            output = model.generate(
                input_ids=input_ids,
                # Pass the mask explicitly: pad and EOS share one token here,
                # so it cannot be inferred reliably from the ids alone.
                attention_mask=encoded["attention_mask"],
                max_new_tokens=max_length,
                num_return_sequences=1,
                # The pad token is configured on the tokenizer only; the
                # visible code never updates model.config, so read the ids
                # from the tokenizer.
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                bos_token_id=tokenizer.bos_token_id,
            )
            # The returned sequence starts with the prompt tokens; slice them
            # off so that only the continuation is decoded.
            completion_ids = output[0][prompt_length:]
            completion = tokenizer.decode(completion_ids, skip_special_tokens=True)
            generated_completions.append(completion)
    return generated_completions
# Number of leading CodeXGLUE examples used for this quick evaluation.
NUM_EVAL_SAMPLES = 100
eval_prompts = test_prompts[:NUM_EVAL_SAMPLES]
eval_ground_truths = test_ground_truths[:NUM_EVAL_SAMPLES]

# Generate from the prompts. Feeding the ground truths to the model (as was
# done before) hands it the answers and leaves `test_prompts` unused.
generated_code_completions = generate_code_completions(eval_prompts)

# Calculate evaluation metrics
exact_matches = sum(
    1 for pred, gt in zip(generated_code_completions, eval_ground_truths) if pred == gt
)
# max(..., 1) keeps an empty evaluation slice from dividing by zero.
accuracy = exact_matches / max(len(eval_ground_truths), 1)
bleu_score = calculate_bleu_score(generated_code_completions, eval_ground_truths)
rouge_score_1, rouge_score_2, rouge_score_l = calculate_rouge_score(
    generated_code_completions, eval_ground_truths
)
print("Accuracy:", accuracy)
print("BLEU Score:", bleu_score)
print("ROUGE Score (Rouge-1):", rouge_score_1)
print("ROUGE Score (Rouge-2):", rouge_score_2)
print("ROUGE Score (Rouge-L):", rouge_score_l)

In [None]:
# Evaluate on a small sample: the first 20 prompts and the first 20 answers of
# the CodeXGLUE test data.
sample_prompts = test_codexglue_texts[0][:20]
sample_answers = test_codexglue_texts[1][:20]
sample_dataset = utils.CodeCompletionDataset(sample_prompts, sample_answers, train=False)

# Left as the final bare expression so the notebook displays the returned metrics.
utils.evaluate(model, tokenizer, sample_dataset)

100%|██████████| 20/20 [00:40<00:00,  2.04s/it]


{'accuracy score': 0.0, 'bleu score': 0.012502301674279289}