In [1]:
from transformers import PhiForCausalLM, AutoTokenizer

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import numpy as np

import utils
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = PhiForCausalLM.from_pretrained(model_name).to(device)

In [3]:
test_codexglue_texts = utils.read_codexglue_test_data()

In [4]:
kotlin_code = utils.extract_kotlin_code()

train_kotlin_prompts, test_kotlin_prompts, train_kotlin_answers, test_kotlin_answers = train_test_split(*kotlin_code, test_size=0.2)

Looking for kotlin files...
Parsing functions in kotlin files...


 59%|█████▊    | 176/300 [00:00<00:00, 263.43it/s]

In [None]:
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


# Define your fine-tuning data
train_texts = utils.CodeCompletionDataset(train_kotlin_prompts, train_kotlin_answers, train=True)
# val_texts = texts["dev"]

# # Tokenize the data
# train_encodings = tokenizer(train_texts, truncation=True, padding=True)
# val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Define custom dataset
train_dataset = utils.TrainDataset(train_texts, tokenizer)
# val_dataset = CustomDataset(val_encodings)

In [None]:
# Define DataLoader
train_loader = DataLoader(torch.arange(len(train_texts)), batch_size=2, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=1)

for param in model.parameters():
    param.requires_grad = False

for param in model.model.layers[-1].parameters():
    param.requires_grad = True

# Fine-tuning parameters
epochs = 1
learning_rate = 5e-5

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Fine-tuning loop
for epoch in range(epochs):
    model.train()
    progress_bar = tqdm(train_loader, total=len(train_loader))
    for batch_num, batch in enumerate(progress_bar):
        tokens = tokenizer([train_texts[i] for i in batch], padding=True, truncation=True)
        del batch

        input_ids = torch.tensor(tokens['input_ids'], device=device)
        attention_mask = torch.tensor(tokens['attention_mask'], device=device)
        del tokens
        
        optimizer.zero_grad()
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids).loss
        del input_ids, attention_mask

        loss.backward()
        optimizer.step()

        progress_bar.set_description(f'Epoch [{epoch+1}/{epochs}], Loss: {loss:.4f}')
        torch.cuda.empty_cache()

# Save the fine-tuned model
model.save_pretrained("fine_tuned_model")

Epoch [1/1], Loss: 0.7932: 100%|██████████| 242/242 [00:15<00:00, 15.40it/s]


In [None]:
import json

test_prompts = []
test_ground_truths = []
with open(f"codexglue_method_generation/dev.jsonl", "r") as file:
    for line in file:
        data = json.loads(line)

        signature = data['signature']

        body = data['body']
        for symbol, replacement in replacements.items():
            body = body.replace(symbol, replacement)

        docstring = data['docstring']


        test_prompts.append("\n    ".join([signature, docstring, ""]))
        test_ground_truths.append(body)

NameError: name 'json' is not defined

In [None]:
test_prompts, test_ground_truths = test_codexglue_texts

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge

# Define function to calculate BLEU score
def calculate_bleu_score(predictions, references):
    return corpus_bleu([[ref.split()] for ref in references], [pred.split() for pred in predictions])

# Define function to calculate ROUGE score
def calculate_rouge_score(predictions, references):
    rouge = Rouge()
    scores = rouge.get_scores(predictions, references, avg=True)
    return scores['rouge-1']['f'], scores['rouge-2']['f'], scores['rouge-l']['f']

def generate_code_completions(prompts, max_length=100):
    generated_completions = []
    model.eval()
    with torch.no_grad():
        for prompt in tqdm(prompts):
            prompt_tensor = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
            output = model.generate(
                prompt_tensor,
                max_new_tokens=max_length,
                num_return_sequences=1,
                pad_token_id=model.config.pad_token_id,
                eos_token_id=model.config.eos_token_id,
                bos_token_id=model.config.bos_token_id,
            )
            completion = tokenizer.decode(output[0], skip_special_tokens=True)
            generated_completions.append(completion)
    return generated_completions

In [None]:
generated_code_completions = generate_code_completions(test_ground_truths[:100])

100%|██████████| 100/100 [03:21<00:00,  2.02s/it]


In [None]:
# Calculate evaluation metrics
accuracy = sum(1 for pred, gt in zip(generated_code_completions, test_ground_truths) if pred == gt) / len(test_ground_truths[:100])
bleu_score = calculate_bleu_score(generated_code_completions, test_ground_truths[:100])
rouge_score_1, rouge_score_2, rouge_score_l = calculate_rouge_score(generated_code_completions, test_ground_truths[:100])

In [None]:
print("Accuracy:", accuracy)
print("BLEU Score:", bleu_score)
print("ROUGE Score (Rouge-1):", rouge_score_1)
print("ROUGE Score (Rouge-2):", rouge_score_2)
print("ROUGE Score (Rouge-L):", rouge_score_l)

Accuracy: 0.43
BLEU Score: 0.4105429907470871
ROUGE Score (Rouge-1): 0.8407046422916611
ROUGE Score (Rouge-2): 0.7490669238842099
ROUGE Score (Rouge-L): 0.8407046422916611


In [None]:
utils.evaluate(model, tokenizer, utils.CodeCompletionDataset(test_codexglue_texts[0][:20], test_codexglue_texts[1][:20], train=False))

100%|██████████| 20/20 [00:40<00:00,  2.04s/it]


{'accuracy score': 0.0, 'bleu score': 0.012502301674279289}