In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import numpy as np
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import PhiForCausalLM

model_name = "microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = PhiForCausalLM.from_pretrained(model_name).to(device)

In [3]:
import json
from collections import defaultdict

python_texts = defaultdict(list)

replacements = {
    '<EOL>': '\n',
    '<INDENT>': '    ',
    '<DEDENT>': '',
    '<STR_LIT>': '',
    '<NUM_LIT>': '',
}

for split in ["train", "dev", "test"]:
    with open(f"codexglue_method_generation/{split}.jsonl", "r") as file:
        # Read lines and process each one
        for line in file:
            data = json.loads(line)

            signature = data['signature']

            body = data['body']
            for symbol, replacement in replacements.items():
                body = body.replace(symbol, replacement)

            docstring = data['docstring']

            python_texts[split].append("\n    ".join([signature, docstring, body]) + tokenizer.eos_token)

In [4]:
import os
import fnmatch

kotlin_texts = []
for root, dirs, files in os.walk("./kotlin/"):
    for file in files:
        if fnmatch.fnmatch(file, '*.kt'):
            with open(os.path.join(root, file)) as f:
                kotlin_texts.append("".join(f.readlines()))

kotlin_texts = [*filter(None, kotlin_texts)]

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
train_kotlin_texts, test_kotlin_texts = train_test_split(kotlin_texts, test_size=20000)

In [7]:
len(train_kotlin_texts)

34501

In [8]:
len(test_kotlin_texts)

20000

In [9]:
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   5422 MiB |   5422 MiB |   5422 MiB |      0 B   |
|       from large pool |   5408 MiB |   5408 MiB |   5408 MiB |      0 B   |
|       from small pool |     14 MiB |     14 MiB |     14 MiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |   5422 MiB |   5422 MiB |   5422 MiB |      0 B   |
|       from large pool |   5408 MiB |   5408 MiB |   5408 MiB |      0 B   |
|       from small pool |     14 MiB |     14 MiB |     14 MiB |      0 B   |
|---------------------------------------------------------------

In [10]:
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

class CustomDataset(Dataset):
    def __init__(self, texts, tokenizer):
        self.texts = texts
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(text, padding=True)
        res =  {
            'input_ids': torch.tensor(encoding['input_ids']),
            'attention_mask': torch.tensor(encoding['attention_mask'])
        }

        del text, encoding

        return res


# Define your fine-tuning data
train_texts = train_kotlin_texts
# val_texts = texts["dev"]

# # Tokenize the data
# train_encodings = tokenizer(train_texts, truncation=True, padding=True)
# val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Define custom dataset
train_dataset = CustomDataset(train_texts, tokenizer)
# val_dataset = CustomDataset(val_encodings)

In [11]:
# Define DataLoader
train_loader = DataLoader(torch.arange(len(train_texts)), batch_size=2, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=1)

for param in model.parameters():
    param.requires_grad = False

for param in model.model.layers[-1].parameters():
    param.requires_grad = True

# Fine-tuning parameters
epochs = 1
learning_rate = 5e-5

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Fine-tuning loop
for epoch in range(epochs):
    model.train()
    progress_bar = tqdm(train_loader, total=len(train_loader))
    for batch_num, batch in enumerate(progress_bar):
        tokens = tokenizer([train_texts[i] for i in batch], padding=True, truncation=True)
        del batch

        input_ids = torch.tensor(tokens['input_ids'], device=device)
        attention_mask = torch.tensor(tokens['attention_mask'], device=device)
        del tokens
        
        optimizer.zero_grad()
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids).loss
        del input_ids, attention_mask

        loss.backward()
        optimizer.step()

        progress_bar.set_description(f'Epoch [{epoch+1}/{epochs}], Loss: {loss:.4f}')
        torch.cuda.empty_cache()

# Save the fine-tuned model
model.save_pretrained("fine_tuned_model")

Epoch [1/1], Loss: 1.1302: 100%|██████████| 17251/17251 [1:44:22<00:00,  2.75it/s]  


In [12]:
test_prompts = []
test_ground_truths = []
with open(f"codexglue_method_generation/dev.jsonl", "r") as file:
    for line in file:
        data = json.loads(line)

        signature = data['signature']

        body = data['body']
        for symbol, replacement in replacements.items():
            body = body.replace(symbol, replacement)

        docstring = data['docstring']


        test_prompts.append("\n    ".join([signature, docstring, ""]))
        test_ground_truths.append(body)

In [13]:
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge

# Define function to calculate BLEU score
def calculate_bleu_score(predictions, references):
    return corpus_bleu([[ref.split()] for ref in references], [pred.split() for pred in predictions])

# Define function to calculate ROUGE score
def calculate_rouge_score(predictions, references):
    rouge = Rouge()
    scores = rouge.get_scores(predictions, references, avg=True)
    return scores['rouge-1']['f'], scores['rouge-2']['f'], scores['rouge-l']['f']

def generate_code_completions(prompts, max_length=100):
    generated_completions = []
    model.eval()
    with torch.no_grad():
        for prompt in tqdm(prompts):
            prompt_tensor = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
            output = model.generate(
                prompt_tensor,
                max_new_tokens=max_length,
                num_return_sequences=1,
                pad_token_id=model.config.pad_token_id,
                eos_token_id=model.config.eos_token_id,
                bos_token_id=model.config.bos_token_id,
            )
            completion = tokenizer.decode(output[0], skip_special_tokens=True)
            generated_completions.append(completion)
    return generated_completions

In [14]:
generated_code_completions = generate_code_completions(test_ground_truths[:100])

100%|██████████| 100/100 [03:32<00:00,  2.13s/it]


In [15]:
# Calculate evaluation metrics
accuracy = sum(1 for pred, gt in zip(generated_code_completions, test_ground_truths) if pred == gt) / len(test_ground_truths[:100])
bleu_score = calculate_bleu_score(generated_code_completions, test_ground_truths[:100])
rouge_score_1, rouge_score_2, rouge_score_l = calculate_rouge_score(generated_code_completions, test_ground_truths[:100])

In [16]:
print("Accuracy:", accuracy)
print("BLEU Score:", bleu_score)
print("ROUGE Score (Rouge-1):", rouge_score_1)
print("ROUGE Score (Rouge-2):", rouge_score_2)
print("ROUGE Score (Rouge-L):", rouge_score_l)

Accuracy: 0.18
BLEU Score: 0.4549734399414975
ROUGE Score (Rouge-1): 0.7116888585890401
ROUGE Score (Rouge-2): 0.6705616467824721
ROUGE Score (Rouge-L): 0.7116888585890401
