### 1. Define functions

In [None]:
import os
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, pipeline
import torch
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import re
import pickle

Function to fetch the contents of all files with a given extension. \
Multi-line comments (`/* ... */`) are removed because they are mostly unnecessary.

In [None]:
def extract_kotlin_files(source_dir, extension='.kt'):
    """Collect the text of every file under ``source_dir`` whose name ends with ``extension``.

    Block comments (``/* ... */``, possibly spanning several lines) are removed
    from each file and the remaining text is stripped of leading/trailing
    whitespace. Line comments (``// ...``) are left untouched. The comment
    regex is not string-aware, so a ``/* ... */`` sequence inside a string
    literal is removed as well.

    Directories and files are visited in sorted order so the returned list is
    identical on every run and machine; ``os.walk`` on its own yields entries
    in arbitrary order, which would make a later seeded split irreproducible.

    Args:
        source_dir: Root directory to search recursively.
        extension: File-name suffix to match (default ``'.kt'``).

    Returns:
        list[str]: One cleaned source string per matching file.

    Raises:
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    code = []
    comment_pattern = re.compile(r'/\*.*?\*/', re.DOTALL)

    for root, dirs, files in os.walk(source_dir):
        # Sorting ``dirs`` in place controls the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(extension):
                continue
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                file_content = f.read()
            code.append(comment_pattern.sub('', file_content).strip())
    return code

In [None]:
def save_data(data, filename):
    """Serialize ``data`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten.

    Args:
        data: Any picklable object.
        filename: Destination path of the pickle file.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(data)

Split the data into train, validation, and test sets (80% / 10% / 10%).

In [None]:
def split_data(dataset, holdout_fraction=0.2, random_state=42):
    """Split ``dataset`` into train, test and validation subsets.

    ``holdout_fraction`` of the data is first set aside; that hold-out part is
    then divided in half into the validation and test subsets. With the
    defaults this gives an 80% / 10% / 10% split.

    Args:
        dataset: Sequence of samples accepted by ``train_test_split``.
        holdout_fraction: Share of ``dataset`` reserved for validation and
            test together (default ``0.2``).
        random_state: Seed passed to both ``train_test_split`` calls so the
            split is repeatable (default ``42``).

    Returns:
        tuple: ``(train_files, test_files, valid_files)``. Note that the test
        subset comes BEFORE the validation subset.
    """
    train_files, holdout_files = train_test_split(
        dataset, test_size=holdout_fraction, random_state=random_state
    )
    valid_files, test_files = train_test_split(
        holdout_files, test_size=0.5, random_state=random_state
    )
    return (train_files, test_files, valid_files)

Model training: dataset wrapper, loss plotting, and the fine-tuning routine.

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized encodings for causal-LM training.

    ``encodings`` is a mapping from field name to an indexable tensor and must
    contain ``'input_ids'``. Each item is a dict with a detached copy of every
    field for that index plus a ``'labels'`` entry copied from ``'input_ids'``.

    When an ``'attention_mask'`` field is present, label positions whose mask
    value is 0 (padding) are set to ``IGNORE_INDEX`` so that padding tokens do
    not contribute to the language-modelling loss. Without this, inputs padded
    to a fixed length would spend much of the loss on predicting pad tokens.
    """

    # Target value skipped by PyTorch's cross-entropy loss (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, encodings):
        """Store the tokenizer output; nothing is copied until an item is requested."""
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors, including ``'labels'``."""
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        labels = item['input_ids'].clone()
        attention_mask = item.get('attention_mask')
        if attention_mask is not None:
            # Only the copy is modified; input_ids and the stored encodings stay intact.
            labels[attention_mask == 0] = self.IGNORE_INDEX
        item['labels'] = labels
        return item

    def __len__(self):
        """Number of samples, taken from the length of ``'input_ids'``."""
        return len(self.encodings['input_ids'])

In [None]:
def plot_loss(results):
    """Draw the training and validation loss curves on one figure and show it.

    Each series is plotted against its list index, so the x-axis (labelled
    'Epochs') is really the position of the value in the list.

    Args:
        results: Mapping with the keys ``'train_loss'`` and
            ``'validation_loss'``, each holding a sequence of loss values.
    """
    _, ax = plt.subplots(figsize=(10, 5))
    curves = (
        ('train_loss', 'Train Loss'),
        ('validation_loss', 'Validation Loss'),
    )
    for key, legend_name in curves:
        ax.plot(results[key], label=legend_name)
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.set_title('Training and Validation Loss Over Time')
    ax.legend()
    plt.show()

In [None]:
def fine_tune_model(train_dataset, valid_dataset, model_name="microsoft/phi-1_5", max_length=512):
    """Fine-tune a pretrained causal language model on raw text samples.

    The samples are tokenized (truncated and padded to ``max_length``), wrapped
    in ``MyDataset`` and passed to a Hugging Face ``Trainer``. Evaluation,
    checkpointing and logging are all configured at a 10-step interval, and
    the checkpoint with the lowest ``eval_loss`` is reloaded at the end. After
    training, the logged train/validation losses are plotted with ``plot_loss``.

    Args:
        train_dataset: List of text strings used for training.
        valid_dataset: List of text strings used for evaluation.
        model_name: Name or path passed to ``from_pretrained`` for both the
            model and the tokenizer.
        max_length: Token length every sample is truncated/padded to
            (default ``512``).

    Returns:
        tuple: ``(model, tokenizer)`` after training.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Model and tokenizer loaded.")

    # Fixed-length padding needs a pad token; fall back to EOS when none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Both splits must be tokenized identically, so the options live in one place.
    tokenize_kwargs = dict(
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    train_encodings = tokenizer(train_dataset, **tokenize_kwargs)
    valid_encodings = tokenizer(valid_dataset, **tokenize_kwargs)

    # Separate names keep the raw-text parameters distinct from the tensor datasets.
    train_split = MyDataset(train_encodings)
    valid_split = MyDataset(valid_encodings)
    print("Datasets prepared for training.")

    training_args = TrainingArguments(
        output_dir='./results',
        # NOTE(review): recent transformers releases rename this argument to
        # `eval_strategy` — confirm against the installed version.
        evaluation_strategy='steps',
        eval_steps=10,
        save_strategy='steps',
        save_steps=10,
        num_train_epochs=5,
        per_device_train_batch_size=16,
        gradient_accumulation_steps=4,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        # fp16 mixed precision needs a GPU; enable it only when CUDA is
        # available so the function also runs on CPU-only machines.
        fp16=torch.cuda.is_available()
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=valid_split
    )

    trainer.train()

    # log_history mixes training and evaluation records; split them by key.
    train_loss = [log['loss'] for log in trainer.state.log_history if 'loss' in log]
    validation_loss = [log['eval_loss'] for log in trainer.state.log_history if 'eval_loss' in log]
    results = {
        'train_loss': train_loss,
        'validation_loss': validation_loss
    }
    plot_loss(results)

    return model, tokenizer

In [None]:
def save_model_and_tokenizer(model, tokenizer, model_path="model", tokenizer_path="tokenizer"):
    """Write the model and then the tokenizer via their ``save_pretrained`` methods.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        model_path: Destination for the model (default ``"model"``).
        tokenizer_path: Destination for the tokenizer (default ``"tokenizer"``).
    """
    targets = (
        (model, model_path),
        (tokenizer, tokenizer_path),
    )
    for artifact, destination in targets:
        artifact.save_pretrained(destination)

### 2. Run code

Extract the Kotlin files, split them into train/test/validation sets, and save the test set.

In [None]:
# Placeholder root of the project to scan; point this at a real checkout before running.
SOURCE_PROJECT_DIR = "/path_to_open_source_project"
# Pickle file that receives the held-out test split.
TEST_SPLIT_PICKLE = 'kotlin_files.pkl'

kotlin_files = extract_kotlin_files(SOURCE_PROJECT_DIR)
print(len(kotlin_files))

# split_data returns the test split before the validation split.
train_files, test_files, valid_files = split_data(kotlin_files)

# Only the test split is written to disk.
save_data(test_files, TEST_SPLIT_PICKLE)

Train model

In [None]:
# Fine-tune with the default model_name, training on the train split and
# evaluating on the validation split.
model, tokenizer = fine_tune_model(train_files, valid_files)

In [None]:
# Save to the function's default paths ("model" and "tokenizer").
save_model_and_tokenizer(model, tokenizer)