# In [None]:  (Jupyter cell prompt left over from the notebook export)
import json
# Load the fine-tuning configuration that drives the rest of the script.
# The encoding is pinned to UTF-8 (the encoding JSON documents are exchanged
# in) so the file is not decoded with a platform-dependent default.
with open('fine_tuning_parameters.json', 'r', encoding='utf-8') as f:
    fine_tuning_parameters_json = json.load(f)




# =============================================================================
transformer_family = fine_tuning_parameters_json['transformer_family']
tokenizer_family = fine_tuning_parameters_json['tokenizer_family']
optimizer_family = fine_tuning_parameters_json['optimizer_family']
scheduler_family = fine_tuning_parameters_json['scheduler_family']

import importlib

# Registries mapping each accepted configuration value to the locations its
# implementation can be imported from, as (module, attribute) pairs tried in
# order. To support another family, add an entry to the matching registry.
_MODEL_FAMILIES = {
    'AutoModelForCausalLM': [('transformers', 'AutoModelForCausalLM')],
    'AutoModelForSequenceClassification': [('transformers', 'AutoModelForSequenceClassification')],
    'AutoModelForTokenClassification': [('transformers', 'AutoModelForTokenClassification')],
}

_TOKENIZER_FAMILIES = {
    'AutoTokenizer': [('transformers', 'AutoTokenizer')],
    # transformers does not export an `AutoTokenizerFast` class; AutoTokenizer
    # already returns the fast implementation when one is available, so the
    # configuration value is kept and mapped onto AutoTokenizer.
    'AutoTokenizerFast': [('transformers', 'AutoTokenizer')],
}

_OPTIMIZER_FAMILIES = {
    # Prefer the transformers implementation (what this script originally
    # used) and fall back to PyTorch's on releases that no longer ship it.
    'AdamW': [('transformers', 'AdamW'), ('torch.optim', 'AdamW')],
    # Adam and SGD are PyTorch optimizers; transformers does not export them.
    'Adam': [('torch.optim', 'Adam')],
    'SGD': [('torch.optim', 'SGD')],
}

_SCHEDULER_FAMILIES = {
    'get_linear_schedule_with_warmup': [('transformers', 'get_linear_schedule_with_warmup')],
    'get_constant_schedule_with_warmup': [('transformers', 'get_constant_schedule_with_warmup')],
    'get_cosine_schedule_with_warmup': [('transformers', 'get_cosine_schedule_with_warmup')],
}


def _resolve_family(kind, name, registry):
    """Return the object registered under ``name`` in ``registry``.

    Args:
        kind: Configuration key being resolved; used only in error messages.
        name: Value read from the configuration file.
        registry: Mapping of accepted names to ``(module, attribute)`` pairs.

    Raises:
        ValueError: ``name`` is not a supported value for ``kind``.
        ImportError: none of the registered locations could be imported.
    """
    if name not in registry:
        supported = ', '.join(sorted(registry))
        raise ValueError(f'Unsupported {kind} {name!r}; expected one of: {supported}')
    for module_name, attribute in registry[name]:
        try:
            return getattr(importlib.import_module(module_name), attribute)
        except (ImportError, AttributeError):
            # Not available from this location; try the next candidate.
            continue
    raise ImportError(f'Could not import {kind} {name!r} from any known location')


# Fail here, next to the configuration lookup, instead of with a NameError
# much later when one of the *_Family names is first used.
Model_Family = _resolve_family('transformer_family', transformer_family, _MODEL_FAMILIES)
Tokenizer_Family = _resolve_family('tokenizer_family', tokenizer_family, _TOKENIZER_FAMILIES)
Optimizer_Family = _resolve_family('optimizer_family', optimizer_family, _OPTIMIZER_FAMILIES)
Scheduler_Family = _resolve_family('scheduler_family', scheduler_family, _SCHEDULER_FAMILIES)

import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# =============================================================================

def train(dataloader, optimizer, device='cuda', scheduler=None):
    """Run one training epoch of the module-level ``model`` over ``dataloader``.

    Args:
        dataloader: Iterable of batches; each batch is a mapping holding
            ``'input_ids'`` and ``'labels'`` tensors and, optionally,
            ``'attention_mask'``.
        optimizer: Optimizer updating the parameters of ``model``.
        device: Device the batch tensors are moved to before the forward pass.
        scheduler: Learning-rate scheduler stepped once per batch. When
            omitted, the module-level ``scheduler`` is used if one exists.
    """
    if scheduler is None:
        # Same convention as `model`: fall back to the module-level object.
        # The script builds a scheduler but never stepped it, so the schedule
        # never advanced and the learning rate stayed at whatever value the
        # scheduler set when it was constructed.
        scheduler = globals().get('scheduler')

    model.train()
    for batch in tqdm(dataloader, total=len(dataloader), desc='Training...'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        model_kwargs = {'labels': batch['labels'].to(device)}
        if 'attention_mask' in batch:
            # Forward the mask only when the dataset provides one, so the
            # call is unchanged for batches without it.
            model_kwargs['attention_mask'] = batch['attention_mask'].to(device)
        outputs = model(input_ids, **model_kwargs)
        loss = outputs[0]  # the loss is the first output when labels are given
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

def evaluate(dataloader, device='cuda'):
    """Return the mean per-batch loss of the module-level ``model``.

    Args:
        dataloader: Iterable of batches; each batch is a mapping holding
            ``'input_ids'`` and ``'labels'`` tensors and, optionally,
            ``'attention_mask'``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The mean of the batch losses, or NaN when ``dataloader`` yields no
        batches.
    """
    model.eval()
    losses = []
    # Gradients are never needed here, so disable them for the whole loop.
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            model_kwargs = {'labels': batch['labels'].to(device)}
            if 'attention_mask' in batch:
                # Forward the mask only when the dataset provides one, so the
                # call is unchanged for batches without it.
                model_kwargs['attention_mask'] = batch['attention_mask'].to(device)
            outputs = model(input_ids, **model_kwargs)
            losses.append(outputs[0].item())
    if not losses:
        # np.mean([]) also yields NaN but emits a RuntimeWarning; an empty
        # validation split is reported quietly instead.
        return np.float64('nan')
    return np.mean(losses)

def process_dataframe(df, languages, max_seq_len, tokenizer, num_repos):
    """Tokenize every source file of the first ``num_repos`` rows into samples.

    Each cell ``df[language][i]`` is expected to contain the textual form of a
    dict mapping file names to source code.

    Args:
        df: Table with one column per entry of ``languages``.
        languages: Column names to read from ``df``.
        max_seq_len: Length every sequence is truncated / padded to.
        tokenizer: Callable returning ``input_ids`` (and, when it produces
            one, ``attention_mask``) as tensors with a leading batch dimension.
        num_repos: Number of rows to process (indices ``0 .. num_repos - 1``).

    Returns:
        A list of dicts with keys ``'input_ids'``, ``'labels'``,
        ``'language'``, ``'file_name'`` and ``'code'``, plus
        ``'attention_mask'`` when the tokenizer returns one.
    """
    ds = []
    for i in tqdm(range(num_repos), desc='Processing dataframe Repositories'):
        for language in languages:
            # SECURITY: eval() executes whatever expression the CSV cell
            # contains. Only run this on trusted data; if the cells are plain
            # dict literals, ast.literal_eval is the safe replacement.
            code_dict = eval(df[language][i])
            for file_name, code in code_dict.items():
                inputs = tokenizer(code, return_tensors='pt', max_length=max_seq_len, truncation=True, padding='max_length')
                # Drop only the batch dimension; a bare squeeze() would also
                # collapse the sequence dimension when max_seq_len == 1.
                input_ids = inputs.input_ids.squeeze(0)
                labels = input_ids.clone()
                labels[0] = -100
                sample = {'input_ids': input_ids, 'labels': labels,
                          'language': language, 'file_name': file_name, 'code': code}
                attention_mask = inputs.get('attention_mask')
                if attention_mask is not None:
                    attention_mask = attention_mask.squeeze(0)
                    # Padded positions must not contribute to the loss. The
                    # mask is used rather than the pad token id because the
                    # pad token may be the same as the EOS token.
                    labels[attention_mask == 0] = -100
                    sample['attention_mask'] = attention_mask
                ds.append(sample)
    return ds

# =============================================================================

# Setup: read the model / tokenizer sections of the configuration and
# instantiate both from their pretrained checkpoints.
transformer_args = fine_tuning_parameters_json['transformer_args']
tokenizer_args = fine_tuning_parameters_json['tokenizer_args']

#TODO: IF there are more named arguments, add them to the dictionary and pass them to the model and tokenizer

tokenizer_name = tokenizer_args['tokenizer_name']
model_name = transformer_args['model_name']

tokenizer = Tokenizer_Family.from_pretrained(tokenizer_name)
if not tokenizer.pad_token:
    # No padding token defined: reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained weights and move the model to the GPU.
model = Model_Family.from_pretrained(model_name).cuda()

# =============================================================================
# Data: read the dataset settings, tokenize the CSV and split the samples
# into training and validation subsets.
data_path, languages, max_seq_len, repo_count, val_size = (
    fine_tuning_parameters_json[key]
    for key in ('data_path', 'languages', 'max_seq_len', 'repo_count', 'val_size')
)

data = pd.read_csv(data_path)
ds = process_dataframe(
    df=data,
    languages=languages,
    max_seq_len=max_seq_len,
    tokenizer=tokenizer,
    num_repos=repo_count,
)

# `val_size` is read as a fraction and rebound to an absolute sample count.
val_size = int(val_size * len(ds))
train_size = len(ds) - val_size
train_dataset, val_dataset = torch.utils.data.random_split(
    ds,
    [train_size, val_size],
)

# =============================================================================
# Hyperparameters read from the configuration file
batch_size = fine_tuning_parameters_json['batch_size']
epochs = fine_tuning_parameters_json['epochs']
lr = fine_tuning_parameters_json['optimizer_args']['lr']
warmup = fine_tuning_parameters_json['scheduler_args']['num_warmup_steps']

# Dataloaders: only the training data is shuffled
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
)

# Optimizer
#TODO: Use the optimizer_args dictionary to pass the arguments to the optimizer
optimizer = Optimizer_Family(
    model.parameters(),
    lr=lr,
)

# Scheduler: planned over one step per training batch per epoch
#TODO: Use the scheduler_args dictionary to pass the arguments to the scheduler
scheduler = Scheduler_Family(
    optimizer,
    num_warmup_steps=warmup,
    num_training_steps=epochs * len(train_dataloader),
)

# =============================================================================
# Training: one pass over the training data per epoch, followed by a
# validation pass whose mean loss is reported.
for epoch in tqdm(range(epochs), desc='Epochs'):
    train(dataloader=train_dataloader, optimizer=optimizer)
    val_loss = evaluate(dataloader=val_dataloader)
    print(f'Epoch: {epoch}, Val Loss: {val_loss}')
