# Imports

In [3]:
!pip install -r /kaggle/input/requirements-txt/requirements.txt
!nvidia-smi

Collecting tree-sitter (from -r /kaggle/input/requirements-txt/requirements.txt (line 10))
  Downloading tree_sitter-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.8 kB)
Collecting tree-sitter-python (from -r /kaggle/input/requirements-txt/requirements.txt (line 11))
  Downloading tree_sitter_python-0.23.6-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting tree-sitter-java (from -r /kaggle/input/requirements-txt/requirements.txt (line 12))
  Downloading tree_sitter_java-0.23.5-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Collecting evaluate (from -r /kaggle/input/requirements-txt/requirements.txt (line 22))
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading tree_sitter-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (574 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, Subset

from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, TaskType

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_curve, average_precision_score

# WandB
import wandb

# AST
from tree_sitter import Language, Parser
import tree_sitter_python
import tree_sitter_java

## AST Graphing
import graphviz

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Datasets
from pathlib import Path
from datasets import load_dataset, load_from_disk, DatasetDict, concatenate_datasets



# Data Loading

## Python Data

In [5]:
# Python loading
python_dataset = DatasetDict({
    'train': load_dataset('code_search_net', 'python', split='train[:15000]', trust_remote_code=True),
    'validation': load_dataset('code_search_net', 'python', split='validation[:2000]', trust_remote_code=True),
    'test': load_dataset('code_search_net', 'python', split='test[:2000]', trust_remote_code=True)
})

python_dataset

README.md:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

code_search_net.py:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

python.zip:   0%|          | 0.00/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 2000
    })
})

## Java Data

In [6]:
# Java loading
java_dataset = DatasetDict({
    'train': load_dataset('code_search_net', 'java', split='train[:15000]', trust_remote_code=True),
    'validation': load_dataset('code_search_net', 'java', split='validation[:2000]', trust_remote_code=True),
    'test': load_dataset('code_search_net', 'java', split='test[:2000]', trust_remote_code=True)
})

java_dataset

java.zip:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/454451 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/26909 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/15328 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 2000
    })
})

## Debug and Test modes

In [7]:
# set to False for full training config
debug = False
# set tp True for enabling testing blocks
test_run = False

# Concatenation

In [8]:
combined_dataset = DatasetDict({
    'train': concatenate_datasets([python_dataset['train'], java_dataset['train']]),
    'validation': concatenate_datasets([python_dataset['validation'], java_dataset['validation']]),
    'test': concatenate_datasets([python_dataset['test'], java_dataset['test']])
})

if debug:
    combined_dataset["train"] = combined_dataset["train"].select(range(200))
    combined_dataset["validation"] = combined_dataset["validation"].select(range(50))
    combined_dataset["test"] = combined_dataset["test"].select(range(50))

combined_dataset['train'] = combined_dataset['train'].shuffle(seed=42)
combined_dataset['validation'] = combined_dataset['validation'].shuffle(seed=42)
combined_dataset['test'] = combined_dataset['test'].shuffle(seed=42)

combined_dataset['train']['func_code_string'][1]

'def add_vip(\n            self,\n            id,\n            real_name_sufixo,\n            id_vlan,\n            descricao_vlan,\n            id_vlan_real,\n            descricao_vlan_real,\n            balanceadores,\n            id_healthcheck_expect,\n            finalidade,\n            cliente,\n            ambiente,\n            cache,\n            metodo_bal,\n            persistencia,\n            healthcheck_type,\n            healthcheck,\n            timeout,\n            host,\n            maxcon,\n            dsr,\n            bal_ativo,\n            transbordos,\n            portas,\n            real_maps,\n            id_requisicao_vip,\n            areanegocio=\'Orquestra\',\n            nome_servico=\'Orquestra\',\n            l7_filter=None,\n            reals_prioritys=None,\n            reals_weights=None):\n        """Adiciona um VIP na lista de VIPs para operação de inserir/alterar um grupo virtual.\n\n        Os parâmetros abaixo somente são necessários para a

# AST Integration, Masking & Preprocessing Functions

In [9]:
def mask_func_name(code_str: str, func_name: str, lang: str) -> str:
    lang = lang.lower()

    if lang == 'python':
        pattern = rf"(def\s+)({re.escape(func_name)})(\s*\()"
        return re.sub(pattern, r"\1<extra_id_0>\3", code_str, count=1)
    
    elif lang == 'java':
        pattern = rf"(?<!\w){re.escape(func_name)}(?=\s*\()"
        return re.sub(pattern, "<extra_id_0>", code_str, count=1)

    else:
        return code_str

def test_real_python_samples(dataset, num_samples=3):
    print("=== REAL PYTHON SAMPLES ===")
    
    for i in range(num_samples):
        full_func_name = dataset['train'][i]['func_name'] 
        method_name = full_func_name.split('.')[-1] 
        code = dataset['train'][i]['func_code_string']
        
        print(f"--- Sample #{i} ---")
        print(f"Original Function Name: {full_func_name}")
        print("\nOriginal Code:\n", code)
        print("\nMasked Code:\n", mask_func_name(code, method_name, lang="python"))
        print("=" * 100 + "\n")

def test_real_java_samples(dataset, num_samples=3):
    print("=== REAL JAVA SAMPLES ===")
    
    for i in range(num_samples):
        full_func_name = dataset['train'][i]['func_name']
        method_name = full_func_name.split('.')[-1]
        code = dataset['train'][i]['func_code_string']
        
        print(f"--- Sample #{i} ---")
        print(f"Original Function Name: {full_func_name}")
        print("\nOriginal Code:\n", code)
        print("\nMasked Code:\n", mask_func_name(code, method_name, lang="java"))
        print("=" * 100 + "\n")


def inspect_samples(dataset, lang: str, num_samples: int = 5):
    print(f"\n=== {lang.upper()} SAMPLE VERIFICATION ===\n")
    for i in range(num_samples):
        sample = dataset['train'][i]
        code = sample['func_code_string']
        full_name = sample['func_name']
        method_name = full_name.split('.')[-1]

        masked_code = mask_func_name(code, method_name, lang)
        tokens = tokenizer.tokenize(masked_code)

        print(f"--- Sample #{i} ---")
        print(f"Original Function Name: {full_name}")
        print("\nOriginal Code:\n", code)
        print("\nMasked Code:\n", masked_code)
        print("\nTokenized Input:\n", tokens)
        print("=" * 100)

def preprocess(examples):
    combined_inputs = []
    combined_labels = []
    
    # Iterate over each example
    for code, target, lang in zip(examples['func_code_string'], examples['func_name'], examples['language']):
         # Extract method name (in case it's fully qualified like Class.method)
        method_name = target.split('.')[-1]
        # Mask function name in definition
        masked_code = mask_func_name(code, method_name, lang)

        combined_inputs.append(masked_code)
        combined_labels.append(method_name) # Extract the method name from the full path
    
    # Tokenize the combined input and targets
    model_inputs = tokenizer(combined_inputs, max_length=1024, truncation=True, padding='max_length')
    tokenized_labels = tokenizer(combined_labels, max_length=50, truncation=True, padding='max_length')
    
    model_inputs['labels'] = tokenized_labels['input_ids']
    return model_inputs


if debug:
    # Run both inspections
    inspect_samples(python_dataset, lang="python", num_samples=5)
    inspect_samples(java_dataset, lang="java", num_samples=5)
    
    # Run the test
    test_real_java_samples(java_dataset, num_samples=5)
    test_real_python_samples(python_dataset, num_samples=5)
    
    print("<extra_id_0>" in tokenizer.get_vocab())
    print("<mask>" in tokenizer.get_vocab())
    print("Token ID for <extra_id_0>:", tokenizer.convert_tokens_to_ids("<extra_id_0>"))
    print("All special tokens:", tokenizer.special_tokens_map)
    print("Additional special tokens:", tokenizer.additional_special_tokens)



# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

# Tokenize

In [10]:
tokenized_dataset = combined_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [11]:
if debug:
    num_samples_to_show = 5
    
    for idx in range(num_samples_to_show):
        print(f"\n===== Sample {idx + 1} =====")
    
        # Print decoded input (with masking, i.e., function body with <extra_id_0>)
        input_ids = tokenized_dataset["train"][idx]["input_ids"]
        decoded_input = tokenizer.decode(input_ids, skip_special_tokens=False)
        print("▶️ Masked Input Code:\n", decoded_input)
    
        # Print decoded label (method name target)
        label_ids = tokenized_dataset["train"][idx]["labels"]
        decoded_label = tokenizer.decode(
            [id for id in label_ids if id != tokenizer.pad_token_id],
            skip_special_tokens=True
        )
        print("🎯 Target Method Name:", decoded_label)
    
        # Optional: show original method name from combined dataset (if available)
        if "func_name" in combined_dataset["train"].features:
            original_name = combined_dataset["train"][idx]["func_name"]
            print("🧾 Original Method Name:", original_name)

In [12]:
if debug:
    # Show sample
    
    print(tokenized_dataset["train"][0])
    print(tokenizer.decode(tokenized_dataset["train"][0]["input_ids"]))
    
    print(tokenized_dataset["train"][0])
    print(tokenizer.decode(tokenized_dataset["train"][0]["input_ids"]))
    
    
    sample_index = 0 
    
    # From original dataset (before masking)
    original_func_name = combined_dataset["train"][sample_index]["func_name"]
    print("Full Function Name:", original_func_name)
    
    # From label inside tokenized dataset
    label_ids = tokenized_dataset["train"][sample_index]["labels"]
    label_text = tokenizer.decode([id for id in label_ids if id != tokenizer.pad_token_id], skip_special_tokens=True)
    print("Target Label Text (after masking & preprocessing):", label_text)
    
    label_ids = tokenized_dataset['train'][0]['labels']
    label_text = tokenizer.decode([id for id in label_ids if id != tokenizer.pad_token_id], skip_special_tokens=True)
    print("Decoded Label (method name):", label_text)

# W&B

#### Make all changes to hyper-params here, pls do not change elsewhere

In [13]:
if debug:
    config = {
        "learning_rate": 5e-5,
        "batch_size": 8,
        "num_train_epochs": 1,
        "eval_steps": 20,
        "save_steps": 20,
        "save_total_limit": 1,
        "logging_steps": 10,
        "fp16": False,  # for smoke-test
        "predict_with_generate": True,
        "load_best_model_at_end": True,
        "evaluation_strategy": "steps",
        "logging_strategy": "steps",
        "save_strategy": "steps",
        "output_dir": "./debug_results",
        "report_to": "wandb",
        "run_name": "mngast120k_smoke_test",
        "model_name": "Salesforce/codet5-base"
    }
else:
    # Define hyperparameters in a dictionary
    config = {
        "learning_rate": 6e-5,
        "batch_size": 8,
        "num_train_epochs": 5,
        "eval_steps": 2500,
        "save_steps": 2500,
        "save_total_limit": 3,
        "logging_steps": 100,
        #"weight_decay": 0.01,
        "fp16": True,
        "predict_with_generate": True,
        "load_best_model_at_end": True,
        "evaluation_strategy": "steps",
        "logging_strategy": "steps",
        "save_strategy": "steps",
        "output_dir": "./training_results",
        "report_to": "wandb",
        "run_name": "mngast120k_training",
        "model_name": "Salesforce/codet5-base"
    }


In [16]:
# Log hyperparameters to W&B
wandb.login(key="a57462a99faeaf50d607f689b3eb0f9271926f41")
wandb.init(project="Method Name Prediction", name="mng_training")




In [17]:
wandb.config.update(config)

# Model Loading

In [18]:
model = T5ForConditionalGeneration.from_pretrained(config["model_name"])

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

## LoRA - Fine tuning

In [19]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=[
        # Encoder attention part
        "q", "k", "v", "o",
        # Decoder attention part
        "decoder.q", "decoder.k", "decoder.v", "decoder.o",
        # Feed-forward network layers
        "wi", "wo",
    ],
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

model = get_peft_model(model, lora_config)

Use the code block below only if you have an input directory to load a previous checkpoint in the input tab (pls follow the file path metnioned in the checkpoint_dir variable and change the file name to your checkpoint).

In [None]:
# checkpoint_dir = "/kaggle/input/checkpoint-path/training_results/checkpoint-25000"
# model.load_adapter(checkpoint_dir, adapter_name="default")

## Training Params

In [20]:
cnfg = wandb.config

# All fields called from config dictionary
training_args = Seq2SeqTrainingArguments(
    learning_rate=cnfg.learning_rate,
    per_device_train_batch_size=cnfg.batch_size,
    per_device_eval_batch_size=cnfg.batch_size,
    num_train_epochs=cnfg.num_train_epochs,
    eval_steps=cnfg.eval_steps,
    save_steps=cnfg.save_steps,
    save_total_limit=cnfg.save_total_limit,
    logging_steps=cnfg.logging_steps,
    # weight_decay=cnfg.weight_decay,
    fp16=cnfg.fp16,
    predict_with_generate=cnfg.predict_with_generate,
    load_best_model_at_end=cnfg.load_best_model_at_end,
    eval_strategy=cnfg.evaluation_strategy,
    logging_strategy=cnfg.logging_strategy,
    save_strategy=cnfg.save_strategy,
    output_dir=cnfg.output_dir,
    report_to=cnfg.report_to,
    run_name=cnfg.run_name,
)

# Data Loader

In [21]:
collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    processing_class=tokenizer,
    data_collator=collator,
)

# Train

In [None]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
2500,0.1036,0.111066
5000,0.1014,0.111529
7500,0.0993,0.111971
10000,0.0903,0.113855
12500,0.0881,0.114688
15000,0.0894,0.115389
17500,0.0858,0.116489


### Test-output

In [None]:
# correct = 0
# total = 0
# model.eval()
# device = model.device  # Get model's device (CPU/GPU)

# # Select first 100 examples from test set
# test_subset = tokenized_dataset["test"].select(range(50))

# with torch.no_grad():  # Disable gradient calculation for evaluation
#     for example in test_subset:
#         # Move input to same device as model
#         input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
        
#         # Generate prediction with more appropriate max_length
#         generated_ids = model.generate(
#             input_ids=input_ids,
#             max_length=50,  # Function names are rarely >20 tokens
#             num_beams=3,    # Better results with beam search
#             early_stopping=True
#         )
        
#         # Decode both prediction and label
#         predicted_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()
#         expected_text = tokenizer.decode(example["labels"], skip_special_tokens=True).strip()

#         # Update counts
#         if predicted_text == expected_text:
#             correct += 1
#         total += 1

#         print(f"Expected: {expected_text} | Predicted: {predicted_text}")

# accuracy = correct / total
# print(f"\nExact Match Accuracy on 1000 samples: {accuracy:.2%}")

In [24]:
if test_run:
    test_input = '''def <extra_id_0>(x, y):
        return (x ** 2 + y ** 2) ** 0.5
    '''
    test_input2 = '''public static int <extra_id_0>(int n) {
        if (n == 0) {
            return 1;
        }
        return n * <extra_id_0>(n - 1);
    }
    '''
    test_input3 = '''def <extra_id_0>(data, window_size=3):
        if len(data) < window_size:
            raise ValueError("Data length must be at least equal to the window size.")
        
        moving_averages = []
        for i in range(len(data) - window_size + 1):
            window = data[i : i + window_size]
            window_average = sum(window) / window_size
            moving_averages.append(window_average)
        
        return moving_averages
    '''
    test_input4 = '''public static int <extra_id_0>(int[] numbers) {
        int max = Integer.MIN_VALUE;
        for (int num : numbers) {
            if (num > max) {
                max = num;
            }
        }
        return max;
    }
    '''
    # Tokenize (on GPU)
    inputs = tokenizer(test_input2, return_tensors="pt").to(model.device)
    
    # Generate
    generated_ids = model.generate(**inputs, max_length=16)
    
    # Decode
    output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print("Predicted method name:", output_text)

Predicted method name: get_key_count(n - 1);
           


In [None]:
if debug:
    !zip -r /kaggle/working/training_checkpoints.zip /kaggle/working/debug_results
else:
    !zip -r /kaggle/working/training_checkpoints.zip /kaggle/working/training_results