In [None]:
# Mount Google Drive at /content/drive. The dataset CSVs read later and the
# model directory written by training all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers datasets accelerate

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [None]:
import gc
import torch

# Run Python's garbage collector first, then ask PyTorch to release the CUDA
# memory it is holding in its cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:
from datasets import load_dataset

In [None]:
# Optional inspection cell (no need to run): load the code-completion CSV and
# show its splits and column names.
# Both split names map to the same file, so "validation" is not held-out data.
dataset = load_dataset(
    'csv',
    data_files=dict.fromkeys(
        ('train', 'validation'),
        '/content/drive/MyDrive/code_completion.csv',
    ),
)
print(dataset)
print(dataset['train'].column_names)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_set', 'output_set'],
        num_rows: 35269
    })
    validation: Dataset({
        features: ['input_set', 'output_set'],
        num_rows: 35269
    })
})
['input_set', 'output_set']


In [None]:
# Optional inspection cell (no need to run): load the debugging CSV and show
# its splits and column names.
# Both split names map to the same file, so "validation" is not held-out data.
dataset = load_dataset(
    'csv',
    data_files=dict.fromkeys(
        ('train', 'validation'),
        '/content/drive/MyDrive/debugging.csv',
    ),
)
print(dataset)
print(dataset['train'].column_names)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['buggy_code', 'corrected_code'],
        num_rows: 26891
    })
    validation: Dataset({
        features: ['buggy_code', 'corrected_code'],
        num_rows: 26891
    })
})
['buggy_code', 'corrected_code']


In [None]:
# Optional inspection cell (no need to run): load the test-generation CSV and
# show its splits and column names.
# Both split names map to the same file, so "validation" is not held-out data.
dataset = load_dataset(
    'csv',
    data_files=dict.fromkeys(
        ('train', 'validation'),
        '/content/drive/MyDrive/test_generation.csv',
    ),
)
print(dataset)
print(dataset['train'].column_names)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['function', 'test_case'],
        num_rows: 16284
    })
    validation: Dataset({
        features: ['function', 'test_case'],
        num_rows: 16284
    })
})
['function', 'test_case']


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import pandas as pd

# =========================
# CONFIGURATION
# =========================
# Base checkpoint: train_and_save loads the model from this name and the
# tokenizer below is created from it as well.
model_checkpoint = "Salesforce/codegen-350M-multi"  # You can change this
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Reuse the end-of-sequence token for padding; load_and_tokenize pads every
# example to a fixed length and therefore needs a pad token to be defined.
tokenizer.pad_token = tokenizer.eos_token  # For causal LM

# Dataset paths and column names
# One entry per task. Each entry gives the CSV path plus the two columns that
# load_and_tokenize joins into a single training text (input first, then
# output). The keys "path", "input_col" and "output_col" match the parameter
# names of load_and_tokenize.
dataset_configs = {
    # Code-completion data: columns input_set / output_set.
    "completion": {
        "path": "/content/drive/MyDrive/code_completion.csv",
        "input_col": "input_set",
        "output_col": "output_set"
    },
    # Bug-fixing data: columns buggy_code / corrected_code.
    "bugfix": {
        "path": "/content/drive/MyDrive/debugging.csv",
        "input_col": "buggy_code",
        "output_col": "corrected_code"
    },
    # Test-generation data: columns function / test_case.
    "testcase": {
        "path": "/content/drive/MyDrive/test_generation.csv",
        "input_col": "function",
        "output_col": "test_case"
    }
}

# =========================
# LOAD & TOKENIZE FUNCTION
# =========================
def load_and_tokenize(path, input_col, output_col):
    """Load one task CSV and turn it into a tokenized dataset for LM training.

    For every row the input and output cells are joined with a single space
    into one text string, which is then tokenized with the module-level
    ``tokenizer`` (truncated and padded to 256 tokens).

    Rows in which either column is missing are skipped; otherwise
    ``astype(str)`` would turn the missing cell into the literal text "nan"
    and that text would become training data.

    Args:
        path: Path of the CSV file to read.
        input_col: Name of the column placed first in the merged text.
        output_col: Name of the column placed second in the merged text.

    Returns:
        A ``DatasetDict`` with a single "train" split holding the tokenizer
        outputs; the raw "text" column is removed.

    Raises:
        ValueError: If ``input_col`` or ``output_col`` is not a column of the CSV.
    """
    # `datasets` is already a dependency of this notebook; only these two
    # extra names are needed, so they are imported locally.
    from datasets import Dataset, DatasetDict

    # Read CSV
    df = pd.read_csv(path)
    if input_col not in df.columns or output_col not in df.columns:
        raise ValueError(f"❌ Columns {input_col} or {output_col} not found in {path}")

    # Keep only rows where both cells are present (see docstring).
    df = df.dropna(subset=[input_col, output_col])

    # Merge input and output into one "text" field for LM training
    texts = df[input_col].astype(str) + " " + df[output_col].astype(str)

    # Build the dataset directly from memory. This replaces the former round
    # trip through a fixed-name temporary CSV on Google Drive, which was slow
    # and was overwritten by every call.
    dataset = DatasetDict({"train": Dataset.from_dict({"text": texts.tolist()})})

    # Tokenization
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    return tokenized_datasets

# =========================
# TRAINING FUNCTION
# =========================
def train_and_save(tokenized_datasets, save_dir):
    """Fine-tune a fresh copy of ``model_checkpoint`` and save it to ``save_dir``.

    Args:
        tokenized_datasets: Mapping with a "train" split of tokenized
            examples, as returned by ``load_and_tokenize``.
        save_dir: Directory that receives the checkpoints, the logs (under
            ``<save_dir>/logs``) and the final model and tokenizer files.

    Returns:
        None. The results are written to ``save_dir`` and a confirmation
        line is printed.

    NOTE(review): no evaluation strategy is configured below, so the
    ``eval_dataset`` (which is the training split anyway) is presumably never
    evaluated during ``train()`` - confirm.
    NOTE(review): the module-level tokenizer uses its EOS token as pad token;
    the language-modeling collator presumably excludes pad positions from the
    loss, which would exclude genuine EOS tokens too - confirm.
    """
    lm = AutoModelForCausalLM.from_pretrained(model_checkpoint)

    # mlm=False: plain causal language modeling rather than masked-token
    # prediction.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # With 4 examples per device step and 4 accumulation steps, one optimizer
    # update covers 16 examples per device.
    hyperparams = dict(
        output_dir=save_dir,
        overwrite_output_dir=True,
        logging_dir=f"{save_dir}/logs",
        logging_steps=50,
        save_strategy="epoch",
        save_total_limit=2,
        num_train_epochs=1,
        learning_rate=5e-5,
        weight_decay=0.01,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        fp16=True,
    )
    run_args = TrainingArguments(**hyperparams)

    trainer = Trainer(
        model=lm,
        args=run_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["train"],  # same split reused for eval
        tokenizer=tokenizer,
        data_collator=collator,
    )
    trainer.train()

    # Persist model and tokenizer side by side so the directory can be loaded
    # again with from_pretrained(save_dir).
    lm.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"✅ Model saved at {save_dir}")

# =========================
# TRAIN EACH MODEL
# =========================
print("\n===== Training Auto Code Completion Model =====")
# Only the "completion" entry of dataset_configs is trained in this cell.
# Its keys ("path", "input_col", "output_col") are exactly the parameter
# names of load_and_tokenize, so the dict is unpacked straight into the call.
cfg = dataset_configs["completion"]
dataset_completion = load_and_tokenize(**cfg)
train_and_save(
    tokenized_datasets=dataset_completion,
    save_dir="/content/drive/MyDrive/models/code_completion_model",
)



===== Training Auto Code Completion Model =====


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/35269 [00:00<?, ? examples/s]

Some weights of the model checkpoint at Salesforce/codegen-350M-multi were not used when initializing CodeGenForCausalLM: ['transformer.h.0.attn.causal_mask', 'transformer.h.1.attn.causal_mask', 'transformer.h.10.attn.causal_mask', 'transformer.h.11.attn.causal_mask', 'transformer.h.12.attn.causal_mask', 'transformer.h.13.attn.causal_mask', 'transformer.h.14.attn.causal_mask', 'transformer.h.15.attn.causal_mask', 'transformer.h.16.attn.causal_mask', 'transformer.h.17.attn.causal_mask', 'transformer.h.18.attn.causal_mask', 'transformer.h.19.attn.causal_mask', 'transformer.h.2.attn.causal_mask', 'transformer.h.3.attn.causal_mask', 'transformer.h.4.attn.causal_mask', 'transformer.h.5.attn.causal_mask', 'transformer.h.6.attn.causal_mask', 'transformer.h.7.attn.causal_mask', 'transformer.h.8.attn.causal_mask', 'transformer.h.9.attn.causal_mask']
- This IS expected if you are initializing CodeGenForCausalLM from the checkpoint of a model trained on another task or with another architecture (

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgegeee987[0m ([33mgegeee987-bangalore-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.4182
100,0.3369
150,0.3237
200,0.3152
250,0.3062
300,0.283
350,0.294
400,0.287
450,0.2693
500,0.2749


Step,Training Loss
50,0.4182
100,0.3369
150,0.3237
200,0.3152
250,0.3062
300,0.283
350,0.294
400,0.287
450,0.2693
500,0.2749


✅ Model saved at /content/drive/MyDrive/models/code_completion_model


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the model and tokenizer
# Directory that the training cell passed to train_and_save for the
# code-completion model. Note that `tokenizer` is rebound here, replacing the
# module-level tokenizer created from the base checkpoint.
model_name = "/content/drive/MyDrive/models/code_completion_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Prompt for incomplete Java function
# The string stops right after the comment line inside main(), followed by a
# newline and indentation, so generation continues from the method body.
java_prompt = """public class MatrixPrint {
    public static void main(String[] args) {
        // Define a 2x2 matrix
        """

# Function to trim after full function ends
def trim_after_function(decoded: str) -> str:
    """Cut generated code off once the first outermost brace block has closed.

    The text is scanned line by line while tracking ``{`` / ``}`` nesting.
    Scanning stops at the first line on which a closing brace brings the
    nesting depth back to zero after at least one block was opened; that line
    is kept and everything after it is dropped. If that never happens, the
    whole text is returned.

    Braces inside double-quoted strings, single-quoted literals, ``//``
    comments and ``/* ... */`` comments that open and close on the same line
    are not counted, so code such as ``print("}")`` or ``char c = '{';`` no
    longer cuts the output too early or prevents it from being cut at all.
    Block comments spanning several lines are not recognized.

    An unmatched ``}`` never pushes the depth below zero; previously a stray
    closing brace shifted the count so that the real end was never detected.

    Args:
        decoded: Generated source text in a brace-delimited language.

    Returns:
        The kept lines joined with newlines, with leading and trailing
        whitespace stripped.
    """
    import re

    # Spans whose braces must be ignored. The alternatives are tried at each
    # position from left to right, so "//" inside a string is not taken for a
    # comment and a quote inside a comment does not start a string.
    non_code = re.compile(
        r'"(?:\\.|[^"\\])*"'    # double-quoted string literal
        r"|'(?:\\.|[^'\\])*'"   # single-quoted (character) literal
        r"|//.*"                # line comment
        r"|/\*.*?\*/"           # block comment closed on the same line
    )

    depth = 0
    opened = False
    kept = []

    for line in decoded.splitlines():
        kept.append(line)

        code = non_code.sub("", line)
        opens = code.count("{")
        closes = code.count("}")

        if opens:
            opened = True

        # Clamp at zero so unmatched closing braces cannot skew the count.
        depth = max(depth + opens - closes, 0)

        if closes and opened and depth == 0:
            break  # outermost block has ended

    return "\n".join(kept).strip()

# Tokenize and generate
# Encode the prompt as PyTorch tensors and move them to the model's device.
inputs = tokenizer(java_prompt, return_tensors="pt").to(model.device)
# Sampling is enabled and no seed is set in this cell, so repeated runs can
# produce different completions.
# NOTE(review): max_length presumably counts the prompt tokens as well as the
# newly generated ones - confirm against the generate() documentation.
outputs = model.generate(
    **inputs,
    max_length=350,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id
)

# Decode and clean
# Only the first returned sequence is decoded; it is then cut off by
# trim_after_function once the outermost brace block has closed.
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
decoded = trim_after_function(decoded)

print(decoded)


public class MatrixPrint {
    public static void main(String[] args) {
        // Define a 2x2 matrix        
        int[][] matrix = new int[2][2];

        // Initialize column by row
        matrix[0][0] = 1;
        matrix[0][1] = 1;
        matrix[1][0] = 2;
        matrix[1][1] = 2;
        
        // Print the matrix
        for(int i = 0; i < 2; i++) {
            for(int j = 0; j < 2; j++) {
                System.out.println(String.format("%4d %4d", matrix[i][j], matrix[i][j + 1]));
            }
        }
    }
}
