In [1]:

# !pip install -q -U transformers accelerate bitsandbytes peft datasets safetensors sentencepiece evaluate huggingface_hub
!pip install -q -U "transformers>=4.38.0" "datasets>=2.18.0" "peft>=0.9.0" "bitsandbytes>=0.41.0" "accelerate>=0.28.0" "safetensors" "sentencepiece" "evaluate" "huggingface_hub"



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.9/504.9 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.9/374.9 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.8/485.8 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[2K 

In [4]:
# imports + device
import os
import math
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer, BitsAndBytesConfig
)
from peft import (
    LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
)
from huggingface_hub import notebook_login
from tqdm.auto import tqdm
from huggingface_hub import login, HfFolder

print("Torch:", torch.__version__, " CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

login(token=HF_TOKEN) 
HfFolder.save_token(HF_TOKEN) 

Torch: 2.6.0+cu124  CUDA available: True
GPU: Tesla T4


In [5]:
# Some globals
MODEL_NAME = "codellama/CodeLlama-7b-hf"
OUTPUT_DIR = "/kaggle/working/ccs_codellama7b_lora"
DATASET_ID = "rsh-raj/ccs_dataset_summarised_diff"
MAX_SEQ_LEN = 1024
PER_DEVICE_BATCH_SIZE = 1
NUM_EPOCHS =3
LEARNING_RATE = 3e-4
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
SEED = 42
torch.manual_seed(SEED)


<torch._C.Generator at 0x7e3c301b3550>

In [6]:
#dataset
ds = load_dataset(DATASET_ID)

# ds["train"][0]


train.csv:   0%|          | 0.00/34.0M [00:00<?, ?B/s]

val.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/200 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400 [00:00<?, ? examples/s]

In [8]:
# Prompt template used for finetuning purpose and aslo for inferencing
PROMPT_HEAD = (
    f"<s>[INST] <<SYS>>\n"
    f"You are a commit classifier based on commit message and code diff."
    f"Please classify the given commit into one of the ten categories: docs, perf, style, refactor, feat, fix, test, ci, build, and chore. The definitions of each category are as follows:\n"
    f"**feat**: Code changes aim to introduce new features to the codebase, encompassing both internal and user-oriented features.\n"
    f"**fix**: Code changes aim to fix bugs and faults within the codebase.\n"
    f"**perf**: Code changes aim to improve performance, such as enhancing execution speed or reducing memory consumption.\n"
    f"**style**: Code changes aim to improve readability without affecting the meaning of the code. This type encompasses aspects like variable naming, indentation, and addressing linting or code analysis warnings.\n"
    f"**refactor**: Code changes aim to restructure the program without changing its behavior, aiming to improve maintainability. To avoid confusion and overlap, we propose the constraint that this category does not include changes classified as ``perf'' or ``style``.\n"
    f"**docs**: Code changes that modify documentation or text, such as correcting typos, modifying comments, or updating documentation.\n"
    f"**test**: Code changes that modify test files, including the addition or updating of tests.\n"
    f"**ci**: Code changes to CI configuration files and scripts, such as configuring or updating CI/CD scripts.\n"
    f"**build**: Code changes affecting the build system (e.g., Maven, Gradle). Change examples include updating dependencies, configuring build configurations, and adding scripts.\n"
    f"**chore**: Code changes for other miscellaneous tasks that do not neatly fit into any of the above categories.\n"
    f"<</SYS>>\n\n"
)
PROMPT_COMMIT_MESSAGE = f"- given commit message:\n{{message}}\n"
PROMPT_COMMIT_DIFF = f"- given commit diff: \n{{diff}}\n"

def applyTemplate(sample):
    return {
        "prompt_commit_message": PROMPT_COMMIT_MESSAGE.format(message=sample["masked_commit_message"]),
        "prompt_commit_diff": PROMPT_COMMIT_DIFF.format(diff=sample["git_diff"]),
        "response": f"[/INST] {sample['annotated_type']} </s>",
    }
# def build_prompt(message, diff):
#     return PROMPT_HEAD + PROMPT_COMMIT_MESSAGE.format(message=message) + PROMPT_COMMIT_DIFF.format(diff=diff)
# temp = applyTemplate(ds["train"][0])
# print(tmp["prompt_commit_message"][:200])
# print("response:", tmp["response"])


In [9]:
# Map template i.e apply template to all the dataset
for i in ds.keys():
    ds[i] = ds[i].map(lambda x: applyTemplate(x))
print("Columns after template:", ds["train"].column_names)


Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Columns after template: ['commit_message', 'sha', 'type', 'annotated_type', 'masked_commit_message', 'git_diff', 'summarised_git_diff', 'prompt_commit_message', 'prompt_commit_diff', 'response']


In [11]:
# Tokenizer load & adjustments
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=False, use_fast=False)
# ensure special tokens exist
if tokenizer.eos_token is None:
    tokenizer.add_special_tokens({"eos_token": "</s>"})
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token




tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [12]:
# Tokenization helper to add labels
def tokenize_add_label(sample, tokenizer=tokenizer, max_seq_len=MAX_SEQ_LEN):
    prompt_head_ids = tokenizer.encode(PROMPT_HEAD, add_special_tokens=False)
    message_ids = tokenizer.encode(sample["prompt_commit_message"], max_length=64, truncation=True, add_special_tokens=False)
    response_ids = tokenizer.encode(sample["response"], max_length=20, truncation=True, add_special_tokens=False)
    remaining = max_seq_len - (len(prompt_head_ids) + len(message_ids) + len(response_ids))
    if remaining < 0:
        message_ids = message_ids[:32]
        remaining = max_seq_len - (len(prompt_head_ids) + len(message_ids) + len(response_ids))
        if remaining < 0:
            response_ids = response_ids[:8]
            remaining = max_seq_len - (len(prompt_head_ids) + len(message_ids) + len(response_ids))
            remaining = max(0, remaining)
    diff_ids = tokenizer.encode(sample["prompt_commit_diff"], max_length=remaining, truncation=True, add_special_tokens=False)
    seq = prompt_head_ids + message_ids + diff_ids + response_ids
    pad_len = max_seq_len - len(seq)
    if pad_len > 0:
        seq += [tokenizer.eos_token_id] * pad_len
    attention_mask = [1]*max_seq_len
    labels = [-100] * (len(prompt_head_ids) + len(message_ids) + len(diff_ids)) + response_ids + [-100] * pad_len
    return {"input_ids": seq, "attention_mask": attention_mask, "labels": labels}

# Apply tokenization to all dataset
# for i in ds.keys():
for i in ds.keys():
    print("Tokenizing key:", i)
    ds[i] = ds[i].map(lambda x: tokenize_add_label(x), remove_columns=ds[i].column_names)
    ds[i].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print("Tokenization finished.")
ds["train"][0]


Tokenizing key: train


Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Tokenizing key: validation


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenizing key: test


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Tokenization finished.


{'input_ids': tensor([    1, 29961, 25580,  ...,   271, 29871,     2]),
 'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1]),
 'labels': tensor([ -100,  -100,  -100,  ...,   271, 29871,     2])}

In [13]:
# BitsAndBytes / quantization config + load model
from transformers import BitsAndBytesConfig

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.float16
# )
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=False,
    torch_dtype=torch.float16
)

print("Model loaded.")


config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Model loaded.


In [None]:
# Prepare our model and apply LoRA (PEFT)
model = prepare_model_for_kbit_training(model)

target_modules = ["q_proj", "v_proj"]

peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=target_modules,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)


In [None]:
# Training arguments & trainer
from transformers import default_data_collator

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,
    save_strategy="epoch",
    num_train_epochs=3,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_strategy="steps",
    logging_steps=50,
    gradient_accumulation_steps=4,
    save_total_limit=2,
    remove_unused_columns=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    data_collator=default_data_collator
)
print("Trainer created.")


In [None]:
# Finetune the model and save
trainer.train()
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
repo_id="allout2726/my-finetuned-model"
model.push_to_hub(repo_id, use_auth_token=HF_TOKEN)
tokenizer.push_to_hub(repo_id, use_auth_token=HF_TOKEN)
print("Training finished and LoRA adapter saved to", OUTPUT_DIR)


In [None]:
# !pip install -q -U "transformers>=4.38.0" "datasets>=2.18.0" "peft>=0.9.0" "bitsandbytes>=0.41.0" "accelerate>=0.28.0" "safetensors" "sentencepiece" "evaluate" "huggingface_hub"
# import os
# import math
# import torch
# from datasets import load_dataset
# from transformers import (
#     AutoTokenizer, AutoModelForCausalLM,
#     TrainingArguments, Trainer, BitsAndBytesConfig
# )
# from peft import (
#     LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
# )
# from huggingface_hub import notebook_login
# from tqdm.auto import tqdm
#inferencing the model on validation dataset
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Load base again
base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto", torch_dtype=torch.float16)
base = PeftModel.from_pretrained(base, OUTPUT_DIR, device_map="auto")  # load LoRA adapter
base.eval()
import re
# List of all possible types for classification
TYPES_LIST = ["docs","perf","style","refactor","feat","fix","test","ci","build","chore"]
def build_prompt(message, diff):
    return PROMPT_HEAD + PROMPT_COMMIT_MESSAGE.format(message=message) + PROMPT_COMMIT_DIFF.format(diff=diff)

# def extractType(text):
#     lowered = text.lower()
#     for t in TYPES_LIST:
#         if (" " + t + " ") in (" " + lowered + " "):
#             return t
#     toks = lowered.replace("[/inst]","").replace("</s>","").strip().split()
#     for w in toks[:10]:
#         if w in TYPES_LIST:
#             return w
#     return None

def extractType(text):
    lowered = text.lower()

    # find all occurrences of [/inst], take last one
    matches = list(re.finditer(r'\[/inst\]', lowered, flags=re.IGNORECASE))
    if matches:
        tail = lowered[matches[-1].end():]  # substring after last [/INST]
    else:
        # no anchor found: fallback to searching entire text but prefer later text.
        # choose the second half of the output to reduce chance of hitting the prompt header.
        mid = max(0, len(lowered) // 2)
        tail = lowered[mid:]

    # remove common closing special tokens and anything after them
    tail = re.split(r'<\/?s>', tail)[0]

    # Trim whitespace at edges
    tail = tail.strip()

    # Search for any allowed type as a whole word in the tail
    for t in TYPES_LIST:
        # \b ensures whole-word match
        if re.search(r'\b' + re.escape(t) + r'\b', tail):
            return t

    # Fallback: look at the first few whitespace-separated tokens and clean punctuation
    tokens = re.split(r'\s+', tail)[:8]
    for tok in tokens:
        cand = re.sub(r'[^a-zA-Z]', '', tok).lower()
        if cand in TYPES_LIST:
            return cand

    # final fallback: search entire text for whole word match (rare)
    for t in TYPES_LIST:
        if re.search(r'\b' + re.escape(t) + r'\b', lowered):
            return t

    return None

# evaluate on validation
test_ds = load_dataset(DATASET_ID)["validation"]# right now inferencing on just 100 samples
correct = 0
total = 0
for sample in tqdm(test_ds):
    prompt = build_prompt(sample["masked_commit_message"], sample["git_diff"])
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LEN).to(base.device)
    with torch.no_grad():
        out = base.generate(**inputs, max_new_tokens=16, do_sample=False, temperature=0.0, eos_token_id=tokenizer.eos_token_id,pad_token_id=tokenizer.eos_token_id,)
    text_output=tokenizer.decode(out[0], skip_special_tokens=False)
    pred = extractType(text_output)
    actual_type = sample["annotated_type"].lower()
    # print(f"output :",text_output)
    print(f"pred: ",pred)
    print(f"actual: ",actual_type)

    if pred == actual_type:
        correct += 1
    total += 1

print(f"Test accuracy: {100*correct/total:.2f}% ({correct}/{total})")


In [14]:
# Inference cell — loads your fine-tuned LoRA adapter from Hugging Face Hub and runs eval.

import os
import re
import torch
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

repo_id = "allout2726/my-finetuned-model"   
hf_token = None
try:
    # hf_token = UserSecretsClient().get_secret("HF_TOKEN")
    # os.environ["HF_TOKEN"] = hf_token
    login(HF_TOKEN)
    print("Authenticated to Hugging Face via Kaggle Secrets.")
except Exception as e:
    hf_token = os.environ.get("HF_TOKEN", None)
    if hf_token:
        try:
            login(hf_token)
            print("Authenticated to Hugging Face via HF_TOKEN env var.")
        except Exception:
            print("HF token present but login failed. Continuing — repo may be public or token not required.")
    else:
        print("No HF token found in Kaggle Secrets or environment. If the repo is private, loading will fail.")

print("Loading tokenizer from:", repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id, use_auth_token=HF_TOKEN)

print("Loading base model:", MODEL_NAME)
base = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    use_auth_token=HF_TOKEN
)

print("Loading LoRA adapter from Hub repo:", repo_id)
base = PeftModel.from_pretrained(
    base,
    repo_id,
    device_map="auto",
    torch_dtype=torch.float16,
    use_auth_token=HF_TOKEN
)

base.eval()
print("Model + adapter loaded. Device:", next(base.parameters()).device)

# helper to build prompt 
def build_prompt(message, diff):
    return PROMPT_HEAD + PROMPT_COMMIT_MESSAGE.format(message=message) + PROMPT_COMMIT_DIFF.format(diff=diff)

TYPES_LIST = ["docs","perf","style","refactor","feat","fix","test","ci","build","chore"]
def extractType(text):
    lowered = text.lower()
    matches = list(re.finditer(r'\[/inst\]', lowered, flags=re.IGNORECASE))
    if matches:
        tail = lowered[matches[-1].end():]
    else:
        mid = max(0, len(lowered) // 2)
        tail = lowered[mid:]
    tail = re.split(r'<\/?s>', tail)[0].strip()
    for t in TYPES_LIST:
        if re.search(r'\b' + re.escape(t) + r'\b', tail):
            return t
    tokens = re.split(r'\s+', tail)[:8]
    for tok in tokens:
        cand = re.sub(r'[^a-zA-Z]', '', tok).lower()
        if cand in TYPES_LIST:
            return cand
    for t in TYPES_LIST:
        if re.search(r'\b' + re.escape(t) + r'\b', lowered):
            return t
    return None

# 3) Evaluate on validation dataset 
test_ds = load_dataset(DATASET_ID)["test"]
correct = 0
total = 0

for sample in tqdm(test_ds):
    prompt = build_prompt(sample["masked_commit_message"], sample["git_diff"])
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LEN).to(next(base.parameters()).device)
    with torch.no_grad():
        out = base.generate(
            **inputs,
            max_new_tokens=16,
            do_sample=False,
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
    text_output = tokenizer.decode(out[0], skip_special_tokens=False)
    pred = extractType(text_output)
    actual_type = sample["annotated_type"].lower()

    print("pred: ", pred)
    print("actual:", actual_type)

    if pred == actual_type:
        correct += 1
    total += 1

print(f"Test accuracy: {100*correct/total:.2f}% ({correct}/{total})")


Authenticated to Hugging Face via Kaggle Secrets.
Loading tokenizer from: allout2726/my-finetuned-model




Loading base model: codellama/CodeLlama-7b-hf




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading LoRA adapter from Hub repo: allout2726/my-finetuned-model


adapter_config.json:   0%|          | 0.00/859 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Model + adapter loaded. Device: cuda:0


  0%|          | 0/400 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


pred:  perf
actual: perf
pred:  ci
actual: ci
pred:  ci
actual: ci
pred:  build
actual: chore
pred:  refactor
actual: feat
pred:  fix
actual: fix
pred:  fix
actual: fix
pred:  feat
actual: feat
pred:  test
actual: test
pred:  ci
actual: ci
pred:  docs
actual: chore
pred:  feat
actual: feat
pred:  refactor
actual: refactor
pred:  refactor
actual: refactor
pred:  perf
actual: perf
pred:  test
actual: test
pred:  ci
actual: ci
pred:  style
actual: style
pred:  feat
actual: feat
pred:  ci
actual: ci
pred:  style
actual: style
pred:  test
actual: test
pred:  test
actual: test
pred:  style
actual: style
pred:  style
actual: style
pred:  perf
actual: perf
pred:  build
actual: build
pred:  test
actual: test
pred:  fix
actual: fix
pred:  build
actual: build
pred:  refactor
actual: refactor
pred:  perf
actual: perf
pred:  feat
actual: feat
pred:  chore
actual: chore
pred:  fix
actual: refactor
pred:  perf
actual: perf
pred:  feat
actual: feat
pred:  docs
actual: docs
pred:  test
actual: test
pre

In [1]:
!ls -la "/kaggle/working/ccs_codellama7b_lora"

ls: cannot access '/kaggle/working/ccs_codellama7b_lora': No such file or directory
