**Install Required Libraries**

In [None]:
!pip install -q datasets accelerate evaluate torch
!pip install -U -q transformers timm
!pip install -q rouge_score jiwer
!pip install -q peft
!pip install -q -U bitsandbytes
!pip install -q trl

**Obtain Hugging Face Access**

In [None]:
# Read the Hugging Face access token through google.colab's `userdata` helper.
# The value is looked up under the key 'HF_TOKEN' and reused by later cells
# when talking to the Hugging Face Hub.
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')


**Prepare the Dataset**

In [None]:
from datasets import load_dataset

# Fetch the Java instruction/response dataset
dataset = load_dataset('Nan-Do/instructional_code-search-net-java')

# Inspect which columns the train split provides
train_columns = dataset['train'].column_names
print(train_columns)

**Load the Model and Tokenizer**

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from accelerate import PartialState

# Load the StarCoder2 tokenizer and a 4-bit quantized copy of the model.

model_name = "bigcode/starcoder2-3b"
# `token` is the current name of the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)

# 4-bit NF4 quantization of the weights; compute is carried out in bfloat16
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

# Load the model with the specified BitsAndBytesConfig.
# `model_name` is reused (instead of repeating the literal) so that tokenizer and
# model always come from the same checkpoint, and the same access token is used.
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        # Place the whole model on the device that belongs to this process
        device_map={"": PartialState().process_index},
        token=HF_TOKEN,
    )

In [None]:
# Add a dedicated padding token to the tokenizer before preprocessing
# (the preprocessing below pads every sequence with padding="max_length").
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Resize the model's token embeddings to the tokenizer's new vocabulary size
# so that the added padding token has an embedding entry.
model.resize_token_embeddings(len(tokenizer))

def preprocess_function(examples):
    """Tokenize instruction/response pairs and build loss-masked labels.

    Args:
        examples: Either a batch (as passed by ``Dataset.map(..., batched=True)``,
            where ``INSTRUCTION`` and ``RESPONSE`` are lists of strings) or a
            single example (both values are plain strings).

    Returns:
        The tokenizer output for the instructions, with an additional
        ``labels`` entry holding the tokenized responses in which every
        padding token id has been replaced by -100.
    """
    # Tokenize the inputs (INSTRUCTION column), padded/truncated to 512 tokens
    inputs = tokenizer(examples['INSTRUCTION'], padding="max_length", truncation=True, max_length=512)

    # Tokenize the responses (used as labels for supervised learning).
    # The deprecated `as_target_tokenizer()` context manager is no longer used;
    # the tokenizer is simply called on the response text.
    labels = tokenizer(examples['RESPONSE'], padding="max_length", truncation=True, max_length=512)

    pad_token_id = tokenizer.pad_token_id

    def _mask_padding(sequence):
        # Padding positions must not contribute to the loss, so they become -100
        return [-100 if token == pad_token_id else token for token in sequence]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one list of token ids per example, so the mask has to be
        # applied inside each sequence (comparing a whole list to the pad id is
        # never true and would leave the padding unmasked).
        inputs["labels"] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # Single example: a flat list of token ids
        inputs["labels"] = _mask_padding(label_ids)

    # NOTE(review): `input_ids` hold the INSTRUCTION tokens while `labels` hold the
    # RESPONSE tokens, so the two are not aligned position by position. Causal-LM
    # training normally expects labels aligned with input_ids (prompt + response
    # in one sequence) - confirm this is the intended objective.
    return inputs
# Apply preprocessing: run preprocess_function over the train split,
# passing it batches of rows (batched=True) rather than single rows.
train_dataset = dataset['train'].map(preprocess_function, batched=True)

In [None]:
# Hold out 20% of the tokenized examples for evaluation. The fixed seed makes the
# shuffle, and therefore the split, reproducible across kernel restarts.
# NOTE: this cell rebinds `train_dataset`; re-running it without re-running the
# preprocessing cell splits the already reduced training set again.
train_eval_split = train_dataset.train_test_split(test_size=0.2, seed=0)
train_dataset = train_eval_split['train']
eval_dataset = train_eval_split['test']

**Set Up Training Arguments**

In [None]:
import argparse

def _str2bool(value):
    """Convert a command-line string such as "true"/"false" into a real bool."""
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def get_args(argv=None):
    """Build the training configuration as an ``argparse.Namespace``.

    Args:
        argv: Optional list of command-line style overrides, e.g.
            ``["--max_steps", "500", "--bf16", "false"]``. When omitted, an
            empty argument list is parsed so that only the defaults apply
            (the notebook kernel's own ``sys.argv`` is never read).

    Returns:
        argparse.Namespace: one attribute per option defined below.
    """
    parser = argparse.ArgumentParser()

    # Model
    parser.add_argument("--model_id", type=str, default="bigcode/starcoder2-3b")  # Default to StarCoder2-3b

    # Training Parameters
    parser.add_argument("--max_seq_length", type=int, default=512)  # Maximum sequence length for input and labels
    parser.add_argument("--max_steps", type=int, default=2000)  # Total training steps
    parser.add_argument("--micro_batch_size", type=int, default=1)  # Batch size per device
    parser.add_argument("--gradient_accumulation_steps", type=int, default=4)  # Gradient accumulation steps
    parser.add_argument("--weight_decay", type=float, default=0.01)  # Weight decay for regularization
    # `type=bool` would turn any non-empty string (even "False") into True,
    # so boolean options are parsed with an explicit converter instead.
    parser.add_argument("--bf16", type=_str2bool, default=True)  # Use bfloat16 precision

    # Optimizer and Learning Rate
    parser.add_argument("--attention_dropout", type=float, default=0.1)  # Dropout for attention layers
    parser.add_argument("--learning_rate", type=float, default=2e-4)  # Learning rate
    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")  # Learning rate scheduler type
    parser.add_argument("--warmup_steps", type=int, default=100)  # Warmup steps for learning rate

    # Other Settings
    parser.add_argument("--seed", type=int, default=0)  # Random seed for reproducibility
    parser.add_argument("--output_dir", type=str, default="./starcoder_finetuned")  # Output directory for saving the model
    parser.add_argument("--num_proc", type=int, default=None)  # Number of processes for data preprocessing (if applicable)
    parser.add_argument("--push_to_hub", type=_str2bool, default=False)  # Whether to push the model to the Hugging Face Hub

    # Parse the explicit argument list (empty by default => defaults only)
    args = parser.parse_args([] if argv is None else list(argv))
    return args

In [None]:
# from transformers import TrainingArguments

# # Training Arguments
# training_args = TrainingArguments(
#     output_dir="./starcoder_finetuned",  # Directory to save model outputs
#     num_train_epochs=30,                  # Number of training epochs
#     per_device_train_batch_size=1,       # Batch size per GPU (adjust based on memory)
#     gradient_accumulation_steps=8,       # Accumulate gradients to simulate larger batch size
#     evaluation_strategy="steps",         # Evaluate during training
#     save_strategy="steps",               # Save model checkpoints during training
#     logging_dir="./logs",                # Directory for logs
#     logging_steps=50,                    # Log every 50 steps
#     save_steps=500,                      # Save model every 500 steps
#     learning_rate=5e-5,                  # Learning rate
#     weight_decay=0.01,                   # Weight decay
#     fp16=True,                           # Use mixed precision for faster training
#     push_to_hub=False                    # Set to True if pushing the model to Hugging Face Hub
# )

**Define Metrics**

In [None]:
from evaluate import load

# Instantiate each evaluation metric once, keyed by the name it is loaded under
metric_names = (
    "accuracy",
    "perplexity",
    "rouge",
    "bleu",
    "f1",
    "precision",
    "recall",
    "meteor",
    "wer",
)
metrics = {metric_name: load(metric_name) for metric_name in metric_names}

def compute_metrics(p):
    """Compute token-level and text-level metrics for an evaluation pass.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` are either logits
            (one more dimension than ``labels``) or already token ids;
            ``labels`` are token ids in which ignored positions are -100.

    Returns:
        dict: metric name -> the value returned by that metric's ``compute``.
        The "perplexity" entry of ``metrics`` is skipped (see below).
    """
    import numpy as np  # local import: numpy is not imported elsewhere in this notebook

    predictions, labels = p
    if isinstance(predictions, tuple):
        # presumably the first element holds the logits when extra outputs are returned
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits must be turned into token ids *before* anything is decoded
    if predictions.ndim == labels.ndim + 1:
        pred_ids = predictions.argmax(axis=-1)
    else:
        pred_ids = predictions

    # A causal LM predicts token i+1 at position i, so align predictions with
    # the labels shifted by one position.
    pred_ids = pred_ids[:, :-1]
    target_ids = labels[:, 1:]

    # Positions labelled -100 are ignored by the loss and must be ignored here too
    keep = target_ids != -100

    # -100 is not a valid token id, so substitute the pad token before decoding
    pad_token_id = tokenizer.pad_token_id
    decoded_preds = tokenizer.batch_decode(np.where(keep, pred_ids, pad_token_id), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(np.where(keep, target_ids, pad_token_id), skip_special_tokens=True)

    # Classification-style metrics expect flat 1-D sequences of class ids
    flat_preds = pred_ids[keep].tolist()
    flat_labels = target_ids[keep].tolist()

    averaged_token_metrics = ("f1", "precision", "recall")

    # Initialize a dictionary to store the results
    results = {}

    for metric_name, metric in metrics.items():
        if metric_name == "perplexity":
            # The `evaluate` perplexity metric scores raw text with a model it loads
            # itself and takes no references, so it cannot be called with
            # (predictions, references). Derive perplexity from the eval loss instead.
            continue
        if metric_name == "accuracy":
            results[metric_name] = metric.compute(predictions=flat_preds, references=flat_labels)
        elif metric_name in averaged_token_metrics:
            # Token ids form a multi-class problem; the default binary averaging does not apply
            results[metric_name] = metric.compute(
                predictions=flat_preds, references=flat_labels, average="macro"
            )
        else:
            # Text metrics (ROUGE, BLEU, METEOR, WER) work on the decoded strings
            results[metric_name] = metric.compute(predictions=decoded_preds, references=decoded_labels)

    return results


**Define Trainer**

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter configuration: rank-8 adapters on the listed projection modules.
# NOTE(review): confirm these names against the model's actual module names
# (e.g. via model.named_modules()); presumably names that match nothing are
# simply not adapted.
lora_config = LoraConfig(
        r=8,
        target_modules=[
            "q_proj",
            "o_proj",
            "k_proj",
            "v_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        task_type="CAUSAL_LM",
    )

# The base model was loaded in 4-bit, so prepare it for k-bit training before
# attaching adapters (the step the PEFT quantization guide prescribes for QLoRA).
model = prepare_model_for_kbit_training(model)

# Apply the PEFT model with QLoRA.
# NOTE: this rebinds `model`; re-running the cell would wrap the model again.
model = get_peft_model(model, lora_config)


In [None]:
from trl import SFTTrainer
import transformers

args = get_args()

# train_dataset / eval_dataset were tokenized by preprocess_function and already
# contain input_ids / attention_mask / labels, so no `formatting_func` is passed
# (a formatting function has to return text, not tokenizer output).
# `model` has already been wrapped with LoRA adapters by get_peft_model, so
# `peft_config` is not passed again either.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Hand over the tokenizer that carries the added [PAD] token so that the
    # trainer uses (and saves) the same tokenizer the data was prepared with.
    processing_class=tokenizer,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=args.micro_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        warmup_steps=args.warmup_steps,
        max_steps=args.max_steps,
        learning_rate=args.learning_rate,
        lr_scheduler_type=args.lr_scheduler_type,
        weight_decay=args.weight_decay,
        bf16=args.bf16,  # Use bf16 precision
        logging_strategy="steps",
        logging_steps=10,  # Log every 10 steps
        eval_strategy="steps",  # current name of the deprecated `evaluation_strategy`
        eval_steps=100,  # Evaluate every 100 steps
        save_strategy="steps",
        save_steps=100,  # Save every 100 steps (must line up with eval_steps for load_best_model_at_end)
        output_dir=args.output_dir,  # Output directory
        optim="paged_adamw_8bit",  # Optimizer
        seed=args.seed,  # Random seed
        run_name=f"train-{args.model_id.split('/')[-1]}",  # Run name
        report_to="wandb",  # Report to Weights & Biases (if installed)
        load_best_model_at_end=True,  # Load the best model at the end
    ),
)

**Start Fine-Tuning**

In [None]:
# Run fine-tuning with the configuration set on the trainer
trainer.train()

# Write the final model and the tokenizer to the root of the output directory.
# The training arguments only configure periodic checkpoint saves, while the
# inference cell at the end of the notebook loads from this directory itself.
trainer.save_model(args.output_dir)
tokenizer.save_pretrained(args.output_dir)

**Evaluate the Model**

In [None]:
# Evaluate the model on the validation set (the eval_dataset given to the trainer)
eval_results = trainer.evaluate()

# Print everything that trainer.evaluate() returned
print("Evaluation results:", eval_results)


**Push the Model to the Hugging Face Hub**

In [None]:
# Upload the fine-tuned model first, then its tokenizer, to the same Hub repository
repo_path="BSAtlas/BSCode-1-Stable"
for artifact in (model, tokenizer):
    artifact.push_to_hub(repo_path, token=HF_TOKEN)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the fine-tuned model and tokenizer from the local output directory.
# NOTE(review): this rebinds `model_name`, `tokenizer` and `model`, replacing the
# objects that were used for training earlier in the notebook.
model_name = "./starcoder_finetuned"  # Path to your fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Ensure the model is in evaluation mode
model.eval()

# Function to generate documentation and optimize Java code
def generate_documentation_and_optimization(input_text, max_length=1024):
    """Generate a completion for ``input_text`` with the fine-tuned model.

    Args:
        input_text: The prompt (instruction plus Java code) as a string.
        max_length: Used both as the truncation limit for the tokenized prompt
            and as the ``max_length`` passed to ``model.generate``.

    Returns:
        str: The first generated sequence, decoded with special tokens skipped.
    """
    # Tokenize input text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

    # Ensure the model runs on the right device (GPU if available)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Sample one sequence. `do_sample=True` is required for top_k / top_p /
    # temperature to take effect; without it they are not applied.
    output = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id,  # pass the tokenizer's pad id explicitly
    )

    # Decode the generated tokens back to text
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    return decoded_output

# Prompt passed to the model: a natural-language instruction followed by the
# Java class that should be documented and optimized.
input_code = """
Given the following Java code, first generate the documentation in the form of comments for the class and methods. Then, optimize the code to improve efficiency or readability:

public class FactorialCalculator {
    public static int calculateFactorial(int n) {
        if (n == 0) return 1;
        return n * calculateFactorial(n - 1);
    }

    public static void main(String[] args) {
        int number = 5;
        System.out.println("Factorial of " + number + " is " + calculateFactorial(number));
    }
}
"""

# Get the model's response (the decoded text of the generated sequence)
response = generate_documentation_and_optimization(input_code)

# Print the generated documentation and optimized code
print("Generated Documentation and Optimized Code:")
print(response)
