In [None]:
# Run the NVIDIA status tool in a shell to check which GPU (if any) this runtime has.
!nvidia-smi

In [None]:
# Colab-only: mount Google Drive at /content/drive so later cells can read and
# write files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Mark setup.sh as executable and run it from the current working directory.
# NOTE(review): setup.sh is not part of this notebook; presumably it installs
# the project's dependencies - confirm, and confirm it exists in the cwd here.
!chmod +x setup.sh
!./setup.sh

In [None]:
project_name = 'LLM-for-code-intelligence-Project'
import os

path= f'/content/drive/MyDrive/{project_name}'

if not os.path.exists(path):
  os.mkdir(path)

os.chdir(path)

repo_name = 'LLM-for-code-intelligence'
repo_path = f'{path}/{repo_name}'
url = f'https://github.com/ammarnasr/{repo_name}.git'



if not os.path.exists(repo_path):
    #clone the repo
    print('Cloning the repo...')
    !git clone $url
else:
    #pull the repo
    print('Pulling the repo...')
    !git -C $repo_name pull

os.chdir(repo_path)

print(f'Current Dir: {os.getcwd()}')

#### Train

In [1]:
import os
# os.chdir('/content/drive/MyDrive/LLM-for-code-intelligence-Project/LLM-for-code-intelligence/Finetuning')
import torch
from dataclasses import dataclass
from datasets import load_dataset
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed
)
from finetuning_datasets import ConstantLengthDataset

In [2]:
# Hugging Face Hub identifiers for the base model, its tokenizer and the dataset.
model_id = "Salesforce/codegen-350M-mono"
tokenizer_id = "Salesforce/codegen-350M-mono"
dataset_id = "ammarnasr/bigcode-the-stack-dedup-java-small-subset"

# Resume settings: when using_checkpoint is True, the first
# checkpoint_number * checkpoint_batch_size training rows are dropped below.
using_checkpoint = False
checkpoint_number = 1000
checkpoint_batch_size = 16

# Upper bound on how many validation rows are kept for evaluation.
max_valid_examples = 100

tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, use_cache=False)

# Sequence length is taken from the model config.
# NOTE(review): "//1" is a no-op; presumably a placeholder divisor for
# experimenting with shorter sequences - confirm.
model_seq_length = model.config.max_position_embeddings
effective_seq_length = model_seq_length//1
print(f"Model Sequence Length: {model_seq_length}")
print(f"Effective Sequence Length: {effective_seq_length}")

dataset = load_dataset(dataset_id)
train_ds = dataset["train"]
valid_ds = dataset["valid"]

# Keep at most the first max_valid_examples validation rows. The count is
# clamped to the split size so that a split with fewer rows does not make
# select() receive indices past the end of the dataset.
valid_ds = valid_ds.select(list(range(min(max_valid_examples, len(valid_ds)))))

# If using_checkpoint, skip the rows covered by the first checkpoint_number
# batches of the training dataset.
if using_checkpoint:
    orginal_length = len(train_ds)
    indices_to_keep = list(range(checkpoint_number*checkpoint_batch_size, orginal_length))
    train_ds = train_ds.select(indices_to_keep)
    print(f"Original Length: {orginal_length}")
    print(f"New Length: {len(train_ds)}")
    print(f"Precentage of data used: {len(train_ds)/orginal_length}")


# Wrap both splits in the project's ConstantLengthDataset; the training one is
# created with infinite=True, the validation one with infinite=False.
train_dataset = ConstantLengthDataset(tokenizer, train_ds, infinite=True, seq_length=effective_seq_length)
valid_dataset = ConstantLengthDataset(tokenizer, valid_ds, infinite=False, seq_length=effective_seq_length)
print(f"Train Dataset Length: {len(train_ds)}")
print(f"Valid Dataset Length: {len(valid_ds)}")

Model Sequence Length: 2048
Effective Sequence Length: 2048


Found cached dataset parquet (C:/Users/Edin/.cache/huggingface/datasets/ammarnasr___parquet/ammarnasr--bigcode-the-stack-dedup-java-small-subset-21491941b0298a53/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

Original Length: 806789
New Length: 790789
Precentage of data used: 0.9801682967913544
Train Dataset Length: 790789
Valid Dataset Length: 897


In [None]:
# Load a fresh copy of the base model to be wrapped with LoRA adapters.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    use_cache=False,
)

# LoRA hyper-parameters: alpha is tied to twice the rank.
lora_rank = 64
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["qkv_proj", "out_proj", "lm_head", "fc_in", "fc_out"],
    r=lora_rank,
    lora_alpha=2 * lora_rank,
    lora_dropout=0.05,
    bias="all",
)

# NOTE(review): enabling input grads before wrapping is presumably needed for
# gradient checkpointing with a mostly frozen base model - confirm.
model.enable_input_require_grads()
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

In [None]:
# Start TensorBoard inside the notebook, reading logs from the directory used
# as "output_dir" in the training-arguments cell.
# If the extension is already loaded, use the commented-out reload line instead.
#  %reload_ext tensorboard
%load_ext tensorboard
%tensorboard --logdir codegne-finetuned-LoRa-the-stack-java-v4

In [None]:
# Every TrainingArguments override lives in one dict, grouped by purpose.
# Commented-out entries are options that are currently left at their defaults.
training_args_dict = {
    # --- Default args ---
    "output_dir": "codegne-finetuned-LoRa-the-stack-java-v4",
    "run_name": "run-1-LoRa-v4",
    "dataloader_drop_last": True,
    "max_steps": 1000,
    "eval_steps": 50,
    "save_steps": 100,
    "evaluation_strategy": "steps",
    "logging_steps": 1,
    # "push_to_hub": True
    # --- Optimizer args ---
    # "optim": "adafactor",
    "learning_rate": 5e-5,
    "warmup_steps": 100,
    "lr_scheduler_type": "cosine",
    # "weight_decay": 0.05,
    # --- Memory and speed args ---
    "gradient_checkpointing": True,
    # "gradient_accumulation_steps": 2,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    "fp16": True,
}
training_args = TrainingArguments(**training_args_dict)


def _print_args_section(header, labelled_attrs):
    """Print `header`, then one `Label: value` line per (label, attribute name) pair.

    Values are read from the module-level `training_args` at call time.
    """
    print(header)
    for label, attr_name in labelled_attrs:
        print(f'{label}: {getattr(training_args, attr_name)}')


# Echo the effective values (including defaults that were not overridden).
_print_args_section('============Default Training Args============', [
    ('Output Dir', 'output_dir'),
    ('Run Name', 'run_name'),
    ('Dataloader Drop Last', 'dataloader_drop_last'),
    ('Max Steps', 'max_steps'),
    ('Eval Steps', 'eval_steps'),
    ('Save Steps', 'save_steps'),
    ('Evaluation Strategy', 'evaluation_strategy'),
    ('Logging Steps', 'logging_steps'),
    ('Push To Hub', 'push_to_hub'),
])

_print_args_section('============Optimizer Training Args============', [
    ('Optimizer', 'optim'),
    ('Learning Rate', 'learning_rate'),
    ('Warmup Steps', 'warmup_steps'),
    ('LR Scheduler Type', 'lr_scheduler_type'),
    ('Weight Decay', 'weight_decay'),
])

_print_args_section('============Memory and Speed Training Args============', [
    ('Gradient Checkpointing', 'gradient_checkpointing'),
    ('Gradient Accumulation Steps', 'gradient_accumulation_steps'),
    ('Per Device Train Batch Size', 'per_device_train_batch_size'),
    ('Per Device Eval Batch Size', 'per_device_eval_batch_size'),
    ('FP16', 'fp16'),
])

In [None]:
# Trainer for the fresh LoRA run: the PEFT-wrapped model, the arguments built
# above, and the train/validation datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [None]:
# Start training with the Trainer built in the previous cell; as the cell's
# last expression, the value returned by train() is displayed as the output.
trainer.train()

In [None]:
from peft import PeftConfig, PeftModel

# Adapter checkpoint to resume from. Note that the path points at the "v3" run
# directory and that this reassigns the notebook-level checkpoint_number.
checkpoint_number = 100
repo_name = f'./codegne-finetuned-LoRa-the-stack-java-v3/checkpoint-{checkpoint_number}'

# The saved adapter config is read here only to find the base model name.
# PeftModel.from_pretrained(ckpt, repo_name) below is not given this object and
# loads the adapter configuration from repo_name itself, so editing fields on
# `config` (rank, alpha, dropout, bias, target modules) has no effect on the
# loaded model. The former field overrides were dead code and were removed.
config = PeftConfig.from_pretrained(repo_name)

# Rebuild the base model, then attach the saved LoRA adapter weights to it.
ckpt = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, trust_remote_code=True, use_cache=False)
ckpt.enable_input_require_grads()
ckpt = PeftModel.from_pretrained(ckpt, repo_name)

# Mark every parameter whose name contains 'lora' as trainable so that
# training can continue from the checkpoint.
for name, param in ckpt.named_parameters():
  if 'lora' in name:
    param.requires_grad = True
ckpt.print_trainable_parameters()

# The shared training_args object is modified in place: the resumed run gets
# its own run name and no warmup. Re-create training_args before starting
# another fresh run, otherwise these values carry over.
training_args.run_name = f'run-2-LoRa-v3-checkpoint-{checkpoint_number}'
training_args.warmup_steps = 0

In [None]:
# Trainer for the resumed run: the checkpoint-loaded model with the (modified)
# shared training arguments and the same datasets as the fresh run.
trainer_ckpt = Trainer(
    model=ckpt,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [None]:
# Continue training with the checkpoint-loaded model; the value returned by
# train() is displayed as the cell output.
trainer_ckpt.train()

#### Push to hub

In [None]:
# Switch to the Finetuning folder of the checkout on Google Drive so that the
# relative checkpoint path used in the next cell resolves.
# The path is hard-coded for the Colab + Drive layout.
import os
os.chdir('/content/drive/MyDrive/LLM-for-code-intelligence-Project/LLM-for-code-intelligence/Finetuning')

In [None]:
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM

# Local adapter checkpoint that will be uploaded in the next cell.
repo_name = './codegne-finetuned-LoRa-the-stack-java-v3/checkpoint-800'

# The adapter config supplies the name of the base model to rebuild.
config = PeftConfig.from_pretrained(repo_name)
ckpt = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    trust_remote_code=True,
    use_cache=False,
)
ckpt.enable_input_require_grads()

# Attach the saved LoRA adapter weights to the base model.
ckpt = PeftModel.from_pretrained(ckpt, repo_name)

# Flag every LoRA parameter as trainable, then report the trainable count.
for param_name, weight in ckpt.named_parameters():
    if 'lora' not in param_name:
        continue
    weight.requires_grad = True
ckpt.print_trainable_parameters()

In [None]:
# Upload the loaded adapter model to the Hugging Face Hub under this repository
# name. repo_name is reassigned here from the local checkpoint path to the Hub
# repo id.
# NOTE(review): this needs Hub credentials configured in the runtime - confirm.
repo_name = 'codegne-finetuned-LoRa-the-stack-java-v3-checkpoint-800'
ckpt.push_to_hub(repo_name)

In [None]:
# Start TensorBoard inside the notebook for the "v2" run directory.
# NOTE(review): other cells use the v3/v4 directories; confirm that v2 is the
# intended log directory here.
# If the extension is already loaded, use the commented-out reload line instead.
#  %reload_ext tensorboard
%load_ext tensorboard
%tensorboard --logdir codegne-finetuned-LoRa-the-stack-java-v2