In [None]:
# Show the GPU(s) visible to this runtime; the training arguments below enable fp16.
!nvidia-smi

In [None]:
# Make the setup script executable, then run it from the current working directory.
# NOTE(review): setup.sh is not shown here; presumably it installs dependencies — confirm.
!chmod +x setup.sh
!./setup.sh

In [None]:
project_name = 'LLM-for-code-intelligence-Project'
import os

path= f'/content/drive/MyDrive/{project_name}'

if not os.path.exists(path):
  os.mkdir(path)

os.chdir(path)

repo_name = 'LLM-for-code-intelligence'
repo_path = f'{path}/{repo_name}'
url = f'https://github.com/ammarnasr/{repo_name}.git'



if not os.path.exists(repo_path):
    #clone the repo
    print('Cloning the repo...')
    !git clone $url
else:
    #pull the repo
    print('Pulling the repo...')
    !git -C $repo_name pull

os.chdir(repo_path)

print(f'Current Dir: {os.getcwd()}')

In [None]:
import os
# os.chdir('/content/drive/MyDrive/LLM-for-code-intelligence-Project/LLM-for-code-intelligence/Finetuning')
import torch
from dataclasses import dataclass
from datasets import load_dataset
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed
)
from finetuning_datasets import ConstantLengthDataset

In [None]:
model_id = "Salesforce/codegen-350M-mono"
tokenizer_id = "Salesforce/codegen-350M-mono"
dataset_id = "ammarnasr/bigcode-the-stack-dedup-java-small-subset"

# Seed for the train/validation split. Without it every kernel restart draws a
# different validation subset, so a resumed run could train on rows that were
# previously held out and eval loss would not be comparable between sessions.
SPLIT_SEED = 42

tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
# use_cache=False: the training arguments below enable gradient checkpointing.
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, use_cache=False)

# Train on windows that are 1/32 of the model's maximum context length.
model_seq_length = model.config.max_position_embeddings
effective_seq_length = model_seq_length//32
print(f"Model Sequence Length: {model_seq_length}")
print(f"Effective Sequence Length: {effective_seq_length}")

# Hold out 0.01% of the 'train' split as a validation set, reproducibly.
full_ds = load_dataset(dataset_id)['train']
dataset = full_ds.train_test_split(test_size=0.0001, shuffle=True, seed=SPLIT_SEED)
train_ds = dataset["train"]
valid_ds = dataset["test"]

# Wrap both splits in ConstantLengthDataset with the shortened sequence length;
# the training wrapper is infinite, the validation wrapper is not.
train_dataset = ConstantLengthDataset(tokenizer, train_ds, infinite=True, seq_length=effective_seq_length)
valid_dataset = ConstantLengthDataset(tokenizer, valid_ds, infinite=False, seq_length=effective_seq_length)
print(f"Train Dataset Length: {len(train_ds)}")
print(f"Valid Dataset Length: {len(valid_ds)}")

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir codegne-finetuned-the-stack-java-v2

In [None]:
# Run bookkeeping: where to write, how long to train, how often to evaluate/save/log.
default_args = {
    "output_dir": "codegne-finetuned-the-stack-java-v3",
    "run_name": "run-1-full-v3",
    "dataloader_drop_last": True,
    "max_steps": 500000,
    "eval_steps": 50,
    "save_steps": 100,
    "evaluation_strategy": "steps",
    "logging_steps": 1,
    # Optional: "push_to_hub": True
}

# Optimizer and learning-rate schedule.
optimizer_args = {
    # Optional: "optim": "adafactor"
    "learning_rate": 5e-5,
    "warmup_steps": 10,
    "lr_scheduler_type": "cosine",
    "weight_decay": 0.05,
}

# Memory and speed settings.
memory_speed_args = {
    "gradient_checkpointing": True,
    # Optional: "gradient_accumulation_steps": 1
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "fp16": True,
}

# Merge the three groups in the same order they are listed above.
training_args_dict = {**default_args, **optimizer_args, **memory_speed_args}
training_args = TrainingArguments(**training_args_dict)

# Echo the resolved values (including library defaults for the options left unset).
print(
    '============Default Training Args============',
    f'Output Dir: {training_args.output_dir}',
    f'Dataloader Drop Last: {training_args.dataloader_drop_last}',
    f'Max Steps: {training_args.max_steps}',
    f'Eval Steps: {training_args.eval_steps}',
    f'Save Steps: {training_args.save_steps}',
    f'Evaluation Strategy: {training_args.evaluation_strategy}',
    f'Logging Steps: {training_args.logging_steps}',
    f'Push To Hub: {training_args.push_to_hub}',
    sep='\n',
)

print(
    '============Optimizer Training Args============',
    f'Optimizer: {training_args.optim}',
    f'Learning Rate: {training_args.learning_rate}',
    f'Warmup Steps: {training_args.warmup_steps}',
    f'LR Scheduler Type: {training_args.lr_scheduler_type}',
    f'Weight Decay: {training_args.weight_decay}',
    sep='\n',
)

print(
    '============Memory and Speed Training Args============',
    f'Gradient Checkpointing: {training_args.gradient_checkpointing}',
    f'Gradient Accumulation Steps: {training_args.gradient_accumulation_steps}',
    f'Per Device Train Batch Size: {training_args.per_device_train_batch_size}',
    f'Per Device Eval Batch Size: {training_args.per_device_eval_batch_size}',
    f'FP16: {training_args.fp16}',
    sep='\n',
)

In [None]:
# Bind the model, the training arguments and both datasets to a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [None]:
# Fresh run: start training from step 0 (no checkpoint is loaded).
trainer.train(resume_from_checkpoint=None)

In [None]:
# Continue an interrupted run from the newest checkpoint in the output directory.
# `resume_from_checkpoint=True` raises ValueError when no checkpoint has been
# saved yet (e.g. the session died before the first save), so look the checkpoint
# up explicitly and fall back to a fresh run in that case.
from transformers.trainer_utils import get_last_checkpoint

last_checkpoint = None
if os.path.isdir(training_args.output_dir):
    # get_last_checkpoint lists the folder, so only call it when the folder exists.
    last_checkpoint = get_last_checkpoint(training_args.output_dir)

if last_checkpoint is None:
    print(f'No checkpoint found in {training_args.output_dir}; training from scratch.')
else:
    print(f'Resuming from {last_checkpoint}')

trainer.train(resume_from_checkpoint=last_checkpoint)

#### Push to hub

In [None]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint folders are addressed relative to the Finetuning directory.
os.chdir('/content/drive/MyDrive/LLM-for-code-intelligence-Project/LLM-for-code-intelligence/Finetuning')

# Which run and which checkpoint to load.
main_dir = 'codegne-finetuned-the-stack-java-v2'
checkpoint = 'checkpoint-200'

# Local checkpoint folder and the hub repository name derived from it.
ckpt_path = f'{main_dir}/{checkpoint}'
repo_path = f'{main_dir}-{checkpoint}'

# The tokenizer comes from the base model id; the weights come from the checkpoint.
model_id = 'Salesforce/codegen-350M-mono'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(ckpt_path, trust_remote_code=True, use_cache=False)

In [None]:
# Select the checkpoint to publish and derive both its local folder and hub repo name.
checkpoint = 'checkpoint-800'
ckpt_path = f'{main_dir}/{checkpoint}'
repo_path = f'{main_dir}-{checkpoint}'

# Reload the weights for the checkpoint actually being published. Changing only the
# name would upload whatever `model` currently holds (checkpoint-200 from the cell
# above) under a repository labelled as a different checkpoint.
model = AutoModelForCausalLM.from_pretrained(ckpt_path, trust_remote_code=True, use_cache=False)
model.push_to_hub(repo_path)