<a href="https://colab.research.google.com/github/ajayrao80/CodellamaATP/blob/main/Codellama_ATP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Preparation**

In [None]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install ndjson

In [None]:
import ndjson
import numpy as np

# Fixed seed so the shuffled training order is the same on every Restart & Run All.
SEED = 42

data_path = "/content/drive/MyDrive/Research/DiF-H/Dataset/processed/proofstep-train.jsonl"

# Read the newline-delimited JSON training file into a list of dicts.
# The encoding is explicit so decoding does not depend on the runtime's default.
with open(data_path, encoding="utf-8") as f:
  list_data_dict = ndjson.load(f)

# Shuffle in place with a seeded generator (the previous global, unseeded
# np.random.shuffle produced a different order on every run).
np.random.default_rng(SEED).shuffle(list_data_dict)

# Peek at a single record as a sanity check.
list_data_dict[0]

In [None]:
data_path_eval = "/content/drive/MyDrive/Research/DiF-H/Dataset/processed/proofstep-val.jsonl"

# Read the newline-delimited JSON validation file into a list of dicts.
# The encoding is explicit so decoding does not depend on the runtime's default.
with open(data_path_eval, encoding="utf-8") as f:
  list_data_dict_eval = ndjson.load(f)

# Peek at a single record as a sanity check.
list_data_dict_eval[0]


In [None]:
def _to_columns(records):
  """Pivot a list of records into a column-oriented dict.

  Args:
    records: iterable of dicts, each providing an "input" and an "output" key.

  Returns:
    A dict with two parallel lists under the keys "input" and "output",
    in the same order as `records`.
  """
  return {
      "input": [record["input"] for record in records],
      "output": [record["output"] for record in records],
  }


dataset_train_dict = _to_columns(list_data_dict)
dataset_eval_dict = _to_columns(list_data_dict_eval)

# Show only the split sizes: echoing the dict itself would dump every training
# example into the cell output.
{"train": len(dataset_train_dict["input"]), "eval": len(dataset_eval_dict["input"])}

In [None]:
!pip install git+https://github.com/huggingface/transformers.git@main bitsandbytes accelerate #==0.20.3  # we need latest transformers for this
!pip install git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08
#!pip install git+https://github.com/huggingface/peft.git
!pip install datasets==2.10.1
import locale # colab workaround
#locale.getpreferredencoding = lambda x: "UTF-8" # colab workaround
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
!pip install wandb

In [None]:
from datasets import Dataset

# Wrap both column dicts in Hugging Face Dataset objects in one pass
# (train first, then eval).
train_dataset, eval_dataset = (
    Dataset.from_dict(columns)
    for columns in (dataset_train_dict, dataset_eval_dict)
)

In [None]:
# Display the training split built from dataset_train_dict as a quick sanity check.
train_dataset

In [None]:
# Display the evaluation split built from dataset_eval_dict as a quick sanity check.
eval_dataset

# **Load and Train the Model**

In [None]:
from datetime import datetime
import os
import sys
import torch
from torch.utils.checkpoint import checkpoint
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq,AutoConfig,GPTNeoXForCausalLM

In [None]:
# Hugging Face Hub id of the checkpoint that gets fine-tuned.
base_model = "codellama/CodeLlama-7b-hf"

# Tokenizer and model are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load the weights in 8-bit with fp16 as the torch dtype, and let
# device_map="auto" decide where the layers are placed.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_8bit=True,
)

In [None]:
# Ask the tokenizer to append an EOS token to the sequences it encodes.
tokenizer.add_eos_token = True
# Pad with token id 0.
# NOTE(review): presumably chosen so padding differs from the EOS id — confirm
# which vocabulary entry id 0 is for this tokenizer.
tokenizer.pad_token_id = 0
# Pad on the left side of each sequence.
tokenizer.padding_side = "left"

In [None]:
def tokenize(prompt, add_eos_token=True, CUTOFF_LEN=2048): #1024
    """Tokenize a prompt for causal-LM fine-tuning.

    Args:
        prompt: text to encode with the notebook-level `tokenizer`.
        add_eos_token: when True, make sure a single (non-batched) encoded
            prompt ends with the EOS id, appending it if the tokenizer did not
            and there is still room below CUTOFF_LEN. Previously this argument
            was accepted but ignored.
        CUTOFF_LEN: maximum sequence length; longer prompts are truncated.

    Returns:
        The tokenizer output (unpadded Python lists) with an added "labels"
        entry that is a copy of "input_ids".
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )

    input_ids = result["input_ids"]
    eos_id = tokenizer.eos_token_id
    # Only a single prompt yields a flat list of ints; batched input (a list of
    # lists) is left exactly as the tokenizer produced it.
    is_flat = bool(input_ids) and isinstance(input_ids[0], int)
    if (
        add_eos_token
        and is_flat
        and eos_id is not None
        and input_ids[-1] != eos_id
        and len(input_ids) < CUTOFF_LEN
    ):
        input_ids.append(eos_id)
        # Keep the attention mask the same length as input_ids.
        if "attention_mask" in result:
            result["attention_mask"].append(1)

    # Causal LM objective: the labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# generate and tokenize prompt for Lean dataset
def generate_and_tokenize_prompt(data_point):
    """Render one dataset record as a training prompt and tokenize it.

    The prompt is assembled from explicit pieces rather than an indented
    triple-quoted f-string: the old template leaked the source indentation and a
    leading newline into the training text, and left whitespace after the EOS
    marker so the text did not end on EOS.

    Args:
        data_point: mapping with an "input" and an "output" entry.

    Returns:
        The result of `tokenize` applied to the rendered prompt.
    """
    full_prompt = (
        "### Input:\n"
        f"{data_point['input']}\n"
        "\n"
        "### Response:\n"
        f"{data_point['output']}{tokenizer.eos_token}"
    )
    return tokenize(full_prompt)

In [None]:
# Tokenized dataset
# Each split is mapped through generate_and_tokenize_prompt and its original
# columns are dropped, leaving only the tokenizer outputs (train first, then eval).
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(generate_and_tokenize_prompt, remove_columns=list(split.features))
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Switch to training mode, then run peft's int8-training preparation on the
# 8-bit base model.
model.train()
model = prepare_model_for_int8_training(model)

# LoRA adapter settings: rank-16 adapters on the four attention projections.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Wrap the prepared model with the LoRA adapters described by `config`.
model = get_peft_model(model, config)

In [None]:
resume_from_checkpoint = ""  # For resuming from a checkpoint, insert the .bin file path

# Optionally restore previously saved adapter weights into the PEFT model.
# An empty path means "start from freshly initialised adapters".
if resume_from_checkpoint:
    if os.path.exists(resume_from_checkpoint):
        print(f"Restarting from {resume_from_checkpoint}")
        # map_location keeps deserialisation on the CPU regardless of the device
        # the checkpoint was saved from, matching the inference cell.
        # NOTE: torch.load unpickles the file — only load checkpoints you trust.
        adapters_weights = torch.load(resume_from_checkpoint, map_location=torch.device('cpu'))
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {resume_from_checkpoint} not found")

In [None]:
# View training graphs

# Weights & Biases project name; it is exported through the WANDB_PROJECT
# environment variable only when non-empty.
wandb_project = "DiF-Homunculus"
if wandb_project:
    os.environ["WANDB_PROJECT"] = wandb_project

In [None]:
# On multi-GPU runtimes, flag the model as already parallelised.
if torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    model.is_parallelizable = True
    model.model_parallel = True

In [None]:
# Batch configuration. Note that `batch_size` is not referenced anywhere in this
# cell: each optimizer step covers
# per_device_train_batch_size * gradient_accumulation_steps = 16 sequences per device.
batch_size = 32
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
output_dir = "/content/drive/MyDrive/Research/DiF-H/Checkpoints/"

training_args = TrainingArguments(
    # Where checkpoints go and how the run is reported.
    output_dir=output_dir,
    report_to="wandb",
    run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    logging_steps=10,
    # Batching.
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=True,  # batch sequences of similar length together
    # Optimisation schedule: one epoch, 100 warmup steps, AdamW in fp16.
    num_train_epochs=1,
    warmup_steps=100,
    learning_rate=3e-4,
    optim="adamw_torch",
    fp16=True,
    # Evaluate and checkpoint on the same 1000-step cadence.
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
)

# The collator pads each batch (to a multiple of 8) and returns PyTorch tensors.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, padding=True, pad_to_multiple_of=8, return_tensors="pt"
    ),
)

In [None]:
# Compile the model
# Turn off use_cache on the model config before training.
model.config.use_cache = False

# Patch state_dict on this model instance: calls now go through
# get_peft_model_state_dict applied to the original state dict, instead of
# returning the original state dict directly.
old_state_dict = model.state_dict
model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
    model, type(model)
)
# NOTE(review): the version check is a string comparison, not a parsed-version
# comparison — confirm it behaves as intended for every torch version in use.
if torch.__version__ >= "2" and sys.platform != "win32":
    print("compiling the model")
    # NOTE(review): `trainer` was built in an earlier cell from the uncompiled
    # `model` object; rebinding the name here presumably does not change what the
    # trainer trains — confirm.
    model = torch.compile(model)

In [None]:
# Run fine-tuning with the Trainer and TrainingArguments configured earlier in this notebook.
trainer.train()

In [None]:
# Save the trained model through the Trainer.
# NOTE(review): this is a relative path, unlike output_dir which points at the
# mounted Drive — confirm the saved model ends up somewhere persistent.
trainer.save_model("./DiF-Homunculus-v1")

# **Inference with checkpoint model**

In [None]:
from datetime import datetime
import os
import sys
import torch
from torch.utils.checkpoint import checkpoint
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)

In [None]:
import ndjson

data_path_test = "/content/drive/MyDrive/Research/DiF-H/Dataset/processed/proofstep-test.jsonl"

# Read the newline-delimited JSON test file into a list of dicts.
# The encoding is explicit so decoding does not depend on the runtime's default.
with open(data_path_test, encoding="utf-8") as f:
  list_data_dict_test = ndjson.load(f)

# Peek at a single record as a sanity check.
list_data_dict_test[1]

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

# Hugging Face Hub id of the base checkpoint used for inference.
base_model = "codellama/CodeLlama-7b-hf" # "wellecks/llmstep-mathlib4-pythia2.8b"

# Tokenizer and model are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# For inference the weights are loaded unquantised in float32, with
# device_map="auto" deciding placement.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    torch_dtype=torch.float32,
    load_in_8bit=False,
)

In [None]:
# Same LoRA settings as the training section, so saved adapter weights fit the
# wrapped model.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(model, config)

resume_from_checkpoint = "" # Checkpoint path here

# Restore adapter weights when a checkpoint path is given; an empty path skips
# this step entirely.
if resume_from_checkpoint and not os.path.exists(resume_from_checkpoint):
    print(f"Checkpoint {resume_from_checkpoint} not found")
elif resume_from_checkpoint:
    print(f"Restarting from {resume_from_checkpoint}")
    # Deserialise onto the CPU first, then copy the weights into the PEFT model.
    adapters_weights = torch.load(resume_from_checkpoint, map_location=torch.device('cpu'))
    set_peft_model_state_dict(model, adapters_weights)

In [None]:
# Example proof-step prompt for a quick qualitative check of the model.
eval_prompt = """Prove
### Input:
|- ( ( K e. OML /\\ X e. B ) -> X C X )

### Context:
|- B = ( Base ` K ) \n |- C = ( cm ` K )

### Response:
"""

# Move the encoded prompt to the device the model actually lives on: the model
# is loaded with device_map="auto", so hard-coding "cuda" breaks whenever
# placement ends up elsewhere (e.g. on a CPU-only runtime).
model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)

model.eval()
# No gradients are needed for generation.
with torch.no_grad():
    generated_ids = model.generate(**model_input, max_new_tokens=100)

# Decode the first (only) returned sequence, dropping special tokens.
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))