In [1]:
print("Hello")

Hello


In [2]:
! pip install transformers peft datasets accelerate bitsandbytes

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_

In [3]:
from datasets import Dataset , load_dataset

In [4]:
from transformers import (
    AutoModelForCausalLM , AutoTokenizer, TrainingArguments, Trainer,BitsAndBytesConfig,DataCollatorForLanguageModeling
)

In [5]:
from peft import(
    LoraConfig, get_peft_model, PeftModel
)

In [6]:
import pandas as pd
import torch

In [7]:
from huggingface_hub import login

In [8]:
login("")

In [9]:
class LoraFinetunning:
        def __init__(self , model_name ,dataset_name ,  output_dir):
            self.model_name = model_name
            self.dataset_name = dataset_name
            self.output_dir = output_dir
            self.model = None
            self.tokenizer= None
            self.tokenized_data = None

        def load_tokenizer(self):
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name , trust_remote_code=True)
            self.tokenizer.pad_token=self.tokenizer.eos_token

        def load_quantized_model(self):
            bnb_config= BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype = torch.float16,
            )

            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name ,
                device_map={"": 0},
                trust_remote_code=True,
                quantization_config=bnb_config
            )

        def apply_lora(self):
            config=LoraConfig(
                r=16,
                lora_alpha=32,
                target_modules=["q_proj", "v_proj"],
                lora_dropout=0.05,
                bias="none",
                task_type="CAUSAL_LM"
            )
            self.model=get_peft_model(self.model,config)


        def load_data(self):
            data=load_dataset(self.dataset_name,'main',split="train")
            data_df = data.to_pandas()

            if "question" in data_df.columns and "answer" in data_df.columns:
                  data_df["text"] = data_df.apply(lambda x: f"question: {x['question']} answer: {x['answer']}", axis=1)
            else:
                  text_column = data_df.columns[0]
                  data_df["text"] = data_df[text_column]

            data = Dataset.from_pandas(data_df)

            def tokenize(sample):
                return self.tokenizer(sample["text"], padding=True, truncation=True, max_length=512)

            self.tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data", remove_columns=data.column_names)

        def train(self,epochs: int = 1, batch_size: int = 4, learning_rate: float = 2e-4, max_steps: int = 1000):
            training_args = TrainingArguments(
              output_dir=self.output_dir,  # Model save directory
              per_device_train_batch_size=batch_size,
              gradient_accumulation_steps=1,
              learning_rate=learning_rate,
              lr_scheduler_type="cosine",
              save_strategy="epoch",
              logging_steps=100,
              max_steps=max_steps,
              num_train_epochs=epochs,
              push_to_hub=False,  # Enable uploading
              report_to="none"
          )


            trainer = Trainer(
              model=self.model,
              train_dataset=self.tokenized_data,
              args=training_args,
              data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
            )

            trainer.train()

        def run(self):
            self.load_tokenizer()
            self.load_quantized_model()
            self.apply_lora()
            self.load_data()
            self.train()
            print("model trained")

In [10]:
model_name="microsoft/phi-1_5"
dataset_name="gsm8k"
output_dir="phi-1_5-finetuned"

In [11]:
fine_tuner=LoraFinetunning(model_name,dataset_name,output_dir)

In [12]:
fine_tuner.run()

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Tokenizing data:   0%|          | 0/7473 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
100,1.1544
200,1.0653
300,1.0334
400,1.0451
500,1.0508
600,1.027
700,0.9836
800,1.0132
900,1.0277
1000,1.0277


model trained


In [13]:
fine_tuner.model.save_pretrained("phi-1_5-finetuned")
fine_tuner.tokenizer.save_pretrained("phi-1_5-finetuned")

('phi-1_5-finetuned/tokenizer_config.json',
 'phi-1_5-finetuned/special_tokens_map.json',
 'phi-1_5-finetuned/vocab.json',
 'phi-1_5-finetuned/merges.txt',
 'phi-1_5-finetuned/added_tokens.json',
 'phi-1_5-finetuned/tokenizer.json')

In [14]:
model = AutoModelForCausalLM.from_pretrained("phi-1_5-finetuned")
tokenizer = AutoTokenizer.from_pretrained("phi-1_5-finetuned")

In [30]:
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=True

In [20]:
!pip install huggingface_hub


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [23]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import login

# Replace with your actual token from https://huggingface.co/settings/tokens
token 

login(token=token)




In [26]:
from huggingface_hub import whoami

print(whoami())


{'type': 'user', 'id': '675751e35baa5036fd5f3f9b', 'name': 'abhijeetalande12', 'fullname': 'Abhijeet Alande', 'email': 'abhijeetalande12@gmail.com', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/d4d3623a103326471cbf158ebe6c37a3.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'collage', 'role': 'write', 'createdAt': '2025-03-10T13:17:37.631Z'}}}


In [32]:
from huggingface_hub import HfApi

api = HfApi()
repo_id = "abhijeetalande12/phi-1_5-finetuned"  # Your repo name

# Create the repo (set `exist_ok=True` to avoid errors if it already exists)
api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
print(f"Repository {repo_id} created successfully!")


Repository abhijeetalande12/phi-1_5-finetuned created successfully!


In [33]:
api.upload_folder(
    folder_path="phi-1_5-finetuned",  # Change this to your actual model folder path
    repo_id="abhijeetalande12/phi-1_5-finetuned",
    repo_type="model",
    token=token  # Ensure you are logged in
)
print("Model uploaded successfully!")



adapter_model.safetensors:   0%|          | 0.00/12.6M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/25.2M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/12.6M [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

Model uploaded successfully!
