<a href="https://colab.research.google.com/github/ashioyajotham/Natural-Language-Processing/blob/main/Finetuning/Falcon/Fine_Tuning_Falcon_7B_for_Code_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
from datasets import load_dataset

# Specify the name of the dataset
#dataset_name = "yahma/alpaca-cleaned"
dataset_name = "tatsu-lab/alpaca"


# Load the dataset from the specified name and select the "train" split
dataset = load_dataset(dataset_name, split="train")

Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# We will be loading the Falcon 7B model, applying 4bit quantization to it, and then adding LoRA adapters to the model.
import torch

from transformers import FalconForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Defining the name of the Falcon model
model_name = "ybelkada/falcon-7b-sharded-bf16"

# Configuring the BitsAndBytes quantization
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
device_map = 'auto',
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)

# Loading the Falcon model with quantization configuration
model = FalconForCausalLM.from_pretrained(
model_name,
quantization_config=bnb_config,
trust_remote_code=True
)

# Disabling cache usage in the model configuration
model.config.use_cache = False

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Downloading (…)lve/main/config.json:   0%|          | 0.00/581 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00008.bin:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00008.bin:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00008.bin:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

Downloading (…)l-00004-of-00008.bin:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

Downloading (…)l-00005-of-00008.bin:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Downloading (…)l-00006-of-00008.bin:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

Downloading (…)l-00007-of-00008.bin:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

Downloading (…)l-00008-of-00008.bin:   0%|          | 0.00/921M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [5]:
# Load the tokenizer for the Falcon 7B model with remote code trust
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Set the padding token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

In [6]:
# Import the necessary module for LoRA configuration
from peft import LoraConfig

# Define the parameters for LoRA configuration
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

# Create the LoRA configuration object
peft_config = LoraConfig(
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
r=lora_r,
bias="none",
task_type="CAUSAL_LM",
target_modules=[
"query_key_value",
"dense",
"dense_h_to_4h",
"dense_4h_to_h",
]
)

In [7]:
from transformers import TrainingArguments
# Define the directory to save training results
output_dir = "./results"

# Set the batch size per device during training
per_device_train_batch_size = 1

# Number of steps to accumulate gradients before updating the model
gradient_accumulation_steps = 4

# Choose the optimizer type (e.g., "paged_adamw_32bit")
optim = "paged_adamw_32bit"

# Interval to save model checkpoints (every 10 steps)
save_steps = 5

# Interval to log training metrics (every 10 steps)
logging_steps = 5

# Learning rate for optimization
learning_rate = 2e-4

# Maximum gradient norm for gradient clipping
max_grad_norm = 0.3

# Maximum number of training steps
max_steps = 20

# Warmup ratio for learning rate scheduling
warmup_ratio = 0.03

# Type of learning rate scheduler (e.g., "constant")
lr_scheduler_type = "constant"

# Create a TrainingArguments object to configure the training process
training_arguments = TrainingArguments(
output_dir=output_dir,
per_device_train_batch_size=per_device_train_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
optim=optim,
save_steps=save_steps,
logging_steps=logging_steps,
learning_rate=learning_rate,
fp16=True,  # Use mixed precision training (16-bit)
max_grad_norm=max_grad_norm,
max_steps=max_steps,
warmup_ratio=warmup_ratio,
group_by_length=True,
lr_scheduler_type=lr_scheduler_type,
)

In [8]:
dataset = dataset.map(lambda x: {"text": x["input"]+x["output"]})

# Import the SFTTrainer from the TRL library
from trl import SFTTrainer

# Set the maximum sequence length
max_seq_length = 512

# Create a trainer instance using SFTTrainer
trainer = SFTTrainer(
model=model,
train_dataset=dataset,
peft_config=peft_config,
dataset_text_field="text",
max_seq_length=max_seq_length,
tokenizer=tokenizer,
args=training_arguments,
)

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [9]:
# Iterate through the named modules of the trainer's model
for name, module in trainer.model.named_modules():

# Check if the name contains "norm"
  if "norm" in name:
	  # Convert the module to use torch.float32 data type
	  module = module.to(torch.float32)


trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112626844442275, max=1.0…

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The current implementation of Falcon calls `torch.scaled_dot_product_attention` directly, this will be deprecated in the future in favor of the `BetterTransformer` API. Please install the latest optimum library with `pip install -U optimum` and call `model.to_bettertransformer()` to benefit from `torch.scaled_dot_product_attention` and future performance optimizations.


Step,Training Loss
10,1.6917
20,1.6445


TrainOutput(global_step=20, training_loss=1.6681296348571777, metrics={'train_runtime': 181.1636, 'train_samples_per_second': 0.442, 'train_steps_per_second': 0.11, 'total_flos': 367056368432640.0, 'train_loss': 1.6681296348571777, 'epoch': 0.0})

In [None]:
# Save the model
trainer.save_model("./falcon-7b-code-gen")
#tokenizer.save_pretrained("./tokenizer")

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from huggingface_hub import whoami

whoami()
# you should see something like {'type': 'user',  'id': '...',  'name': 'Wauplin', ...}

In [None]:
from huggingface_hub import create_repo

create_repo(repo_id="super-cool-model")


In [None]:
trainer.push_to_hub("super-cool-model")

In [10]:
import transformers
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
sequences = pipeline(
   "Generate a python script to create random numbers between 10 and 100",
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Result: Generate a python script to create random numbers between 10 and 100.
Write a loop which generates a random integer between 10 and 100
Store this integer in a variable
Repeat the loop until the loop is completed.
Generate a python script to create random numbers between 10 and 100.
Write a loop which generates a random integer between 10 and 100.
Store this integer in a variable.
Repeat the loop until the loop is completed.
Write a function to generate a random number between 1 and 10.
The function should return an integer value, which represents the random number.
The function should return the random number between 1 and 10.
Repeat the function as many times as needed to generate a random number.

# Write a function to generate a random number between 1 and 10.
# The function should return an integer value, which represents the random number.
# The function should return the random number


In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" # the device to load the model onto

#model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
#tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"}
]

encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

model_inputs = encodeds.to(device)
#model.to(device)

generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


<|im_start|>user
What is your favourite condiment?<|im_end|>
<|im_start|>assistant
Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!<|im_end|>
<|im_start|>user
Do you have mayonnaise recipes?<|im_end|>
<|im_start|>assistant
Not right now, but I can certainly point you in the right direction. You can find a fantastic recipe right here: <https://foodnetwork.com/recipes/mayonnaise-recipe/>!<|im_end|>
<|im_start|>user
Oh, I've got it bookmarked. Thank you!<|im_end|>

// function sendIM() {
//    var im_start = 'Hello, user!'
//    var im_end = 'Thank you, assistant!'

//    alert( im_start + im_end )
// }
 
// call sendIM()

// function call sendIM() {
//    alert('Here is the text from the chat:')
//    alert( im_start + im_end )
// }
 
// call sendIM()

// function call sendIM() {
//    var im_start = 'Hello, user!'
// var im_end = 'Thank you, assistant!'
 
//    alert( im_start + im_e

In [12]:
from transformers import pipeline
generator = pipeline('text-generation', model = model, tokenizer=tokenizer)
generator("Hello my name is Ashioya", max_length = 100, max_new_tokens=100, do_sample = True, num_return_sequences=1)
## [{'generated_text': "Hello, I'm a language modeler. So while writing this, when I went out to meet my wife or come home she told me that my"},
##  {'generated_text': "Hello, I'm a language modeler. I write and maintain software in Python. I love to code, and that includes coding things that require writing"}, ...


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Both `max_new_tokens` (=100) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'Hello my name is - and I would like to talk about -.\nToday I’ll be talking about my favourite game. I enjoy playing football because of all the physical benefits such as improved cardiovascular health and increased muscle strength. It also provides me with the opportunity to exercise for hours every week, which is something I greatly value. In addition, I enjoy playing football because it provides me with an opportunity for social interaction, which is also something important to me. Furthermore, it allows me to learn valuable lessons'}]

In [None]:
import torch
from transformers import AutoTokenizer, FalconForCausalLM

tokenizer = tokenizer
model = model

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
logits = outputs.logits

print(outputs)

CausalLMOutputWithCrossAttentions(loss=tensor(4.7987, grad_fn=<NllLossBackward0>), logits=tensor([[[-33.1717, -31.6326, -33.7443,  ..., -44.9420, -44.7888, -33.9273],
         [-67.7578, -67.2971, -67.5085,  ..., -73.7265, -72.1330, -66.4876],
         [-55.9774, -56.1142, -57.8333,  ..., -59.0230, -61.3613, -57.2609],
         [-51.8628, -54.9432, -57.4470,  ..., -65.7193, -60.5805, -55.8494],
         [-64.3172, -66.3878, -68.7111,  ..., -74.8744, -71.1791, -67.4290],
         [-47.8095, -51.4803, -55.5864,  ..., -65.3664, -62.5505, -52.1583]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-0.8786,  2.6339,  0.7792,  ..., -1.2441, -0.1573,  1.6261],
          [-1.6237,  2.7957,  1.6042,  ..., -0.9616, -1.8298,  2.1775],
          [-2.0364,  2.3659,  2.6475,  ..., -1.2330, -1.8290,  1.1955],
          [-1.4584,  3.9725,  1.4526,  ..., -1.7670, -2.0023,  1.7966],
          [-2.0888,  2.3305,  2.3091,  ..., -1.2123, -2.8743,  2.2644],
          [-1.6844,  2.4572, 

In [29]:
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"  # the device to load the model onto

model = model
tokenizer = tokenizer

question = "write a python program to check leap year"

# Encode the question
question_input = tokenizer(question, return_tensors="pt", padding=True, truncation=True).to(device)

# Generate an answer
generated_ids = model.generate(question_input["input_ids"], max_length=100, do_sample=True, pad_token_id=tokenizer.eos_token_id)

answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Question:", question)
print("Answer:", answer)

Question: write a python program to check leap year
Answer: write a python program to check leap year.
import datetime
if __name__ == "__main__":
    date_time = datetime.date.today()
    if date_time in list(range(1, 29, 1)):
        print('valid')
    else:
        print('invalid')

 I am trying to write a python program that prompts the user if a given date is a leap year or not. I will check if the date specified
