## Installing Necessary Libraries

In [1]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.3 MB/s

# Load the data from csv

In [2]:
import pandas as pd
df_product = pd.read_csv("/content/products.csv")
df_dept = pd.read_csv('/content/departments.csv')

In [3]:
df_joined = pd.merge(df_product, df_dept, on = ['department_id'])
df_joined['text'] = df_joined.apply(lambda row: row['product_name'] + " ->: " + row['department'], axis = 1)

In [4]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df_joined, test_size=0.2, random_state=42)

In [5]:
train_df.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,department,text
7361,10376,Organic Caraway Seeds,104,13,pantry,Organic Caraway Seeds ->: pantry
32534,40131,Original Laundry Detergent,75,17,household,Original Laundry Detergent ->: household
1457,11913,Shelled Pistachios,117,19,snacks,Shelled Pistachios ->: snacks
5201,41392,Harvest Berry Chewy Granola Bars,3,19,snacks,Harvest Berry Chewy Granola Bars ->: snacks
38539,28627,Veganic Sprouted Red Fife Raisin Bran,121,14,breakfast,Veganic Sprouted Red Fife Raisin Bran ->: brea...
18377,29407,Root Vegetable Cakes,42,1,frozen,Root Vegetable Cakes ->: frozen
28627,29571,Alta Dena 1% Milk,84,16,dairy eggs,Alta Dena 1% Milk ->: dairy eggs
30283,3589,Laundry Detergent Pods,75,17,household,Laundry Detergent Pods ->: household
32030,31685,"Daily Shower Cleaner Scrub Free Refill, Fresh ...",114,17,household,"Daily Shower Cleaner Scrub Free Refill, Fresh ..."
28655,30119,Organic French Style Meyer Lemon Yogurt,120,16,dairy eggs,Organic French Style Meyer Lemon Yogurt ->: da...


In [6]:
test_df.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,department,text
33626,24154,Free & Clear Stage 4 Overnight Diapers,56,18,babies,Free & Clear Stage 4 Overnight Diapers ->: babies
18192,27348,"Beef pot roast with roasted potatoes, carrots,...",38,1,frozen,"Beef pot roast with roasted potatoes, carrots,..."
47099,27181,Coffee Liquer,124,5,alcohol,Coffee Liquer ->: alcohol
48183,20577,Bread Rolls,43,3,bakery,Bread Rolls ->: bakery
22197,16472,French Milled Oval Almond Gourmande Soap,25,11,personal care,French Milled Oval Almond Gourmande Soap ->: p...
31573,24121,Dust Pan,114,17,household,Dust Pan ->: household
45362,5477,Roasted Pine Nut Hommus,67,20,deli,Roasted Pine Nut Hommus ->: deli
14131,27921,Cranberry Raspberry Juice Cocktail,98,7,beverages,Cranberry Raspberry Juice Cocktail ->: beverages
26903,4786,Sweet Cream Butter Salted,36,16,dairy eggs,Sweet Cream Butter Salted ->: dairy eggs
39417,8796,Traditional Chicken Barley Soup,69,15,canned goods,Traditional Chicken Barley Soup ->: canned goods


In [7]:
from datasets import Dataset,DatasetDict
train_dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df),
})

In [8]:
train_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['product_id', 'product_name', 'aisle_id', 'department_id', 'department', 'text', '__index_level_0__'],
        num_rows: 39750
    })
})

## Loading the model

In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.config.use_cache = False

config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/14 [00:00<?, ?it/s]

model-00001-of-00014.safetensors:   0%|          | 0.00/981M [00:00<?, ?B/s]

model-00002-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00003-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00004-of-00014.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00005-of-00014.safetensors:   0%|          | 0.00/944M [00:00<?, ?B/s]

model-00006-of-00014.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00007-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00008-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00009-of-00014.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00010-of-00014.safetensors:   0%|          | 0.00/944M [00:00<?, ?B/s]

model-00011-of-00014.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00012-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00013-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00014-of-00014.safetensors:   0%|          | 0.00/847M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer will add the EOS token to the end
of the sequence until the desired length is reached, effectively padding the sequence with the EOS token.

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

# LoRA (Low-Rank Adaptation)

what the base model predicts before finetuning ?

In [11]:
import transformers
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)


sequences = pipeline(
   ["“Free & Clear Stage 4 Overnight Diapers” ->:","Bread Rolls ->:","French Milled Oval Almond Gourmande Soap ->:"],
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq[0]['generated_text']}")



Result: “Free & Clear Stage 4 Overnight Diapers” ->:
 kwietnik
[url=http://www.pills24.com.ua/catalogue/cardio_vascular/]Кардиальные приступы, сердечно-сосудистая патология[/url]
[url=http://www.pills24.com.ua/catalogue/allergic_diseases/]Аллергени[/url]
[url=http://www.pills24.com.ua/catalogue/diagnosis_of_cancer/]Профилактика, диагноз, лечение и профилактика рака в женском организме[/url]
[url=http://www.pills24.com.ua/catalogue/allergic_diseases/]
Result: Bread Rolls ->: 700g
 Hinweis: Die Zutaten sind für 200 bis 250 Rolls geeignet.
Für eine weitere Zubereitung von 1 bis 1,5kg werden 100g weniger Kartoffeln und 150g weniger Butter bzw. Butterzucker und 100g weniger Mehl benötigt.
25g Mehl | 30g Kartoffeln (gemahlen) | 50g Butter | 100g Butterzucker | 135g Kartoffeln (gemahlen) | 130g Mehl | 150g Butter | 95g Butterzucker | 60g Mehl | 150g Butter | 50g Butterzucker
1. Einweichen
Result: French Milled Oval Almond Gourmande Soap ->: https://www. everybody knows that soap is one of the

LoRA (Low-Rank Adaptation), freezes the pre-trained model and adds smaller trainable matrices to each layer.

1.   lora_alpha: This parameter controls the strength of the LORA (Localized Rational Approximation) regularization term in the model. A higher value of alpha will increase the penalty on deviations from the rational approximation, encouraging the model to use simpler functions. The default value is 16.

2.   lora_dropout: This parameter controls the dropout rate for the LORA regularization term. Dropout helps prevent overfitting by randomly setting a fraction of the input units to zero during training. The default value is 0.1, meaning that 10% of the input units will be dropped out at random during training.

3.  target_modules: This list specifies which modules in the transformer architecture should be subject to LORA regularization. In this case, we're targeting the query, key, and value projection layers (q_proj, k_proj, and v_proj), as well as the output projection layer (o_proj).

4. r: This integer represents the number of attention heads that are used in the self-attention mechanism. The default value is 64.

5. bias: This string indicates whether or not to add a bias vector to the output of the attention mechanism. The options are "none" (no bias), "learned" (learn a bias vector), or "fixed" (use a fixed bias vector). The default value is "none".

6.
task_type: This string specifies the type of task the model is designed for. The options are "CAUSAL_LM" (causal language modeling), "NON_CAUSAL_LM" (non-causal language modeling), or "TEXT_CLASSIFICATION" (text classification). The default value is "CAUSAL_LM".


In [17]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


In [12]:
from peft import LoraConfig, get_peft_model

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","v_proj"]
)


## Loading the trainer

1. output_dir: This is the directory where the model's outputs will be saved. This could include things like the model's weights, accuracy, and other metrics.
2. per_device_train_batch_size: This is the batch size that will be used for training the model on each device. A larger batch size can speed up training, but may also cause the model to use more memory.
3. gradient_accumulation_steps: This is the number of steps that gradients will be accumulated before updating the model's weights. Accumulating gradients can help stabilize training and improve convergence.
4. optim: This is the optimization algorithm that will be used to update the model's weights based on the gradients. In this case, paged_adamw_32bit is a variant of Adam optimizer that uses 32-bit floating point numbers to store the model's weights and perform calculations.
6. save_steps: This is the number of steps that the model's weights will be saved at. Saving the model's weights regularly can help prevent losing progress if something goes wrong during training.
7. logging_steps: This is the number of steps that logs will be printed. Logging can help monitor the model's progress and identify any issues that arise during training.
8. learning_rate: This is the rate at which the model will learn from the data. A higher learning rate can cause the model to converge faster, but may also cause it to overshoot the optimal solution.
9. max_grad_norm: This is the maximum norm of the gradients that will be allowed before clipping them. Clipping gradients can help prevent exploding gradients, which can damage the model's weights.
10. max_steps: This is the maximum number of steps that training will run for. If training reaches this limit, it will stop even if the model has not yet converged.
11. warmup_ratio: This is the ratio of training steps that will be used for warmup. During warmup, the model's weights are gradually adjusted to avoid sudden changes that could cause instability.
12. group_by_length: This is a boolean flag indicating whether the model's inputs should be grouped by length. Grouping inputs by length can help improve training efficiency by allowing the model to process inputs of similar lengths together.
13. lr_scheduler_type: This is the type of scheduler that will be used to adjust the model's learning rate during training. In this case, constant indicates that the learning rate will remain constant throughout training. Other schedulers, such as cosine annealing or step learning rate, can be used to adjust the learning rate over time.

In [13]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 1
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 120
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

Then finally pass everthing to the trainer

In [14]:
from trl import SFTTrainer

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_dict['train'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

Map:   0%|          | 0/39750 [00:00<?, ? examples/s]



pre-process the model by upcasting the layer norms in float 32 for more stable training

In [15]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

## Train the model

In [16]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,5.2467
2,3.0896
3,4.3592
4,6.2712
5,4.2501
6,3.0927
7,4.5178
8,3.9697
9,4.4708
10,5.3574


TrainOutput(global_step=120, training_loss=2.7477568581700327, metrics={'train_runtime': 256.5256, 'train_samples_per_second': 7.485, 'train_steps_per_second': 0.468, 'total_flos': 1224209709760512.0, 'train_loss': 2.7477568581700327, 'epoch': 0.05})

In [17]:
new_model='llama-2-7b-finetuned'
trainer.model.save_pretrained(new_model)

In [18]:
lst_test_data = list(test_df['product_name'])
len(lst_test_data)

9938

In [19]:
sample_size = 3
lst_test_data_short = lst_test_data[:sample_size]
lst_test_data_short

['Free & Clear Stage 4 Overnight Diapers',
 'Beef pot roast with roasted potatoes, carrots, sweet onions, green beans, and a rich gravy Beef Pot Roast',
 'Coffee Liquer']

1. transformers.pipeline(): This function creates a pipeline for a given task, in this case, "text-generation". The pipeline takes in several parameters that configure the model, tokenizer, and other aspects of the text generation process.
2. model: This parameter specifies the model architecture to use for text generation. In this case, it's a pre-trained BERT model.
tokenizer: This parameter specifies the tokenizer to use for converting text inputs into numerical tokens. In this case, it's a pre-trained BERT tokenizer.
3. torch_dtype: This parameter specifies the data type of the PyTorch tensors used in the model. In this case, it's set to torch.float16, which means the model will use 16-bit floating-point numbers for its calculations.
4. trust_remote_code: This parameter specifies whether to trust remote code when executing the model. When set to True, the model will execute the code remotely without verifying its integrity.
5. device_map: This parameter specifies how devices are mapped to GPUs. In this case, it's set to "auto", which means the library will automatically determine the best device mapping based on the available hardware.
6. lst_test_data_short: This parameter is a list of input sequences to feed into the model for generating text. In this case, it's a list of short texts.
7. max_length: This parameter specifies the maximum length of the input sequences. In this case, it's set to 100, meaning the model will accept input sequences up to 100 tokens long.
8. do_sample: This parameter specifies whether to sample the input sequence or not. When set to True, the model will sample the input sequence to generate a random subset of tokens.
9. top_k: This parameter specifies the number of most likely candidates to return for each input sequence. In this case, it's set to 10, meaning the model will return the top 10 most likely candidate tokens for each input sequence.
10. num_return_sequences: This parameter specifies the number of sequences to return for each input sequence. In this case, it's set to 1, meaning the model will return a single sequence for each input sequence.
11. eos_token_id: This parameter specifies the ID of the end-of-sequence token. In this case, it's set to the EOS token ID of the tokenizer.


The pipeline() function returns a generator that yields tuples containing the input sequence and the generated text for each input sequence in lst_test_data_short. The generator iterates through the input sequences and passes them through the model to generate the output text. The do_sample parameter causes the model to sample the input sequence, and the top_k parameter limits the number of candidate tokens returned for each input sequence. The num_return_sequences parameter ensures that a single sequence is returned for each input sequence. Finally, the eos_token_id parameter specifies the ID of the end-of-sequence token to use when generating the output text.

In [20]:
import transformers

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # torch_dtype=torch.bfloat16,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

sequences = pipeline(
    lst_test_data_short,
    max_length=100,  #200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

for ix,seq in enumerate(sequences):
    print(ix,seq[0]['generated_text'])

0 Free & Clear Stage 4 Overnight Diapers ->: health personal care baby toiletries diapers & pads diapers ->: babies toddler baby personal care diapers & diaper creams diapers: pants, swim, training ->: babies toddler baby personal care personal care: pants, swim, training pants ->: babies toddler baby personal care personal care: pants, swim, training
1 Beef pot roast with roasted potatoes, carrots, sweet onions, green beans, and a rich gravy Beef Pot Roast With Potatoes ->: freezer foods frozen meals pasta, rice & potatoes pasta meals ->: frozen meals pasta, rice & potatoes pot pies & casserole meals pot roasts & gravies ->: freezer foods frozen meals pasta, rice
2 Coffee Liquer ->: pantry: baking & dry goods coffee ->: beverages coffee, espresso & tea drinks coffee creme ->: babies items infant feeding ->: infant formula ->: powders infant formula milk ->: baby care & personal care baby care ->: personal care baby skin care babies r us ->: household: baby care & personal care ->: per

In [21]:
def correct_answer(ans):
  return (ans.split("->:")[1]).strip()

answers = []
for ix,seq in enumerate(sequences):
    # print(ix,seq[0]['generated_text'])
    answers.append(correct_answer(seq[0]['generated_text']))

answers

['health personal care baby toiletries diapers & pads diapers',
 'freezer foods frozen meals pasta, rice & potatoes pasta meals',
 'pantry: baking & dry goods coffee']

In [22]:
df_evaluate = test_df.iloc[:sample_size][['product_name','department']]

df_evaluate = df_evaluate.reset_index(drop=True)

df_evaluate['department_predicted'] = answers

df_evaluate

Unnamed: 0,product_name,department,department_predicted
0,Free & Clear Stage 4 Overnight Diapers,babies,health personal care baby toiletries diapers &...
1,"Beef pot roast with roasted potatoes, carrots,...",frozen,"freezer foods frozen meals pasta, rice & potat..."
2,Coffee Liquer,alcohol,pantry: baking & dry goods coffee
