### Downloading dependencies

In [None]:
!pip install -U trl accelerate peft -i https://pypi.org/simple/ bitsandbytes transformers trl huggingface_hub
%pip install -U datasets

Looking in indexes: https://pypi.org/simple/
Collecting trl
  Downloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface_hub
  Downloading huggingface_hub-0.23.2-py3-none-any.whl (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

### Loading the Libraries

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

### Loading the Data

In [None]:
# filename = "/kaggle/input/sentiment-analysis-for-financial-news/all-data.csv"
filename="ft_data.xlsx"

df = pd.read_excel(filename)
df.head()

Unnamed: 0,intent,query
0,Card: disable,"Need to block my card ASAP, think it's been co..."
1,Card: disable,"Freeze my card, pretty sure I left it at the b..."
2,Card: disable,Can you put a hold on my card? I can't find it...
3,Card: disable,"Ugh, lost my wallet. Disable my card before so..."
4,Card: disable,"Hey, I think my card's been nicked! Lock it do..."


### Data Pre-Processing

In [None]:
X_train = list()
X_test = list()
for intent in df['intent'].unique():
    train, test  = train_test_split(df[df.intent==intent],
                                    train_size=0.4,test_size=0.3,
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train[:1]

[           intent                                              query
 11  Card: disable               I want to block my card immediately.
 3   Card: disable  Ugh, lost my wallet. Disable my card before so...
 18  Card: disable  Can you assist me in pausing my card transacti...
 16  Card: disable   Put a hold on my card, I can't find it anywhere.
 13  Card: disable            How do I go about deactivating my card?
 2   Card: disable  Can you put a hold on my card? I can't find it...
 9   Card: disable  Something's fishy with my card. Shut it down u...
 19  Card: disable     Please deactivate my banking card temporarily.]

In [None]:
X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)
X_test

Unnamed: 0,intent,query
0,Card: disable,"Need to block my card ASAP, think it's been co..."
17,Card: disable,I need to block my card as I suspect it's been...
15,Card: disable,"I lost my wallet, can you disable my card to p..."
1,Card: disable,"Freeze my card, pretty sure I left it at the b..."
8,Card: disable,Put my card on ice; it's gone AWOL.
...,...,...
877,User Account: change password post login,What's the procedure to update my login passwo...
875,User Account: change password post login,Show me steps to change my user account password.
861,User Account: change password post login,"Need to update my current password, what's the..."
868,User Account: change password post login,Change password feature? Where's that at?


### Evaluation or Validation Data

In [None]:
eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
eval_idx[:5]
X_eval = df[df.index.isin(eval_idx)]
X_eval[:5]

Unnamed: 0,intent,query
0,Card: disable,"Need to block my card ASAP, think it's been co..."
1,Card: disable,"Freeze my card, pretty sure I left it at the b..."
2,Card: disable,Can you put a hold on my card? I can't find it...
3,Card: disable,"Ugh, lost my wallet. Disable my card before so..."
4,Card: disable,"Hey, I think my card's been nicked! Lock it do..."


In [None]:
X_eval = (X_eval
          .groupby('intent', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

  .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))


In [None]:
!pip install huggingface-cli

Collecting huggingface-cli
  Downloading huggingface_cli-0.1-py3-none-any.whl (1.0 kB)
Installing collected packages: huggingface-cli
Successfully installed huggingface-cli-0.1


In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: read).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your term

### loading the model

In [None]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

compute_dtype = getattr(torch, "float16")

# Model Configurations
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Loading the Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

# Loading the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [None]:
import json

with open('finetuning_data.jsonl', 'r') as json_file:
    json_list = list(json_file)
data=[]
for json_str in json_list:
    data.append(json.loads(json_str))
# print(data[322]["messages"][1]["content"])

### Transform Data into LLM Training Form using Dataset Library

In [None]:
import pandas as pd
train=pd.DataFrame()

In [None]:
temp=[]
for i in range(len(data)):
   temp.append(f"""<Input> {data[i]['messages'][1]['content']} <Output> {data[i]['messages'][2]['content']}""")

train=pd.DataFrame(temp)

In [None]:
train.rename(columns={'Unnamed: 0':'query'}, inplace=True )

#### The pipeline() function from the Hugging Face Transformers library is used to generate text from the language model. The task argument specifies that the task is text generation. The model and tokenizer arguments specify the pre-trained Phi-2 language model and the tokenizer for the language model. The max_new_tokens argument specifies the maximum number of new tokens to generate. The temperature argument controls the randomness of the generated text. A lower temperature will produce more predictable text, while a higher temperature will produce more creative and unexpected text.

In [None]:
def predict(X_test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["query"]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens = 100, #keeping it large no. like 50 was producing unnecessary
                                            #text making it difficult to extract the intent, ON OTHER HAND
                                            #keeping it small is unable to generate the intents which are bit longer
                        temperature = 0.0,
                       )
        result = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)
        answer = result[0]['generated_text'].lower()
        y_pred.append(answer)
    return y_pred

### Let's Fine Tune Now
We configures and initializes a Simple Fine-tuning Trainer (SFTTrainer) for training a large language model using the Parameter-Efficient Fine-Tuning (PEFT) method.
Which should save time as it operates on a reduced number of parameters compared to the model's overall size.
The PEFT method focuses on refining a limited set of (additional) model parameters, while keeping the majority of the pre-trained LLM parameters fixed.
This significantly reduces both computational and storage expenses. Additionally, this strategy addresses the challenge of catastrophic forgetting, which often occurs during the complete fine-tuning of LLMs.

In [None]:
train_=train[:100]

In [None]:
eval_=train[150:170]

In [None]:
train_

Dataset({
    features: ['0'],
    num_rows: 100
})

In [None]:
from datasets import Dataset
train_=Dataset.from_pandas(train_)
# eval_=Dataset.from_pandas(eval_)

AttributeError: 'Dataset' object has no attribute 'columns'

In [None]:
train

Dataset({
    features: ['0'],
    num_rows: 333
})

In [None]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
training_arguments = TrainingArguments(
    output_dir="logs",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8, # 4
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    evaluation_strategy="epoch"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_,
    eval_dataset=eval_,
    peft_config=peft_config,
    dataset_text_field='0',
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    max_seq_length=512,
)

trainer.train()

# Save trained model
trainer.model.save_pretrained("mistral_conv_intent_sim-model")



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
0,No log,0.108863
2,0.429500,0.100183
4,0.019100,0.105346




In [None]:
trainer.save_model("Mistral_cov_ft_v0.1")

### Predicting the intents for test data

In [None]:
y_pred = predict(X_test, model, tokenizer)

100%|██████████| 264/264 [01:47<00:00,  2.46it/s]


### Accuracy

In [None]:
y_true_list=list(y_true)

In [None]:
count=0
i=0
for i in range(0,len(y_pred)):
    if y_pred[i].split('is')[-1].strip()==y_true_list[i].lower():
        count+=1
    i+=1
print(count*100/len(y_pred))

8.712121212121213


### Outcomes
1. when no. of epoch were 2 accuracy was 3.57% , for 5 epoch it is 8.7%.
2. Need to extract the intent efficiently from the output after increasing max_new_token.