### Downloading dependencies

In [1]:
!pip install -U trl accelerate peft -i https://pypi.org/simple/ bitsandbytes transformers trl huggingface_hub
%pip install -U datasets

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple/
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Loading the Libraries

In [29]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

### Loading the Data

In [20]:
# filename = "/kaggle/input/sentiment-analysis-for-financial-news/all-data.csv"
filename="ft_data.xlsx"

df = pd.read_excel(filename)
df.head()

Unnamed: 0,intent,query
0,Card: disable,"Need to block my card ASAP, think it's been co..."
1,Card: disable,"Freeze my card, pretty sure I left it at the b..."
2,Card: disable,Can you put a hold on my card? I can't find it...
3,Card: disable,"Ugh, lost my wallet. Disable my card before so..."
4,Card: disable,"Hey, I think my card's been nicked! Lock it do..."


### Data Pre-Processing

In [21]:
X_train = list()
X_test = list()
for intent in df['intent'].unique():
    train, test  = train_test_split(df[df.intent==intent],
                                    train_size=0.4,test_size=0.3,
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train[:1]

[           intent                                              query
 11  Card: disable               I want to block my card immediately.
 3   Card: disable  Ugh, lost my wallet. Disable my card before so...
 18  Card: disable  Can you assist me in pausing my card transacti...
 16  Card: disable   Put a hold on my card, I can't find it anywhere.
 13  Card: disable            How do I go about deactivating my card?
 2   Card: disable  Can you put a hold on my card? I can't find it...
 9   Card: disable  Something's fishy with my card. Shut it down u...
 19  Card: disable     Please deactivate my banking card temporarily.]

In [22]:
X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)
X_test

Unnamed: 0,intent,query
0,Card: disable,"Need to block my card ASAP, think it's been co..."
17,Card: disable,I need to block my card as I suspect it's been...
15,Card: disable,"I lost my wallet, can you disable my card to p..."
1,Card: disable,"Freeze my card, pretty sure I left it at the b..."
8,Card: disable,Put my card on ice; it's gone AWOL.
...,...,...
877,User Account: change password post login,What's the procedure to update my login passwo...
875,User Account: change password post login,Show me steps to change my user account password.
861,User Account: change password post login,"Need to update my current password, what's the..."
868,User Account: change password post login,Change password feature? Where's that at?


### Evaluation or Validation Data

In [23]:
eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
eval_idx[:5]
X_eval = df[df.index.isin(eval_idx)]
X_eval[:5]

Unnamed: 0,intent,query
0,Card: disable,"Need to block my card ASAP, think it's been co..."
1,Card: disable,"Freeze my card, pretty sure I left it at the b..."
2,Card: disable,Can you put a hold on my card? I can't find it...
3,Card: disable,"Ugh, lost my wallet. Disable my card before so..."
4,Card: disable,"Hey, I think my card's been nicked! Lock it do..."


In [24]:
X_eval = (X_eval
          .groupby('intent', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

  .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))


### loading the model

In [32]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

compute_dtype = getattr(torch, "float16")

# Model Configurations
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Loading the Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

# Loading the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
import json

with open('finetuning_data.jsonl', 'r') as json_file:
    json_list = list(json_file)
data=[]
for json_str in json_list:
    data.append(json.loads(json_str))
# print(data[322]["messages"][1]["content"])

### Transform Data into LLM Training Form using Dataset Library 

In [9]:
import pandas as pd
train=pd.DataFrame()

In [24]:
temp=[]
for i in range(len(data)):
   temp.append(f"""<Input> {data[i]['messages'][1]['content']} <Output> {data[i]['messages'][2]['content']}""")

train=pd.DataFrame(temp)

In [39]:
eval_=train[15:35]

In [38]:
train.rename(columns={'Unnamed: 0':'query'}, inplace=True )

#### The pipeline() function from the Hugging Face Transformers library is used to generate text from the language model. The task argument specifies that the task is text generation. The model and tokenizer arguments specify the pre-trained Phi-2 language model and the tokenizer for the language model. The max_new_tokens argument specifies the maximum number of new tokens to generate. The temperature argument controls the randomness of the generated text. A lower temperature will produce more predictable text, while a higher temperature will produce more creative and unexpected text.

In [41]:
def predict(X_test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["query"]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens = 100, #keeping it large no. like 50 was producing unnecessary
                                            #text making it difficult to extract the intent, ON OTHER HAND 
                                            #keeping it small is unable to generate the intents which are bit longer
                        temperature = 0.0,
                       )
        result = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)
        answer = result[0]['generated_text'].lower()
        y_pred.append(answer)
    return y_pred

In [35]:
train.rename{0:"query"}

SyntaxError: invalid syntax (4028962767.py, line 1)

### Let's Fine Tune Now
We configures and initializes a Simple Fine-tuning Trainer (SFTTrainer) for training a large language model using the Parameter-Efficient Fine-Tuning (PEFT) method.
Which should save time as it operates on a reduced number of parameters compared to the model's overall size.
The PEFT method focuses on refining a limited set of (additional) model parameters, while keeping the majority of the pre-trained LLM parameters fixed.
This significantly reduces both computational and storage expenses. Additionally, this strategy addresses the challenge of catastrophic forgetting, which often occurs during the complete fine-tuning of LLMs.

In [43]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
training_arguments = TrainingArguments(
    output_dir="logs",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8, # 4
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    evaluation_strategy="epoch"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train,
    # eval_dataset=eval_,
    peft_config=peft_config,
    dataset_text_field="query",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    max_seq_length=512,
)

trainer.train()

# Save trained model
trainer.model.save_pretrained("mistral_conv_intent_sim-model")

AttributeError: 'DataFrame' object has no attribute 'column_names'

### Predicting the intents for test data

In [14]:
y_pred = predict(X_test, model, tokenizer) 

100%|██████████| 264/264 [01:47<00:00,  2.46it/s]


### Accuracy

In [15]:
y_true_list=list(y_true)

In [16]:
count=0
i=0
for i in range(0,len(y_pred)):
    if y_pred[i].split('is')[-1].strip()==y_true_list[i].lower():
        count+=1
    i+=1
print(count*100/len(y_pred))  

8.712121212121213


### Outcomes
1. when no. of epoch were 2 accuracy was 3.57% , for 5 epoch it is 8.7%.
2. Need to extract the intent efficiently from the output after increasing max_new_token.