<a href="https://colab.research.google.com/github/baptiste-bedouret/Mistral7B-Finetuned/blob/master/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup Runtime
For fine-tuning Mistral, a GPU instance is essential. Follow the directions below:

1. Go to `Runtime` (located in the top menu bar).
2. Select `Change Runtime Type`.
3. Choose `T4 GPU` (or a comparable option).


## Packages installation

In [None]:
# Mount Google Drive at /content/drive; the dataset files read later in this
# notebook live under "/content/drive/My Drive/...".
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install transformers accelerate trl torch bitsandbytes peft datasets outlines -qU

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.1/74.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

## Load the dataset

In [None]:
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split (and every row count and example shown
# further down) is the same after a kernel restart.
SPLIT_SEED = 42

# Load the annotated reviews and split them into 3500 training / 1000 test rows.
# Note: this is the `Dataset.train_test_split` method, not the sklearn function
# imported above.
dataset = (
    load_dataset(
        "json",
        data_files="/content/drive/My Drive/Smart-Data/Renault/Dataset_Annotated.json",
        split='train',
    ).train_test_split(train_size=3500, test_size=1000, seed=SPLIT_SEED)
)
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tasks', 'completions'],
        num_rows: 3500
    })
    test: Dataset({
        features: ['tasks', 'completions'],
        num_rows: 1000
    })
})


#### Remove the rows whose language is not English:

In [None]:
def remove(example):
    """Return True when the row's ``tasks["langue"]`` value is anything other than ``"EN"``."""
    language = example["tasks"]["langue"]
    return not (language == "EN")

# Drop every row flagged by remove(), i.e. keep only the "EN" rows of each split
for split_name in ("train", "test"):
    dataset[split_name] = dataset[split_name].filter(
        lambda row, row_idx: not remove(row), with_indices=True
    )

Filter:   0%|          | 0/3500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Row counts of both splits after the language filter
for split_name in ("train", "test"):
    print(len(dataset[split_name]))

1750
485


#### Remove 300 more rows from the training data

In [None]:
# Number of leading training rows to discard.
N_TRAIN_ROWS_TO_DROP = 300

# Despite the name, these are simply the first N indices of the training split.
# A range answers `idx in ...` in O(1); the previous list was scanned once per row.
random_indices_to_remove = range(N_TRAIN_ROWS_TO_DROP)
dataset["train"] = dataset["train"].filter(
    lambda example, idx: idx not in random_indices_to_remove, with_indices=True
)

Filter:   0%|          | 0/1750 [00:00<?, ? examples/s]

In [None]:
# Row counts of both splits after dropping the extra training rows
for split_name in ("train", "test"):
    print(len(dataset[split_name]))

1450
485


#### Remove columns from 'tasks':

In [None]:
columns_to_remove = ['id', 'date', 'pays', 'langue', 'score']

def remove_columns_from_tasks(entry):
    """Delete the keys listed in ``columns_to_remove`` from ``entry['tasks']``.

    The nested dict is modified in place; keys that are absent are ignored.
    Returns the same ``entry`` object.
    """
    task_fields = entry['tasks']
    for unwanted_key in columns_to_remove:
        if unwanted_key in task_fields:
            del task_fields[unwanted_key]
    return entry

# Run remove_columns_from_tasks over every row of the DatasetDict (both splits)
dataset = dataset.map(remove_columns_from_tasks)

Map:   0%|          | 0/1450 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

#### Examples of texts from train and test:

In [None]:
# Review text of training row 2 (row-first indexing: fetch the row, then the field)
print(dataset['train'][2]['tasks']['text'])

Very friendly personalised approach to customer needs and requirements.


In [None]:
# Review text of test row 21 (row-first indexing: fetch the row, then the field)
print(dataset['test'][21]['tasks']['text'])

Overall very pleased 
Didn’t get a reminder my MOT was due though and I forgot until I went to tax the car


#### Remove columns from 'completions':

In [None]:
completions_columns_to_remove = ['intensity', 'span']

def remove_columns_from_completions(entry):
    """Delete the keys in ``completions_columns_to_remove`` from each completion.

    Every dict in ``entry['completions']`` is modified in place; keys that are
    absent are ignored. Returns the same ``entry`` object.
    """
    for annotation in entry['completions']:
        stale_keys = [key for key in completions_columns_to_remove if key in annotation]
        for key in stale_keys:
            del annotation[key]
    return entry

# Run remove_columns_from_completions over every row of the DatasetDict (both splits)
dataset = dataset.map(remove_columns_from_completions)

Map:   0%|          | 0/1450 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

#### Examples of categories followed by the sentiment for train and test:

In [None]:
# Category/polarity annotations of training row 2
print(dataset['train'][2]['completions'])

[{'category': 'Welcome-Kindness-Warmth-Friendliness', 'polarity': 'positive'}]


In [None]:
# Category/polarity annotations of test row 21
print(dataset['test'][21]['completions'])

[{'category': 'General satisfaction', 'polarity': 'positive'}, {'category': 'Recommendation-intended loyalty', 'polarity': 'positive'}]


#### Load the dataset containing the categories:

In [None]:
# CSV holding the category names (column 'CONCEPTS NIVEAU 2')
CATEGORIES_CSV = "/content/drive/My Drive/Smart-Data/Renault/Dataset_categories.csv"

category_list = load_dataset("csv", data_files=CATEGORIES_CSV)
print(category_list)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['CONCEPTS NIVEAU 2'],
        num_rows: 72
    })
})


In [None]:
# First five category names
print(category_list['train'][:5]['CONCEPTS NIVEAU 2'])

['Welcome-Kindness-Warmth-Friendliness', 'Listenning-Care', 'Attention-Assistance-Effort', 'Correct contact', 'Quality of the relationship']


#### Use of outlines

Outlines provides a powerful domain-specific language to write and manage prompts, via what we call prompt functions. Prompt functions are Python functions that contain a template for the prompt in their docstring, and their arguments correspond to the variables used in the prompt. When called, a prompt function returns the template rendered with the values of the arguments.

In [None]:
# import outlines

# @outlines.prompt
# def formatting_prompts_funco(instructions, categories, review, examples):

#     # Prompt template
#     """<s>[INST]###Instruction:
#     {{ instructions }}[/INST]
#     {% for category in categories %}
#     - {{ category }}
#     {% endfor %}

#     ### Examples
#     {% for example in examples %}
#     - {{ example }}
#     {% endfor %}

#     ### Review
#     {{ review }}
#     ### Answer:
#     <\s>"""

# instructions = " Classify the following review in one or more of the following categories. Indicate the polarity: positive, negative or neutral. Give an answer in the same format as in the examples. Don't add other comments."
# examples = [
#     [{'category': 'Attention-Assistance-Effort', 'polarity': 'positive'},
#       {'category': 'Quality of the relationship', 'polarity': 'positive'},
#       {'category': 'Attention to detail', 'polarity': 'positive'},
#       {'category': 'Impression of competence', 'polarity': 'positive'}],
#     [{'category': 'Replacement of part', 'polarity': 'positive'},
#       {'category': 'Time taken for work', 'polarity': 'negative'}]
# ]
# # Define the list of reviews
# reviews = [task['text'] for task in dataset['train']['tasks']]

# # Define the list of categories
# categories = category_list['train']['CONCEPTS NIVEAU 2']

# # Define the list of responses
# responses = dataset['train']['completions']

# prompts = []
# model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2")
# for review in reviews:
#     prompt = formatting_prompts_funco(instructions, categories, review, examples)
#     prompts.append(prompt)

# # Example of prompt template
# # print(prompts[0])
# for i in range(3):
#     answer = outlines.generate.text(model)(prompts[i], max_tokens=100)
#     print(answer)
#     print("\n---------------------------------------------------------------\n")

In [None]:
# print(dataset['train']['completions'][2])

In [None]:
#print(prompts[0])
#print(prompts[1])
#print(prompts[2])

Très bien accueilli et excellente prise en charge.
La personne a permis la révision demandée mais elle a également remarqué que ma voiture était attaquée par des nuisible au niveau de l'isolant du capot moteur. Elle a mis en place un piège pour éviter que cela cause encore plus de dégâts. Grand Merci !
Response:

[{'category': 'Welcome-Kindness-Warmth-Friendliness', 'polarity': 'positive'}, {'category': 'Attention-Assistance-Effort', 'polarity': 'positive'}, {'category': 'Correct contact', 'polarity': 'positive'}, {'category': 'Problem not diagnosed-not resolved', 'polarity': 'neutral'}, {'category': 'Explanation of work carried out', '

#### Without outlines

Create the formatted prompt:

```
<s>[INST]### Instruction:
Classify this text in one or more of the following categories:
{list of categories}
[/INST]

[INST]### Additional Instruction:
For each of these categories, indicate whether the text is positive, neutral, or negative.
[/INST]

### Input:
{input}

### Response:
{response}</s>
```

The code below uses one-shot prompting: a single worked example (`examples[1]`) is included in every prompt.

In [None]:
examples = [
    [{'category': 'Attention-Assistance-Effort', 'polarity': 'positive'},
      {'category': 'Quality of the relationship', 'polarity': 'positive'},
      {'category': 'Attention to detail', 'polarity': 'positive'},
      {'category': 'Impression of competence', 'polarity': 'positive'}],
    [{'category': 'Replacement of part', 'polarity': 'positive'},
      {'category': 'Time taken for work', 'polarity': 'negative'}]
]

def formatting_prompts_func(example, categories=None):
    """Render one training prompt per row of a batch.

    Args:
        example: batch-like mapping where ``example['tasks'][i]['text']`` is the
            review text and ``example['completions'][i]`` is its annotation list.
        categories: category names to list in the instruction. Defaults to the
            'CONCEPTS NIVEAU 2' column of the notebook-level ``category_list``.

    Returns:
        A list of prompt strings, one per row, each ending with ``</s>``.
    """
    if categories is None:
        # Read the column once per call instead of once per row.
        categories = category_list['train']['CONCEPTS NIVEAU 2']

    output_texts = []
    for i in range(len(example['completions'])):
        # The end marker is "</s>"; the previous "<\s>" was plain text, not the
        # EOS token.
        # NOTE(review): the tokenizer in this notebook is also configured to add
        # BOS/EOS itself, so the literal markers may end up duplicated - confirm.
        text = f"""<s>[INST]### Instruction:
  Classify the following review in one or more of the following categories. Indicate the polarity: positive, negative or neutral. Give an answer in the same format as in the example. Don't add other comments and don't create new categories which are not in the provided ones.
  {categories}
  [/INST]
  ### Example:
  {examples[1]}
  ### Review:
  {example["tasks"][i]['text']}
  ### Answer:
  {example["completions"][i]}
  </s>"""

        output_texts.append(text)
    return output_texts

In [None]:
# Format only the first training row; passing the whole split would render
# every prompt just to display one.
formatting_prompts_func(dataset['train'][:1])[0]

"<s>[INST]### Instruction:\n  Classify the following review in one or more of the following categories. Indicate the polarity: positive, negative or neutral. Give an answer in the same format as in the example. Don't add other comments and don't create new categories which are not in the provided ones.\n  ['Welcome-Kindness-Warmth-Friendliness', 'Listenning-Care', 'Attention-Assistance-Effort', 'Correct contact', 'Quality of the relationship', 'Explanation of work to be done', 'Explanation of work carried out', 'Authorisation before additional work', 'Honesty-Confidence', 'Explanation of invoice', 'Clarity-transmission of information', 'Information regarding the progress of the work', 'Wait for appointment', 'Respect timeframe for work', 'Time taken for work', 'Availability of parts', 'Wait in reception', 'Time dedicated to me', 'Efficiency of the organisation', 'Delivery time', 'Price', 'Value for money', 'Respect of price and promises', 'Refund-Goodwill gesture', 'Part-exchange', 'Do

## Loading and training Mistral 7B model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, platform, warnings
from trl import SFTTrainer
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
import numpy as np

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
# Identifier passed to from_pretrained() for both the model and the tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

In [None]:
# Load the base model (Mistral 7B) with 4-bit NF4 quantization
# NOTE(review): the setup section recommends a T4 GPU, which presumably has no
# native bfloat16 support - consider torch.float16 as compute dtype there; verify.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0}  # place the whole model on device 0
)
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load tokenizer
# trust_remote_code is not passed: it would allow code from the model
# repository to run locally, and it is presumably not needed for this
# tokenizer - verify.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
# NOTE(review): with add_eos_token=True every tokenized text - including the
# inference prompts - gets an EOS appended.
tokenizer.add_eos_token = True
# Display both flags as the cell output.
tokenizer.add_bos_token, tokenizer.add_eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

(True, True)

Let's examine how well the model does at this task before fine-tuning:

In [None]:
# Inference prompt prefix: instruction, list of allowed categories and one worked
# example. It is built from `category_list` and `examples` rather than from a
# pasted copy of their contents, so it cannot drift from the data used to build
# the training prompts.
prompt = (
    "[INST]### Instruction:\n Classify the following review in one or more of the following categories. Indicate the polarity: positive, negative or neutral. Give an answer in the same format as in the example. Don't add other comments and don't create new categories which are not in the provided ones.\n  "
    f"{list(category_list['train']['CONCEPTS NIVEAU 2'])}[/INST]\n"
    f"  ### Example:{examples[1]}\n"
)
def generate_response1(prompt):
    """Generate the model's answer for the first review of the test split.

    Args:
        prompt: prompt prefix (instruction, categories, example); the review and
            the "### Answer:" marker are appended to it.

    Returns:
        The decoded sequence (prompt followed by the generated continuation).
    """
    # Only the first review is used, so only that prompt is built.
    first_review = dataset['test'][0]['tasks']['text']
    prompt_with_review = f"{prompt}  ### Review:{first_review}\n  ### Answer:\n  "

    max_length = 1024
    encoded_input = tokenizer(prompt_with_review, max_length=max_length, return_tensors="pt", add_special_tokens=True, padding=True, truncation=True)

    # The tokenizer is configured with add_eos_token=True, so the encoded prompt
    # ends with EOS. Generating after an EOS is what triggers the "right-padding
    # was detected" warning (pad == EOS here), so trailing EOS tokens are removed.
    eos_id = tokenizer.eos_token_id
    input_ids = encoded_input["input_ids"]
    keep = input_ids.shape[1]
    while keep > 1 and input_ids[0, keep - 1].item() == eos_id:
        keep -= 1
    model_inputs = {name: tensor[:, :keep].to('cuda') for name, tensor in encoded_input.items()}

    generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)

    decoded_output = tokenizer.batch_decode(generated_ids)

    return decoded_output[0]

In [None]:
generate_response1(prompt)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


"<s> [INST]### Instruction:\n Classify the following review in one or more of the following categories. Indicate the polarity: positive, negative or neutral. Give an answer in the same format as in the example. Don't add other comments and don't create new categories which are not in the provided ones.\n  ['Welcome-Kindness-Warmth-Friendliness', 'Listenning-Care', 'Attention-Assistance-Effort', 'Correct contact', 'Quality of the relationship', 'Explanation of work to be done', 'Explanation of work carried out', 'Authorisation before additional work', 'Honesty-Confidence', 'Explanation of invoice', 'Clarity-transmission of information', 'Information regarding the progress of the work', 'Wait for appointment', 'Respect timeframe for work', 'Time taken for work', 'Availability of parts', 'Wait in reception', 'Time dedicated to me', 'Efficiency of the organisation', 'Delivery time', 'Price', 'Value for money', 'Respect of price and promises', 'Refund-Goodwill gesture', 'Part-exchange', 'Do

In [None]:
# Projection layers that receive LoRA adapters
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Prepare the quantized model for training, then wrap it with the LoRA config
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

All that's left to do is set up a number of hyperparameters.

In [None]:
OUTPUT_DIR = "mistral_instruct_generation"
%load_ext tensorboard
%tensorboard --logdir mistral_instruct_generation/runs

In [None]:
# Training arguments
# Hyperparameters should be adjusted to the hardware you are using.
training_arguments = TrainingArguments(
    output_dir= OUTPUT_DIR,
    report_to = "tensorboard",
    num_train_epochs= 1,
    per_device_train_batch_size= 4,
    gradient_accumulation_steps= 2,  # 4 * 2 = 8 samples per optimizer step per device
    optim = "paged_adamw_8bit",
    # NOTE(review): the recorded run ended at global_step=177, so with
    # save_steps=5000 presumably no intermediate checkpoint is written - confirm.
    save_steps= 5000,
    logging_steps= 30,
    learning_rate= 2e-4,
    weight_decay= 0.001,
    fp16= False,
    bf16= False,
    max_grad_norm= 0.3,
    max_steps= -1,  # presumably "no step limit, use num_train_epochs" - verify
    # NOTE(review): warmup_ratio is combined with the "constant" scheduler below;
    # presumably warmup only applies with "constant_with_warmup" - verify.
    warmup_ratio= 0.3,
    group_by_length= True,
    lr_scheduler_type= "constant",
)
# Set up the SFT trainer; prompts are rendered by formatting_prompts_func.
# NOTE(review): `model` was already wrapped by get_peft_model() in an earlier
# cell and peft_config is passed again here - confirm it is not applied twice.
# NOTE(review): eval_dataset is supplied but no evaluation schedule is set in
# the training arguments - confirm evaluation actually runs.
trainer = SFTTrainer(
    model=model,
    formatting_func = formatting_prompts_func,
    train_dataset=dataset['train'],
    eval_dataset = dataset['test'],
    peft_config=peft_config,
    max_seq_length= 2048,
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

Map:   0%|          | 0/1420 [00:00<?, ? examples/s]

Map:   0%|          | 0/516 [00:00<?, ? examples/s]



Fine-tune the Mistral model on the training set:

In [None]:
# Run fine-tuning (the recorded run reports train_runtime of about 20235 s for 177 steps)
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
30,0.5598
60,0.2267


Step,Training Loss
30,0.5598
60,0.2267
90,0.1464
120,0.1771
150,0.184


TrainOutput(global_step=177, training_loss=0.23722602149187508, metrics={'train_runtime': 20235.2156, 'train_samples_per_second': 0.07, 'train_steps_per_second': 0.009, 'total_flos': 4.321259054707507e+16, 'train_loss': 0.23722602149187508, 'epoch': 1.0})

## Evaluation process

In [None]:
# Save into the same directory the trainer uses as output_dir, instead of
# repeating the path literal.
trainer.save_model(OUTPUT_DIR)

In [None]:
# NOTE(review): the generation code further down calls `model.generate`, not
# `merged_model.generate` - confirm which object is meant to be evaluated.
merged_model = model.merge_and_unload()



In [None]:
# Inference prompt prefix: instruction, list of allowed categories and one worked
# example. It is built from `category_list` and `examples` rather than from a
# pasted copy of their contents, so it cannot drift from the data used to build
# the training prompts.
prompt = (
    "[INST]### Instruction:\n Classify the following review in one or more of the following categories. Indicate the polarity: positive, negative or neutral. Give an answer in the same format as in the example. Don't add other comments and don't create new categories which are not in the provided ones.\n  "
    f"{list(category_list['train']['CONCEPTS NIVEAU 2'])}[/INST]\n"
    f"  ### Example:{examples[1]}\n"
)
def generate_prompt(prompt, reviews=None):
    """Build one inference prompt per review.

    Args:
        prompt: prompt prefix (instruction, categories, example).
        reviews: iterable of review texts. Defaults to the ``text`` field of
            every row in the notebook-level ``dataset['test']``.

    Returns:
        A list of prompts, each ending right after the "### Answer:" marker.
    """
    if reviews is None:
        reviews = [task['text'] for task in dataset['test']['tasks']]
    # No "</s>" is appended: an end-of-sequence marker at the end of the prompt
    # tells the model the text is already finished before it has answered.
    return [f"{prompt}  ### Review:{review}\n  ### Answer:\n  " for review in reviews]

def generate_response2(prompt, num_examples=3):
    """Generate a model answer for the first ``num_examples`` prompts.

    Args:
        prompt: list of fully built prompts (despite the singular name).
        num_examples: how many prompts to process, counted from the start of the
            list; ``None`` processes all of them. Defaults to 3.

    Returns:
        A list with one decoded sequence (prompt plus continuation) per prompt.
    """
    prompt_reponses = []
    max_length = 1024
    eos_id = tokenizer.eos_token_id
    # Clamp to the list length so a short list does not raise IndexError.
    n_prompts = len(prompt) if num_examples is None else min(num_examples, len(prompt))
    for i in tqdm(range(n_prompts)):
        encoded_input = tokenizer(prompt[i], max_length=max_length, return_tensors="pt", add_special_tokens=True, padding=True, truncation=True)

        # Remove EOS tokens at the end of the prompt (added by the tokenizer's
        # add_eos_token=True setting and/or a literal "</s>" in the text).
        # Generating after an EOS is what triggers the "right-padding was
        # detected" warning (pad == EOS here).
        input_ids = encoded_input["input_ids"]
        keep = input_ids.shape[1]
        while keep > 1 and input_ids[0, keep - 1].item() == eos_id:
            keep -= 1
        model_inputs = {name: tensor[:, :keep].to('cuda') for name, tensor in encoded_input.items()}

        generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
        decoded_output = tokenizer.batch_decode(generated_ids)
        prompt_reponses.append(decoded_output[0])
    return prompt_reponses

In [None]:
# Generate answers for the test prompts (slow: the recorded run shows about 27 minutes for the first prompt)
data = generate_response2(generate_prompt(prompt))

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 33%|███▎      | 1/3 [26:59<53:58, 1619.22s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [None]:
data[:3]

NameError: name 'data' is not defined

In [None]:
import ast
import json
import re

# Marker that precedes the model's answer in every decoded sequence
ANSWER_MARKER = '### Answer'


def extract_predicted_categories(generated_text):
    """Return the category names found in the answer part of a decoded sequence.

    The answer is expected to contain a Python-literal list of dicts such as
    ``[{'category': ..., 'polarity': ...}]`` after the first "### Answer"
    marker. An empty list is returned when the marker or the list is missing,
    or when the list cannot be parsed.
    """
    answer_index = generated_text.find(ANSWER_MARKER)
    if answer_index == -1:
        return []

    # Skip the marker and the ":" that follows it.
    answer_substr = generated_text[answer_index + len(ANSWER_MARKER) + 1:]
    start = answer_substr.find('[{')
    end = answer_substr.find('}]')
    if start == -1 or end == -1:
        return []

    try:
        # literal_eval only accepts Python literals, so model output is never
        # executed as code (the previous eval() would have run it).
        parsed = ast.literal_eval(answer_substr[start:end + 2])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print(f"Error: {e}")
        return []

    if not isinstance(parsed, list):
        return []
    return [
        entry['category']
        for entry in parsed
        if isinstance(entry, dict) and entry.get('category')
    ]


# One list of predicted categories per generated sequence, in the same order
prediction_list = [extract_predicted_categories(string) for string in data]

print("\n------------------------\n")
print(prediction_list[:3])

In [None]:
# print(dataset['test']['tasks'][0]['text'])
#print(dataset['test']['completions'][0])

In [None]:
# Ground-truth category names for every test review
ground_truth = dataset['test']['completions']
true_label_list = [
    [label['category'] for label in true_labels]
    for true_labels in ground_truth
]
# Preview only; printing the whole list floods the output.
print(true_label_list[:3])

print(len(true_label_list))
print(len(prediction_list))

accuracies = []
total_correct_predictions = 0
total_accuracy = 0  # running sum of the per-review denominators
# zip() stops at the shorter list, so only reviews that have a prediction are scored.
for true_labels, predictions in zip(true_label_list, prediction_list):
    correct_predictions = sum(label in predictions for label in true_labels)

    # Per-review score: matched labels over the larger of the two list sizes,
    # so both missing and extra predictions lower it.
    denominator = max(len(true_labels), len(predictions))
    accuracies.append(correct_predictions / denominator if denominator > 0 else 0)

    total_correct_predictions += correct_predictions
    total_accuracy += denominator

# Overall score = total matches / total denominators. The previous formula
# divided the sum of per-review ratios by a label count, mixing two units.
overall_accuracy = total_correct_predictions / total_accuracy if total_accuracy > 0 else 0

print("Accuracy for each sublist:", accuracies)
print(f"Overall Accuracy: {overall_accuracy * 100:.2f}%")

In [None]:
# 2nd method: totals pooled over all scored reviews
accuracies = []
scored_pairs = list(zip(true_label_list, prediction_list))

# True labels that also appear in the matching prediction list
total_correct_predictions = sum(
    sum(label in predictions for label in true_labels)
    for true_labels, predictions in scored_pairs
)
# Number of predicted labels and of true labels over the same reviews
total_predictions = sum(len(predictions) for _, predictions in scored_pairs)
total_labels = sum(len(true_labels) for true_labels, _ in scored_pairs)

# Divide by whichever total is larger; 0 when there is nothing to score
largest_total = max(total_predictions, total_labels)
overall_accuracy = total_correct_predictions / largest_total if largest_total > 0 else 0

print("Total Correct Predictions:", total_correct_predictions)
print("Total Predictions:", total_predictions)
print("Total True Labels:", total_labels)
print(f"Overall Accuracy: {overall_accuracy * 100:.2f}%")

In [None]:
# Categories for which a confusion matrix is built in the next cell
le = ['Welcome-Kindness-Warmth-Friendliness']

In [None]:
# Create a dictionary to store one 2x2 confusion matrix per category.
# Layout of each matrix (rows = truth, columns = prediction):
#   [0, 0] true positive    [0, 1] false negative
#   [1, 0] false positive   [1, 1] true negative
confusion_matrices = {}

# Iterate through each category
for category in le: # category_list['train']['CONCEPTS NIVEAU 2']:
    category_matrix = np.zeros((2, 2), dtype=int)

    # Each review contributes exactly one count: the category is either present
    # or absent in the true labels and in the predictions. (Iterating over every
    # label/prediction pair, as before, counted a review several times.)
    for true_labels, predictions in zip(true_label_list, prediction_list):
        row = 0 if category in true_labels else 1
        col = 0 if category in predictions else 1
        category_matrix[row, col] += 1

    # Store the confusion matrix for the category in the dictionary
    confusion_matrices[category] = category_matrix

# Display confusion matrices for each category
for category, matrix in confusion_matrices.items():
    print(f"\nConfusion Matrix for Category '{category}':")
    print(matrix)

Build a confusion matrix for each review in the test set:

In [None]:
# One 2x2 matrix per review, plus their running sum in `category_matrix`.
# Layout (rows = truth, columns = prediction), fixed by labels=[True, False]:
#   [0, 0] true positive    [0, 1] false negative
#   [1, 0] false positive   [1, 1] true negative
confusion_matrices = []
category_matrix = np.zeros((2, 2), dtype=int)
known_categories = list(category_list['train']['CONCEPTS NIVEAU 2'])

# Iterate through each pair of true labels and predictions
for true_labels, predictions in zip(true_label_list, prediction_list):
    # Label space: the known categories plus any label seen in this review that
    # is not in that list (e.g. a category invented by the model).
    label_space = list(dict.fromkeys(known_categories + list(true_labels) + list(predictions)))

    # sklearn needs two equally long vectors, so each review becomes a
    # present/absent indicator per category instead of two label lists of
    # possibly different lengths.
    y_true = [category in true_labels for category in label_space]
    y_pred = [category in predictions for category in label_space]
    confusion_matrix_result = confusion_matrix(y_true, y_pred, labels=[True, False])

    # Append the per-review matrix itself (a new array each iteration).
    confusion_matrices.append(confusion_matrix_result)
    category_matrix += confusion_matrix_result

    # Display confusion matrix for the current review
    print("\nConfusion Matrix:")
    print(confusion_matrix_result)

print("\nConfusion matrix summed over all reviews:")
print(category_matrix)