In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U accelerate
!pip install -q -U transformers

In [34]:
import torch
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
import re
import os
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Set seaborn's global plot style to "whitegrid".
sns.set_style("whitegrid")

In [None]:
# Stop early if no CUDA device is available; per the message below,
# the quantized Mistral model cannot be used without a GPU.
assert torch.cuda.is_available(), "Cuda is not available, you cannot use the quantized version of the Mistral model without a GPU!"

In [None]:
# Select the device name passed to the inference helpers:
# 'cuda' when a GPU is available, 'cpu' otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"All calculations will be performed on {device}")

All calculations will be performed on cuda


## Load Data

In [None]:
# Run this cell if you are using Google Colaboratory

# from google.colab import drive
# drive.mount('/content/drive')

# BASE_DIR = Path(f"/content/drive/MyDrive/{your_path}")
# DATA_DIR = BASE_DIR.joinpath("data")  # Please upload the data_quality_estim_4classes.csv dataset to the data folder

# FILE_NAME = "data_quality_estim_4classes.csv"

# FILE_QUAL = DATA_DIR.joinpath(FILE_NAME)

Mounted at /content/drive


In [None]:
# The notebook file name is relative, so it is resolved against the current
# working directory; BASE_DIR is the parent of the directory that holds it.
BASE_DIR = Path('Coherence_of_Words_4classes_Mistral_7B_Instruct_v20_4_bit.ipynb').resolve().parent.parent

# Names of the dataset directory (relative to BASE_DIR) and of the dataset file.
DATA_DIR_NAME = 'data/datasets'  # Local directory
FILE_NAME_QUAL = 'data_quality_estim_4classes.csv'

# Full paths built from the pieces above.
DATA_DIR = BASE_DIR / DATA_DIR_NAME
FILE_QUAL = DATA_DIR / FILE_NAME_QUAL

In [None]:
# Load the dataset (columns used later: "task" = word list, "agg_label" = label)
# and preview the first 10 rows.
data_quality = pd.read_csv(FILE_QUAL)
data_quality.head(10)

Unnamed: 0,task,agg_label
0,able amaze fill glad stop daily fantastic move...,rather_bad
1,able amaze fill stop glad daily fantastic move...,rather_bad
2,able amaze fill stop glad daily fantastic move...,rather_bad
3,able amaze glad fill stop fantastic daily move...,bad
4,absolute church truth doug scripture symbol li...,rather_good
5,absolute church truth mission nasa doug mars e...,rather_bad
6,absolute mission church nasa mars earth truth ...,good
7,absolutely fast white rice service noodle terr...,rather_bad
8,absolutely fast white super service customer r...,good
9,absolutely grain experience soon there's proce...,bad


In [None]:
# Report how many rows the dataset contains.
print("The number of samples:", len(data_quality))

The number of samples: 3861


## Load Model

In [None]:
# Unquantized alternatives, kept for reference:
# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

# Unquantized Instruct-v0.2 runs out of memory during inference on a Tesla T4
# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

# Instruct-v0.2 with 4-bit quantization
# Quantization config
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # NF4: information-theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # nested quantization: the quantization constants are quantized as well
    bnb_4bit_compute_dtype = torch.bfloat16 # dtype used for computation
)

# Load the model with the 4-bit quantization config defined above
model_4bit = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=quantization_config_4bit,
)

# Tokenizer of the same checkpoint; the inference helpers use it as a module-level global
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

## Pipeline of the Inference

In [None]:
def generate_answer(prompt, model, device, temp, n_token):
    """Generate a completion for ``prompt`` and return the decoded sequences.

    Uses the module-level ``tokenizer`` for both encoding and decoding.

    Args:
        prompt: Fully formatted prompt text. It is tokenized with
            ``add_special_tokens=False``, so any special tokens (e.g. ``<s>``)
            must already be present in the string.
        model: Model object exposing ``generate`` (and ``to`` when
            ``device == 'cpu'``).
        device: Device name the encoded inputs are moved to ('cuda' or 'cpu').
        temp: Sampling temperature. A positive value enables sampling at that
            temperature; ``None`` or a value <= 0 selects greedy decoding
            instead of failing inside ``generate``.
        n_token: Maximum number of new tokens to generate.

    Returns:
        The list of strings produced by ``tokenizer.batch_decode`` for the
        generated ids (special tokens are not stripped).
    """
    # Release cached, unused GPU memory before every generation call.
    torch.cuda.empty_cache()

    model_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device)
    if device == 'cpu':
        model.to(device)

    generation_kwargs = {
        "max_new_tokens": n_token,
        "num_return_sequences": 1,
        # No dedicated pad token is configured here, so EOS is used for padding.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temp is not None and temp > 0:
        generation_kwargs.update(do_sample=True, temperature=temp)
    else:
        # Sampling requires a strictly positive temperature; decode greedily.
        generation_kwargs["do_sample"] = False

    generated_ids = model.generate(**model_input, **generation_kwargs)
    return tokenizer.batch_decode(generated_ids)

In [None]:
def extract_substring(input_string):
    """Return everything that follows the first ``[/INST]`` marker.

    An empty string is returned when the marker does not occur.
    """
    _, marker, remainder = input_string.partition("[/INST]")
    return remainder if marker else ""

In [None]:
def get_answer(
        task,
        model,
        device='cpu',
        temp=0.1,
        n_token=500,
        instruction='',
        sample='',
        print_system_prompt=False,
        save_system_prompt=False
        ):
    """Build an instruction prompt, run the model and return its answer.

    Args:
        task: Task name; only "classification" is implemented.
        model: Model forwarded to ``generate_answer``.
        device: Device name forwarded to ``generate_answer``.
        temp: Sampling temperature forwarded to ``generate_answer``.
        n_token: Maximum number of new tokens to generate.
        instruction: Instruction text placed at the start of the [INST] block.
        sample: Sample text placed on the line after the instruction.
        print_system_prompt: If True, print the full prompt before generating.
        save_system_prompt: If True, return the whole decoded sequence
            (prompt included) instead of only the text after ``[/INST]``.

    Returns:
        The answer text with newlines replaced by spaces.

    Raises:
        ValueError: If ``task`` is not "classification".
    """
    # Construct prompt
    if task != "classification":
        # Previously an unknown task left `prompt` unbound and failed later
        # with an UnboundLocalError; fail early with a clear message instead.
        raise ValueError(f"Unsupported task: {task!r}; only 'classification' is implemented")
    prompt = f"""<s> [INST] {instruction}\n{sample} [/INST]"""
    if print_system_prompt:
        print(prompt)
    # Generate answer
    answer = generate_answer(prompt, model, device, temp, n_token)
    # Extract the result
    if save_system_prompt:
        result = answer[0]
    else:
        result = extract_substring(answer[0])
    return result.replace("\n", " ")

In [44]:
def get_prompt(prompt_type, sample):
    """Return the instruction text and the formatted sample for a prompt variant.

    Args:
        prompt_type: One of "APM4" (numeric 0-3 rating, answer requested as
            ``Answer: [rating]``), "P1" or "P2" (verbal classes
            yes / rather yes / rather no / no).
        sample: The word list to be evaluated, as a single string.

    Returns:
        A ``(system_prompt, sample)`` tuple, where ``sample`` is prefixed with
        "Words: " and, for "P1"/"P2", ends with a "Class:" cue.

    Raises:
        ValueError: If ``prompt_type`` is not one of the supported variants.
    """
    # NOTE: the prompt texts are reproduced verbatim (including the spelling
    # "recieve"); editing them would change the input the model sees.
    if prompt_type == "APM4":
        system_prompt = ("You are a helpful assistant evaluating the top words of a topic model output for a given topic. "
                         "You will recieve a list of words. Please rate the following words on a scale from 0 to 3 by answering "
                         "on the question: 'Is it possible to determine a common topic for the presented word set or at least "
                         "for the most part of the set?'. Use the criteria: '0' when words seem to be unconnected, "
                         "'1' if the amount of irrelevant words is high to determine a topic or there is a mixture "
                         "of topics, '2' if some words are too common or out of topic, '3' if words have a strong "
                         "connection between them. Print the answer in the form: Answer: [rating]. Print only answer without explanation.\n")
        sample = 'Words: ' + sample + "\n"
    elif prompt_type == "P1":
        system_prompt = ("You are a useful assistant who evaluates the coherence of words. "
                         "You will receive a list of words, please determine which class the given "
                         "list of words belongs to by answering the question: 'Is it possible to determine "
                         "a common topic for the presented word set or at least for the most part of the set?'.\n"
                         "Classification rules:\nyes - if words have a strong connection between them\n"
                         "rather yes - if some words are too common or out of topic\n"
                         "rather no - if the amount of irrelevant words is high to determine a topic or there is a mixture of topics\n"
                         "no - when words seem to be unconnected.\nPrint only class without explanation and additional information.\n")
        sample = 'Words: ' + sample + "\nClass:"
    elif prompt_type == "P2":
        system_prompt = ("You are a useful assistant who evaluates the coherence of words. "
                         "You will receive a list of words, please determine which class the given "
                         "list of words belongs to. Print only class without explanation and additional information.\n"
                         "Classification rules:\nyes - if words have a strong connection between them\n"
                         "rather yes - if some words are too common or out of topic\n"
                         "rather no - if the amount of irrelevant words is high to determine a topic or there is a mixture of topics\n"
                         "no - when words seem to be unconnected.\n")
        sample = 'Words: ' + sample + "\nClass:"
    else:
        # Previously an unknown type fell through and failed with an
        # UnboundLocalError on `system_prompt`.
        raise ValueError(
            f"Unknown prompt_type: {prompt_type!r}; expected one of 'APM4', 'P1', 'P2'"
        )

    return system_prompt, sample

In [None]:
def get_results(data, model, task, prompt_type, filename, num_iter, device='cpu', temp=0.1, n_token=500,
                shuffle_data=False, remove_old_file=True, print_info=False):
    """Run the model over the first ``num_iter`` rows and append results to a file.

    Each processed row adds one tab-separated line to
    ``BASE_DIR/data/results/<filename>``: the original word list, the true
    label and the model answer. The file is reopened in append mode for every
    row, so results written so far survive an interrupted run.

    Args:
        data: DataFrame with a "task" column (word list) and an "agg_label"
            column (true label).
        model: Model forwarded to ``get_answer``.
        task: Task name; only "classification" is implemented.
        prompt_type: Prompt variant name forwarded to ``get_prompt``.
        filename: Name of the results file inside ``data/results``.
        num_iter: Number of rows to process; values larger than ``len(data)``
            are clamped to the number of available rows.
        device: Device name forwarded to ``get_answer``.
        temp: Sampling temperature forwarded to ``get_answer``.
        n_token: Maximum number of new tokens per answer.
        shuffle_data: If True, shuffle the rows (fixed ``random_state=42``)
            before iterating.
        remove_old_file: If True, delete an existing results file first;
            otherwise new lines are appended to it.
        print_info: If True, print each prompt and each model answer.

    Raises:
        ValueError: If ``task`` is not "classification".
    """
    if task != "classification":
        # Previously any other task looped over the data without producing
        # output; report the problem before touching any file.
        raise ValueError(f"Unsupported task: {task!r}; only 'classification' is implemented")

    # Create the results directory if needed and build the file path
    data_dir = BASE_DIR.joinpath('data/results')
    data_dir.mkdir(parents=True, exist_ok=True)
    res_file = data_dir.joinpath(filename)

    # Remove old file
    if remove_old_file and os.path.exists(res_file):
        os.remove(res_file)

    # Shuffle data
    if shuffle_data:
        data = data.sample(frac=1, random_state=42)

    # Never index past the end of the data
    n_rows = min(num_iter, len(data))

    for i in tqdm(range(n_rows)):
        row = data.iloc[i]
        words = row["task"]
        system_prompt, sample = get_prompt(prompt_type, words)

        res_true = row["agg_label"]
        res_model = get_answer(
            task=task,
            model=model,
            device=device,
            temp=temp,
            n_token=n_token,
            instruction=system_prompt,
            sample=sample,
            print_system_prompt=print_info
        )
        if print_info:
            print(res_model)

        # Explicit encoding keeps the output independent of the platform default
        with open(res_file, 'a', encoding='utf-8') as file:
            file.write(f"{words}\t{res_true}\t{res_model}\n")

## Task 1. Evaluating the Coherence of Provided Words (Classification Task)

## Prompt Engineering

### Article prompt (4 classes)

Source: https://aclanthology.org/2023.emnlp-main.581.pdf

In [None]:
# Baseline: the article's prompt with a 0-3 rating scale, tried on the first
# dataset row. `temp` is not passed, so get_answer's default (0.1) applies.
system_prompt = ("You are a helpful assistant evaluating the top words of a topic model output for a given topic. "
                 "Please rate how related the following words are to each other on a scale from 0 to 3 "
                 "('0' = bad, '1' = rather bad, '2' = rather good, '3' = good). Reply with a single number, indicating the overall appropriateness of the topic.")

sample = data_quality.iloc[0]["task"]

# print_system_prompt=True echoes the full prompt before the model answer.
get_answer(
    task="classification",
    model=model_4bit,
    device=device,
    instruction=system_prompt,
    sample=sample,
    n_token=500,
    print_system_prompt=True
)

<s> [INST] You are a helpful assistant evaluating the top words of a topic model output for a given topic. Please rate how related the following words are to each other on a scale from 0 to 3 ('0' = bad, '1' = rather bad, '2' = rather good, '3' = good). Reply with a single number, indicating the overall appropriateness of the topic.
able amaze fill glad stop daily fantastic move prime plenty empty subtle job strongly litter [/INST]


' I would rate the relatedness of these words to each other as a 1. The words "daily," "job," and "strongly" suggest a work-related or routine topic. "Amaze," "fantastic," "prime," and "move" suggest excitement or change. "Ability," "glad," "stop," "litter," and "subtle" don\'t fit clearly into either category. Overall, the topic seems rather disconnected, as there doesn\'t appear to be a clear theme or cohesive meaning.</s>'

**Conclusion:** The model does not understand what is required of it

### Article prompt modification 1

In [None]:
# Modification 1: ask an explicit question about a common topic.
# NOTE(review): the question's closing quote and a trailing space are missing,
# so the rendered prompt reads "...wordset?Rate the answer...". The text is
# left unchanged here because the recorded output was produced with it.
system_prompt = ("You are a helpful assistant evaluating the top words of a topic model output for a given topic. "
                 "Please answer on a question: 'Is it possible to determine a common topic for the presented wordset?"
                 "Rate the answer on a scale from 0 to 3 ('0' = bad,  '1' = rather bad, '2' = rather good, '3' = good).\n"
                 "Reply with only a single number.")

sample = data_quality.iloc[0]["task"]

# print_system_prompt=True echoes the full prompt before the model answer.
get_answer(
    task="classification",
    model=model_4bit,
    device=device,
    instruction=system_prompt,
    sample=sample,
    n_token=500,
    print_system_prompt=True
)

<s> [INST] You are a helpful assistant evaluating the top words of a topic model output for a given topic. Please answer on a question: 'Is it possible to determine a common topic for the presented wordset?Rate the answer on a scale from 0 to 3 ('0' = bad,  '1' = rather bad, '2' = rather good, '3' = good).
Reply with only a single number.
able amaze fill glad stop daily fantastic move prime plenty empty subtle job strongly litter [/INST]


' 2. The wordset contains a mix of words that could potentially relate to topics such as ability, emotions, and work. However, the connection between some words (such as "litter" and "prime") is not immediately clear, which lowers the rating slightly. Overall, the wordset shows some signs of a coherent topic.</s>'

**Conclusion:** The model understands what is required of it, but in addition to the answer it also produces an explanation

### Article prompt modification 2

In [None]:
# Modification 2: rate the consistency of the words and explicitly ask for a
# single number without a description.
system_prompt = ("You are a helpful assistant evaluating the top words of a topic model output for a given topic. "
                 "You will get the WORDS, please rate how strongly are these words consistent and is it possible to formulate "
                 "a common topic for them ('0' = bad consistency,  '1' = rather bad consistency, '2' = rather good consistency, '3' = good consistency).\n"
                 "Reply with only a single number without the description of your answer")

# The sample is prefixed with "WORDS: " to match the wording of the instruction.
sample = 'WORDS: ' + data_quality.iloc[0]["task"]

get_answer(
    task="classification",
    model=model_4bit,
    device=device,
    instruction=system_prompt,
    sample=sample,
    n_token=500,
    print_system_prompt=True
)


<s> [INST] You are a helpful assistant evaluating the top words of a topic model output for a given topic. You will get the WORDS, please rate how strongly are these words consistent and is it possible to formulate a common topic for them ('0' = bad consistency,  '1' = rather bad consistency, '2' = rather good consistency, '3' = good consistency).
Reply with only a single number without the description of your answer
WORDS: able amaze fill glad stop daily fantastic move prime plenty empty subtle job strongly litter [/INST]


' 1. I would rate the consistency of the given words as rather bad. The words "able," "amazed," "daily," "fill," "glad," "job," "strongly," and "litter" do not have a clear connection to each other and do not formulate a common topic. The words "amazed," "glad," and "strongly" suggest positive emotions, while "able," "fill," "job," and "litter" have different meanings and connotations. The words "daily," "move," and "prime" do not fit well with the other words and could be considered outliers. "Fantastic" and "subtle" could potentially be related to the topic, but they do not provide enough clarity to define a clear topic.</s>'

**Conclusion:** The model provides an answer in the appropriate form, but also forms an explanation, which significantly slows down the inference on the entire dataset

### Article prompt modification 3

In [None]:
# Modification 3: spell out a criterion for every rating (0-3) and request the
# answer in the fixed form "Answer: [rating]".
system_prompt = ("You are a helpful assistant evaluating the top words of a topic model output for a given topic. "
                 "You will recieve a list of words. Please rate the following words on a scale from 0 to 3 by answering "
                 "on the question: 'Is it possible to determine a common topic for the presented word set or at least "
                 "for the most part of the set?'. Use the criteria: '0' when words seem to be unconnected, "
                 "'1' if the amount of irrelevant words is high to determine a topic or there is a mixture "
                 "of topics, '2' if some words are too common or out of topic, '3' if words have a strong "
                 "connection between them. Print the answer in the form: Answer: [rating]. Print only answer and nothing more.\n")

# The sample is prefixed with "Words: " and terminated with a newline.
sample = 'Words: ' + data_quality.iloc[0]["task"] + "\n"

get_answer(
    task="classification",
    model=model_4bit,
    device=device,
    instruction=system_prompt,
    sample=sample,
    n_token=500,
    print_system_prompt=True
)


<s> [INST] You are a helpful assistant evaluating the top words of a topic model output for a given topic. You will recieve a list of words. Please rate the following words on a scale from 0 to 3 by answering on the question: 'Is it possible to determine a common topic for the presented word set or at least for the most part of the set?'. Use the criteria: '0' when words seem to be unconnected, '1' if the amount of irrelevant words is high to determine a topic or there is a mixture of topics, '2' if some words are too common or out of topic, '3' if words have a strong connection between them. Print the answer in the form: Answer: [rating]. Print only answer and nothing more.

Words: able amaze fill glad stop daily fantastic move prime plenty empty subtle job strongly litter
 [/INST]


' Answer: 1.  Explanation: The word set "able amaze fill glad stop daily fantastic move prime plenty empty subtle job strongly litter" contains a mixture of topics. Some words like "daily," "job," and "able" suggest a topic related to work or routine, while other words like "amazed," "fantastic," and "amaze" suggest a topic related to emotions or excitement. Additionally, words like "stop," "litter," and "empty" seem out of place and could be irrelevant to either topic. Overall, the set contains a significant amount of irrelevant words and a mixture of topics, making it difficult to determine a clear topic for the majority of the words. Therefore, I would rate this set a 1 on the given scale.</s>'

**Conclusion:** The model provides an answer in the appropriate form, but also forms an explanation, which significantly slows down the inference on the entire dataset

### Article prompt modification 4

In [None]:
# Modification 4: same criteria-based prompt, with the final sentence changed
# to "Print only answer without explanation." This is the text that
# get_prompt returns for prompt_type "APM4".
system_prompt = ("You are a helpful assistant evaluating the top words of a topic model output for a given topic. "
                 "You will recieve a list of words. Please rate the following words on a scale from 0 to 3 by answering "
                 "on the question: 'Is it possible to determine a common topic for the presented word set or at least "
                 "for the most part of the set?'. Use the criteria: '0' when words seem to be unconnected, "
                 "'1' if the amount of irrelevant words is high to determine a topic or there is a mixture "
                 "of topics, '2' if some words are too common or out of topic, '3' if words have a strong "
                 "connection between them. Print the answer in the form: Answer: [rating]. Print only answer without explanation.\n")

# The sample is prefixed with "Words: " and terminated with a newline.
sample = 'Words: ' + data_quality.iloc[0]["task"] + "\n"

get_answer(
    task="classification",
    model=model_4bit,
    device=device,
    instruction=system_prompt,
    sample=sample,
    n_token=500,
    print_system_prompt=True
)

<s> [INST] You are a helpful assistant evaluating the top words of a topic model output for a given topic. You will recieve a list of words. Please rate the following words on a scale from 0 to 3 by answering on the question: 'Is it possible to determine a common topic for the presented word set or at least for the most part of the set?'. Use the criteria: '0' when words seem to be unconnected, '1' if the amount of irrelevant words is high to determine a topic or there is a mixture of topics, '2' if some words are too common or out of topic, '3' if words have a strong connection between them. Print the answer in the form: Answer: [rating]. Print only answer without explanation.

Words: able amaze fill glad stop daily fantastic move prime plenty empty subtle job strongly litter
 [/INST]


' Answer: 1.</s>'

**Conclusion:** The model gives the answer in the required form, and the inference is fast. This prompt can be used in further research

### Prompt 1. Chain-of-Thought

In [None]:
# Verbal-class prompt (yes / rather yes / rather no / no) that keeps the
# guiding question; this is the text get_prompt returns for prompt_type "P1".
system_prompt = ("You are a useful assistant who evaluates the coherence of words. "
                 "You will receive a list of words, please determine which class the given "
                 "list of words belongs to by answering the question: 'Is it possible to determine "
                 "a common topic for the presented word set or at least for the most part of the set?'.\n"
                 "Classification rules:\nyes - if words have a strong connection between them\n"
                 "rather yes - if some words are too common or out of topic\n"
                 "rather no - if the amount of irrelevant words is high to determine a topic or there is a mixture of topics\n"
                 "no - when words seem to be unconnected.\nPrint only class without explanation and additional information.\n")

# Tried on the row at position 10; the trailing "Class:" cues the model to answer with a class.
sample = 'Words: ' + data_quality.iloc[10]["task"] + "\nClass:"

# n_token=10 caps the generation at 10 new tokens.
get_answer(
    task="classification",
    model=model_4bit,
    device=device,
    instruction=system_prompt,
    sample=sample,
    n_token=10,
    print_system_prompt=True
)

<s> [INST] You are a useful assistant who evaluates the coherence of words. You will receive a list of words, please determine which class the given list of words belongs to by answering the question: 'Is it possible to determine a common topic for the presented word set or at least for the most part of the set?'.
Classification rules:
yes - if words have a strong connection between them
rather yes - if some words are too common or out of topic
rather no - if the amount of irrelevant words is high to determine a topic or there is a mixture of topics
no - when words seem to be unconnected.
Print only class without explanation and additional information.

Words: absolutely grain experience soon there's process true anyway knew terrible sort satisfied average replace wild
Class: [/INST]


' rather yes. Some words like "grain,"'

**Conclusion:** The system prompt is formulated differently. Also, in this case, the class is generated verbally.

### Prompt 2. Chain-of-Thought

In [None]:
# Verbal-class prompt without the guiding question: only the class list and
# rules remain. This is the text get_prompt returns for prompt_type "P2".
system_prompt = ("You are a useful assistant who evaluates the coherence of words. "
                 "You will receive a list of words, please determine which class the given "
                 "list of words belongs to. Print only class without explanation and additional information.\n"
                 "Classification rules:\nyes - if words have a strong connection between them\n"
                 "rather yes - if some words are too common or out of topic\n"
                 "rather no - if the amount of irrelevant words is high to determine a topic or there is a mixture of topics\n"
                 "no - when words seem to be unconnected.\n")

# Tried on the row at position 10; the trailing "Class:" cues the model to answer with a class.
sample = 'Words: ' + data_quality.iloc[10]["task"] + "\nClass:"

# n_token=10 caps the generation at 10 new tokens.
get_answer(
    task="classification",
    model=model_4bit,
    device=device,
    instruction=system_prompt,
    sample=sample,
    n_token=10,
    print_system_prompt=True
)

<s> [INST] You are a useful assistant who evaluates the coherence of words. You will receive a list of words, please determine which class the given list of words belongs to. Print only class without explanation and additional information.
Classification rules:
yes - if words have a strong connection between them
rather yes - if some words are too common or out of topic
rather no - if the amount of irrelevant words is high to determine a topic or there is a mixture of topics
no - when words seem to be unconnected.

Words: absolutely grain experience soon there's process true anyway knew terrible sort satisfied average replace wild
Class: [/INST]


' rather no.</s>'

**Conclusion:** Compared to the previous prompt, the question that the model should answer was removed, leaving only classes and an explanation for them. It is also clear that the model responds differently to the same sample

## Inference on the Entire Dataset

### Article prompt modification 4

In [None]:
# Results file for the "APM4" prompt; get_results writes it under data/results.
RES_FILE_NAME = "text_coherence_mistral_7b_4_bit_4classes_APM4.csv"

# Run the "APM4" prompt over every row of the dataset, in the original order.
# remove_old_file=True deletes any previous results file before writing.
get_results(
    data=data_quality,
    model=model_4bit,
    task='classification',
    prompt_type='APM4',
    filename=RES_FILE_NAME,
    num_iter=data_quality.shape[0],
    device=device,
    temp=0.1,
    n_token=500,
    shuffle_data=False,
    remove_old_file=True,
    print_info=False
)

100%|██████████| 71/71 [12:35<00:00, 10.64s/it]


### Prompt 1

In [None]:
# Results file for the "P1" prompt; get_results writes it under data/results.
RES_FILE_NAME = "text_coherence_mistral_7b_4_bit_4classes_P1.csv"

# Run the "P1" prompt over every row of the dataset, in the original order.
get_results(
    data=data_quality,
    model=model_4bit,
    task='classification',
    prompt_type='P1',
    filename=RES_FILE_NAME,
    num_iter=data_quality.shape[0],
    device=device,
    temp=0.1,
    n_token=10,
    shuffle_data=False,
    # Start from a clean file: get_results always iterates from the first row,
    # so appending to an existing file would duplicate every row on a re-run.
    remove_old_file=True,
    print_info=False
)

100%|██████████| 341/341 [12:54<00:00,  2.27s/it]


### Prompt 2

In [None]:
# Results file for the "P2" prompt; get_results writes it under data/results.
RES_FILE_NAME = "text_coherence_mistral_7b_4_bit_4classes_P2.csv"

# Run the "P2" prompt over every row of the dataset, in the original order.
# n_token=10 caps each answer at 10 new tokens; remove_old_file=True deletes
# any previous results file before writing.
get_results(
    data=data_quality,
    model=model_4bit,
    task='classification',
    prompt_type='P2',
    filename=RES_FILE_NAME,
    num_iter=data_quality.shape[0],
    device=device,
    temp=0.1,
    n_token=10,
    shuffle_data=False,
    remove_old_file=True,
    print_info=False
)

100%|██████████| 3861/3861 [2:05:47<00:00,  1.95s/it]
