In [1]:
!pip3 install transformers>=4.32.0 optimum>=1.12.0
!pip3 install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  # Use cu117 if on CUDA 11.7

Looking in indexes: https://pypi.org/simple, https://huggingface.github.io/autogptq-index/whl/cu118/
Collecting auto-gptq
  Downloading https://huggingface.github.io/autogptq-index/whl/cu118/auto-gptq/auto_gptq-0.7.1%2Bcu118-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate>=0.26.0 (from auto-gptq)
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting gekko (from auto-gptq)
  Downloading gekko-1.1.1-py3-none-any.whl (13.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
Collecting peft>=0.5.0 (from auto-gptq)
  Downloading peft-

In [4]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import os
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer

## Read Data

In [6]:
# Google Colaboratory only: mount Google Drive and derive the data paths from it.

from google.colab import drive
drive.mount('/content/drive')

# Project folder, relative to the root of "MyDrive"
your_path = "ITMO/RW'23"

BASE_DIR = Path("/content/drive/MyDrive") / your_path
DATA_DIR = BASE_DIR / "data"  # Upload the dataset file named in FILE_NAME to this folder

FILE_NAME = "words_consistensy_wordset_dataset_agg.csv"

# Full path of the CSV that the loading cell reads
FILE = DATA_DIR / FILE_NAME

Mounted at /content/drive


In [None]:
# Local run: the notebook name is resolved against the current working directory,
# and BASE_DIR is taken two levels above the resolved notebook path.
DATA_DIR_NAME = 'data/datasets'  # Local directory
FILE_NAME_QUAL = 'data_quality_estim.csv'

BASE_DIR = Path('Coherence_of_Words_2classes_Llama_13B_Chat_4bit.ipynb').resolve().parent.parent
DATA_DIR = BASE_DIR / DATA_DIR_NAME

# Full path of the CSV that the loading cell reads
FILE = DATA_DIR / FILE_NAME_QUAL

In [7]:
# Load the dataset from FILE and preview the first ten rows
data = pd.read_csv(filepath_or_buffer=FILE)
data.head(n=10)

Unnamed: 0,INPUT:wordset,INPUT:dataset_name,OUTPUT:quality,homogenity,assessment
0,able amaze fill glad stop daily fantastic move...,amazon_food,"['rather_bad', 'bad', 'rather_bad']",0.7,-1
1,able amaze fill stop glad daily fantastic move...,amazon_food,"['bad', 'rather_good', 'rather_bad', 'rather_b...",0.5,-1
2,able amaze fill stop glad daily fantastic move...,amazon_food,"['bad', 'rather_bad', 'rather_bad']",0.7,-1
3,able amaze glad fill stop fantastic daily move...,amazon_food,"['bad', 'bad', 'bad']",1.0,-2
4,absolute church truth doug scripture symbol li...,20newsgroups,"['rather_good', 'bad', 'bad', 'rather_good', '...",0.6,1
5,absolute church truth mission nasa doug mars e...,20newsgroups,"['rather_bad', 'rather_bad', 'bad', 'rather_ba...",0.8,-1
6,absolute mission church nasa mars earth truth ...,20newsgroups,"['good', 'good', 'bad', 'good', 'good']",0.8,2
7,absolutely fast white rice service noodle terr...,amazon_food,"['rather_bad', 'bad', 'rather_bad', 'good', 'b...",0.5,-1
8,absolutely fast white super service customer r...,amazon_food,"['bad', 'good', 'rather_good', 'rather_good', ...",0.4,2
9,absolutely grain experience soon there's proce...,amazon_food,"['bad', 'bad', 'rather_bad']",0.7,-2


In [27]:
# Short column names used by the inference helpers: "task" is the word set,
# "agg_label" is the aggregated assessment.
columns_map = dict([
    ("INPUT:wordset", "task"),
    ("assessment", "agg_label"),
])

data = data.rename(columns=columns_map)

data.columns

Index(['task', 'INPUT:dataset_name', 'OUTPUT:quality', 'homogenity',
       'agg_label'],
      dtype='object')

In [8]:
# Summarise the columns: non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3861 entries, 0 to 3860
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   INPUT:wordset       3861 non-null   object 
 1   INPUT:dataset_name  3861 non-null   object 
 2   OUTPUT:quality      3861 non-null   object 
 3   homogenity          3861 non-null   float64
 4   assessment          3861 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 150.9+ KB


In [9]:
# Show the attached GPU, driver / CUDA version and current memory usage
!nvidia-smi

Wed Apr 17 21:05:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [10]:
# Check if cuda is available and stop here if it is not:
# the quantized Llama model used in this notebook needs a GPU.
assert torch.cuda.is_available(), "Cuda is not available, you cannot use the quantized version of the Llama model without a GPU!"

In [18]:
# Device name handed to the inference helpers
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f"All calculations will be performed on {device}")

All calculations will be performed on cuda


In [11]:
# Hugging Face Hub repository of the GPTQ checkpoint used in this notebook
model_name_or_path = "TheBloke/Llama-2-13B-chat-GPTQ"

# `revision` selects the repository branch; change it to load a different branch
# (for example: revision="main").
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    revision="main",
    device_map="auto",
    trust_remote_code=False,
)

# The tokenizer comes from the same repository as the model
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/837 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/7.26G [00:00<?, ?B/s]

The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

## Pipeline of the Inference

In [44]:
def generate_answer(prompt, model, device, temp, n_token):
    """Sample one completion for ``prompt`` and return the decoded sequences.

    Relies on the module-level ``tokenizer`` for both encoding and decoding.

    Args:
        prompt: Complete prompt text; it is tokenized without adding special tokens.
        model: Object exposing ``generate`` (and ``to``, used when ``device`` is ``'cpu'``).
        device: Device the encoded inputs are moved to.
        temp: Sampling temperature forwarded to ``model.generate``.
        n_token: Maximum number of new tokens to generate.

    Returns:
        The result of ``tokenizer.batch_decode`` applied to the generated ids.
    """
    torch.cuda.empty_cache()

    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device)

    # The model is only moved explicitly for CPU runs
    if device == 'cpu':
        model.to(device)

    sampling_options = {
        "do_sample": True,
        "max_new_tokens": n_token,
        "temperature": temp,
        "num_return_sequences": 1,
        "pad_token_id": tokenizer.eos_token_id,
    }
    output_ids = model.generate(**inputs, **sampling_options)

    return tokenizer.batch_decode(output_ids)

In [45]:
def extract_substring(input_string):
    """Return everything after the first ``[/INST]`` marker in ``input_string``.

    An empty string is returned when the marker is absent.
    """
    _before, marker, after = input_string.partition("[/INST]")
    return after if marker else ""

In [46]:
def get_answer(
        task,
        model,
        device='cpu',
        temp=0.1,
        n_token=500,
        system_prompt='',
        instruction='',
        sample='',
        print_system_prompt=False,
        save_system_prompt=False
        ):
    """Assemble an ``[INST]``/``<<SYS>>`` prompt, generate an answer and return it on one line.

    Args:
        task: Kind of task; only ``"classification"`` is supported.
        model: Model passed through to ``generate_answer``.
        device: Device passed through to ``generate_answer``.
        temp: Sampling temperature passed through to ``generate_answer``.
        n_token: Maximum number of new tokens passed through to ``generate_answer``.
        system_prompt: Text placed between ``<<SYS>>`` and ``<</SYS>>``.
        instruction: Text placed after the system block.
        sample: Text placed on the line after the instruction.
        print_system_prompt: When true, print the assembled prompt before generating.
        save_system_prompt: When true, return the whole decoded sequence; otherwise
            return only the part that follows ``[/INST]``.

    Returns:
        The selected text with every newline replaced by a space.

    Raises:
        ValueError: If ``task`` is not ``"classification"``.
    """
    # Construct prompt
    if task != "classification":
        # Without this guard `prompt` stays unbound and the call fails later
        # with an UnboundLocalError that hides the real cause.
        raise ValueError(f"Unsupported task {task!r}; only 'classification' is supported")
    prompt = f"""<s> [INST] <<SYS>> {system_prompt} <</SYS>> {instruction}\n{sample} [/INST]"""
    if print_system_prompt:
        print(prompt)
    # Generate answer
    answer = generate_answer(prompt, model, device, temp, n_token)
    # Extract the result
    if save_system_prompt:
        result = answer[0]
    else:
        result = extract_substring(answer[0])
    return result.replace("\n", " ")

In [51]:
def get_prompt(prompt_type, sample):
    """Return the prompt parts for the requested prompt variant.

    Args:
        prompt_type: Identifier of the prompt variant; only ``"P1"`` is defined.
        sample: Word list to classify, as a single string.

    Returns:
        A ``(system_prompt, instruction, sample)`` tuple, where the returned
        ``sample`` is the input wrapped as ``"Words: <sample>\\nClass:"``.

    Raises:
        ValueError: If ``prompt_type`` is not a known variant.
    """
    if prompt_type != "P1":
        # Without this guard the return statement fails with an UnboundLocalError
        # because `system_prompt` and `instruction` were never assigned.
        raise ValueError(f"Unknown prompt_type {prompt_type!r}; expected 'P1'")

    system_prompt = "You are a useful assistant who evaluates the coherence of words."
    instruction = ("You will receive a list of words, please determine which class the given "
             "list of words belongs to by answering the question: 'Is it possible to determine "
             "a common topic for the presented word set or at least for the most part of the set?'.\n"
             "Classification rules:\nyes - if words have a strong connection between them\n"
             "rather yes - if some words are too common or out of topic\n"
             "rather no - if the amount of irrelevant words is high to determine a topic or there is a mixture of topics\n"
             "no - when words seem to be unconnected\n"
             "neutral - if it is hard for you to answer on the question.\nPrint only class without explanation and additional information.\n")

    sample = 'Words: ' + sample + "\nClass:"

    return system_prompt, instruction, sample

In [52]:
def get_results(data, model, task, prompt_type, filename, num_iter, device='cpu', temp=0.1, n_token=500,
                shuffle_data=False, remove_old_file=True, print_info=False):
    """Run the model over the first ``num_iter`` rows of ``data`` and append the answers to a file.

    One tab-separated line ``<task>\\t<agg_label>\\t<model answer>`` is appended per row to
    ``BASE_DIR/data/results/<filename>`` (``BASE_DIR`` is the module-level base path).

    Args:
        data: DataFrame with ``"task"`` and ``"agg_label"`` columns.
        model: Model passed through to ``get_answer``.
        task: Kind of task; only ``"classification"`` is supported.
        prompt_type: Prompt variant passed to ``get_prompt``.
        filename: Name of the results file inside the results directory.
        num_iter: Number of rows to process, taken positionally from the start of
            ``data``; it must not exceed the number of rows.
        device: Device passed through to ``get_answer``.
        temp: Sampling temperature passed through to ``get_answer``.
        n_token: Maximum number of new tokens passed through to ``get_answer``.
        shuffle_data: When true, shuffle the rows (fixed ``random_state=42``) first.
        remove_old_file: When true, delete an existing results file before starting.
        print_info: When true, print the prompt and the model answer for every row.

    Raises:
        ValueError: If ``task`` is not ``"classification"``.
    """
    # Validate before touching the file system: an unsupported task used to delete
    # the previous results and then loop without writing anything.
    if task != "classification":
        raise ValueError(f"Unsupported task {task!r}; only 'classification' is supported")

    # Create the results directory if needed, otherwise the first append would fail
    data_dir = BASE_DIR.joinpath('data/results')
    data_dir.mkdir(parents=True, exist_ok=True)
    res_file = data_dir.joinpath(filename)

    # Remove old file
    if remove_old_file and os.path.exists(res_file):
        os.remove(res_file)

    # Shuffle data
    if shuffle_data:
        data = data.sample(frac=1, random_state=42)

    for i in tqdm(range(num_iter)):
        row = data.iloc[i]
        words = row["task"]
        system_prompt, instruction, sample = get_prompt(prompt_type, words)

        res_true = row["agg_label"]
        res_model = get_answer(
            task=task,
            model=model,
            device=device,
            temp=temp,
            n_token=n_token,
            system_prompt=system_prompt,
            instruction=instruction,
            sample=sample,
            print_system_prompt=print_info
        )
        if print_info:
            print(res_model)

        # Append after every row so finished rows are kept if the run is interrupted;
        # the explicit encoding keeps the file independent of the platform default.
        with open(res_file, 'a', encoding='utf-8') as file:
            file.write(f"{words}\t{res_true}\t{res_model}\n")

## Prompt Engineering

### Type I. Chain-of-Thought

In [28]:
# Prompt variant I: ask for a numeric rating from -2 to 2, printed as "Answer: [rating]"
system_prompt = "You are a helpful assistant evaluating the top words of a topic model output for a given topic.\n"
instruction = ("You will recieve a list of words. Please rate the following words on a scale from -2 to 2 by answering "
              "on the question: 'Is it possible to determine a common topic for the presented word set or at least "
              "for the most part of the set?'. Use the criteria: '-2' when words seem to be unconnected, "
              "'-1' if the amount of irrelevant words is high to determine a topic or there is a mixture "
              "of topics, '1' if some words are too common or out of topic, '2' if words have a strong "
              "connection between them, '0' if it's hard for you to answer on the question. "
              "Print the answer in the form: Answer: [rating]. Print only answer without explanation.\n")

# Try the prompt on one word set from the dataset (positional row 1)
sample = data.iloc[1]["task"]

# n_token=50 caps the number of generated tokens; print_system_prompt=True echoes the assembled prompt
get_answer(
    task="classification",
    model=model,
    device=device,
    system_prompt=system_prompt,
    instruction=instruction,
    sample=sample,
    n_token=50,
    print_system_prompt=True
)

<s> [INST] <<SYS>> You are a helpful assistant evaluating the top words of a topic model output for a given topic.
 <</SYS>> You will recieve a list of words. Please rate the following words on a scale from -2 to 2 by answering on the question: 'Is it possible to determine a common topic for the presented word set or at least for the most part of the set?'. Use the criteria: '-2' when words seem to be unconnected, '-1' if the amount of irrelevant words is high to determine a topic or there is a mixture of topics, '1' if some words are too common or out of topic, '2' if words have a strong connection between them, '0' if it's hard for you to answer on the question. Print the answer in the form: Answer: [rating]. Print only answer without explanation.

able amaze fill stop glad daily fantastic move plenty prime empty job subtle strongly litter [/INST]


'  Sure! Here are my ratings for the given list of words:  1. Amaze: 2 2. Fill: 1 3. Stop: 1 4. Glad: 1 5. Daily:'

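Conclusion: The model understands the task of describing the data, as well as the role required of it, so this concept can be used for further research. Note, however, that in the output above it rated individual words and did not return a single answer in the requested `Answer: [rating]` form.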

### Type II

In [55]:
# Prompt variant II: ask for one of five textual classes (yes / rather yes / rather no / no / neutral)
system_prompt = "You are a useful assistant who evaluates the coherence of words.\n"
instruction = ("You will receive a list of words, please determine which class the given "
                 "list of words belongs to by answering the question: 'Is it possible to determine "
                 "a common topic for the presented word set or at least for the most part of the set?'.\n"
                 "Classification rules:\nyes - if words have a strong connection between them\n"
                 "rather yes - if some words are too common or out of topic\n"
                 "rather no - if the amount of irrelevant words is high to determine a topic or there is a mixture of topics\n"
                 "no - when words seem to be unconnected\n"
                 "neutral - if it is hard for you to answer on the question.\nPrint only class without explanation and additional information.\n")

# Wrap one word set (positional row 4) as "Words: ...\nClass:" so the model is cued to reply with the class
sample = 'Words: ' + data.iloc[4]["task"] + "\nClass:"

# n_token=30 caps the number of generated tokens; print_system_prompt=True echoes the assembled prompt
get_answer(
    task="classification",
    model=model,
    device=device,
    system_prompt=system_prompt,
    instruction=instruction,
    sample=sample,
    n_token=30,
    print_system_prompt=True
)

<s> [INST] <<SYS>> You are a useful assistant who evaluates the coherence of words.
 <</SYS>> You will receive a list of words, please determine which class the given list of words belongs to by answering the question: 'Is it possible to determine a common topic for the presented word set or at least for the most part of the set?'.
Classification rules:
yes - if words have a strong connection between them
rather yes - if some words are too common or out of topic
rather no - if the amount of irrelevant words is high to determine a topic or there is a mixture of topics
no - when words seem to be unconnected
neutral - if it is hard for you to answer on the question.
Print only class without explanation and additional information.

Words: absolute church truth doug scripture symbol libxmu font bible free undefined type motif error application
Class: [/INST]


"  Sure! Here's the classification of the given list of words:  Class: rather no</s>"

## Inference on the Entire Dataset

### Prompt II (`prompt_type='P1'`)

In [None]:
# Full-dataset run with prompt type P1; one row of `data` is processed per iteration
RES_FILE_NAME = "text_coherence_Llama2_13B_4_bit_5classes_P1.csv"

get_results(
    data=data,
    model=model,
    device=device,
    task='classification',
    prompt_type='P1',
    filename=RES_FILE_NAME,
    num_iter=len(data),
    temp=0.1,
    n_token=30,
    shuffle_data=False,
    remove_old_file=True,
    print_info=False,
)

 64%|██████▍   | 2490/3861 [1:11:32<44:30,  1.95s/it]