In [1]:
# Upgrade the three runtime dependencies of this notebook in the Colab VM.
# NOTE(review): versions are not pinned, so re-running later may pull releases
# that behave differently from the ones that produced the saved outputs.
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import csv
import os
import re
from pathlib import Path

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

## Read Data

In [3]:
# Colab-only: mount Google Drive at /content/drive so the project folder
# referenced by the path constants below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Project root on the mounted Drive, its data folder, and the input CSV.
BASE_DIR = Path("/content/drive/MyDrive/ITMO/RW'23")
DATA_DIR = BASE_DIR / "data"

FILE_NAME = "data_quality_estim.csv"
FILE = DATA_DIR / FILE_NAME

In [5]:
# Load the labelled word sets (columns used later: "task" and "agg_label")
# and preview the first ten rows.
data = pd.read_csv(FILE)
data.head(10)

Unnamed: 0,task,agg_label
0,able amaze fill glad stop daily fantastic move...,bad
1,able amaze fill stop glad daily fantastic move...,bad
2,able amaze fill stop glad daily fantastic move...,bad
3,able amaze glad fill stop fantastic daily move...,bad
4,absolute church truth doug scripture symbol li...,good
5,absolute church truth mission nasa doug mars e...,bad
6,absolute mission church nasa mars earth truth ...,good
7,absolutely fast white rice service noodle terr...,bad
8,absolutely fast white super service customer r...,good
9,absolutely grain experience soon there's proce...,bad


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3861 entries, 0 to 3860
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   task       3861 non-null   object
 1   agg_label  3861 non-null   object
dtypes: object(2)
memory usage: 60.5+ KB


## Mistral-7B

In [7]:
!nvidia-smi

Sat Jan 13 21:48:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [8]:
# bitsandbytes settings passed to `from_pretrained` as `quantization_config`:
# 4-bit loading, the "nf4" quantization type, and double quantization enabled.
# NOTE(review): `bnb_4bit_compute_dtype` is not set, so the library default is
# used for compute — confirm that default is acceptable for speed on this GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [9]:
# Checkpoint used for every experiment in this notebook.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit loading is requested only through `bnb_config` (which already sets
# load_in_4bit=True). Passing `load_in_4bit=True` as a separate keyword next to
# `quantization_config` is redundant and is rejected with a ValueError by
# recent transformers releases, which matters because the install cell
# upgrades transformers without pinning a version.
# NOTE(review): `trust_remote_code=True` allows code shipped with the checkpoint
# to run locally — confirm it is actually needed for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

## Inference

In [10]:
def generate_answer(prompt, model, device, temp, n_token):
    """Sample one continuation of `prompt` and return the decoded sequences.

    Relies on the notebook-level `tokenizer`. The prompt is tokenized with
    `add_special_tokens=False`, so any special tokens must already be written
    in the prompt text.

    Args:
        prompt: Full prompt text to feed to the model.
        model: Model object exposing `.generate(...)`.
        device: Device the tokenized inputs are moved to (e.g. 'cuda').
        temp: Sampling temperature (sampling is always enabled).
        n_token: Maximum number of newly generated tokens.

    Returns:
        The result of `tokenizer.batch_decode` on the generated ids; element 0
        is the single returned sequence.
    """
    model_input = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to(device)

    sampling_options = dict(
        do_sample=True,
        max_new_tokens=n_token,
        temperature=temp,
        num_return_sequences=1,
        # Reuse EOS as the padding id for open-ended generation.
        pad_token_id=tokenizer.eos_token_id,
    )
    output_ids = model.generate(**model_input, **sampling_options)

    return tokenizer.batch_decode(output_ids)

In [11]:
def extract_substring(input_string):
    """Return the text that follows the first "[/INST]" marker.

    Everything after the marker is returned untouched (including any trailing
    end-of-sequence text). If the marker does not occur, an empty string is
    returned.
    """
    _, marker, tail = input_string.partition("[/INST]")
    # `marker` is empty exactly when "[/INST]" was not found.
    return tail if marker else ""

In [12]:
def few_shot_pipeline(instruction, sample, model, device='cpu', temp=0.1, n_token=100):
    """Classify one sample with a few-shot prompt and return the raw model reply.

    The prompt is the instruction (with its labelled examples), a newline, the
    sample followed by " //", all wrapped in "<s> [INST] ... [/INST]".

    Args:
        instruction: Task description plus few-shot examples.
        sample: Word list to classify.
        model: Model forwarded to `generate_answer`.
        device: Device forwarded to `generate_answer`.
        temp: Sampling temperature.
        n_token: Maximum number of new tokens.

    Returns:
        The text generated after "[/INST]" with all newline characters removed.
    """
    prompt = f"""<s> [INST] {instruction}\n{sample} // [/INST]"""
    decoded = generate_answer(prompt, model, device, temp, n_token)
    # Keep only the completion that follows the instruction block.
    completion = extract_substring(decoded[0])
    return "".join(completion.split("\n"))


#### Type I

In [None]:
# Type I: three labelled examples ("<words> // good|bad") and an unlabelled
# fourth word list, with no task description.
# NOTE(review): the prompt has no leading "<s>" and generate_answer tokenizes
# with add_special_tokens=False — confirm that omitting BOS is intended.
prompt = """[INST] bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // good
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // bad
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // good
april nasa star canada bike score always goal maynard area player hockey wings center white // [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

# answer[0] is the full decoded sequence (prompt followed by the completion).
print(answer[0])


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST] bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // good 
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // bad
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // good
april nasa star canada bike score always goal maynard area player hockey wings center white // [/INST] It seems like you are trying to generate a list of words that are related to certain topics. Here are some suggestions based on the words you provided:

* Bike: bicycle, cycling, bike path, bike helmet, bike repair
* Firearm: gun, pistol, rifle, shotgun, ammunition
* Knife: knife, blade, cutting tool, survival knife, pocket knife
* Criminal: criminal justice, law enforcement, criminal record, criminal behavior


#### Type II

In [None]:
# Type II: same examples as Type I, but labelled "strong connection" /
# "weak connection" and separated by blank lines; still no task description.
prompt = """[INST] bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // strong connection

appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // weak connection

bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // strong connection

april nasa star canada bike score always goal maynard area player hockey wings center white // [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST] bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // strong connection

appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // weak connection

bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // strong connection

april nasa star canada bike score always goal maynard area player hockey wings center white // [/INST] It seems like you are trying to create a sentence that connects various words and phrases. Here's a possible sentence that incorporates some of the words you provided:

"The Bible teaches us that the Lord is our ultimate defense, and we should always strive to be strong in our faith, just like the hockey player who always scores goals for his team."

This sentence connects the concepts of faith, defense, and hockey, and uses the words "Bible," "


#### Type III

In [None]:
# Type III: explicit YES/NO question about a common topic, followed by three
# labelled examples and the word list to classify.
prompt = """[INST] You will receive a list of words, and you should answer "YES" or "NO" to the question: "Is it possible to determine a common topic for the presented word set or at least for the most part of the set?"
bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
apple offer monitor sale price shipping pitt selling gordon banks sell purchase dealer sony excellent // [/INST]"""

# n_token=2 caps the completion at two new tokens so only the label is produced.
answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=2)

print(answer[0])

[INST] You will receive a list of words, and you should answer "YES" or "NO" to the question: "Is it possible to determine a common topic for the presented word set or at least for the most part of the set?"
bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
apple offer monitor sale price shipping pitt selling gordon banks sell purchase dealer sony excellent // [/INST] YES


#### Type IV

In [None]:
# Type IV: YES/NO on whether the words have a strong connection.
# Unstable prompt: with a 100-token budget the saved output below contains
# both "YES" and "NO".
prompt = """[INST] You will receive a list of words, and you should answer "YES" if words have a strong connection between them or “NO” in the opposite case.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])

[INST] You will receive a list of words, and you should answer "YES" if words have a strong connection between them or “NO” in the opposite case.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST] YES
NO</s>


In [None]:
# Type IV again, identical prompt, but the completion is capped at two new
# tokens so only the first label can be emitted.
prompt = """[INST] You will receive a list of words, and you should answer "YES" if words have a strong connection between them or “NO” in the opposite case.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=2)

print(answer[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST] You will receive a list of words, and you should answer "YES" if words have a strong connection between them or “NO” in the opposite case.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST] YES


#### Type V

In [None]:
# Type V: longer instruction that states the question and explains what YES
# and NO mean, followed by the same three labelled examples. 100-token budget.
prompt = """[INST] Select one of two categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])

[INST] Select one of two categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST] YES. The provided set of tokens is related to the topic of crime and violence, as many of the words are associated with weapons, criminal activities, and death

In [None]:
# Type V, identical prompt, completion capped at two new tokens.
prompt = """[INST] Select one of two categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=2)

print(answer[0])

[INST] Select one of two categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST] YES


In [None]:
# Type V with an added sentence asking for YES or NO only, so the 100-token
# budget can be kept without relying on truncation.
prompt = """[INST] Select one of two categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected. Please, make sure you to only return YES or NO and nothing more.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])

[INST] Select one of two categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected. Please, make sure you to only return YES or NO and nothing more.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST] YES</s>


#### Type VI

In [None]:
# Type VI: the Type V instruction prefixed with "You are a human.".
# NOTE(review): the text says "one of four categories" but only YES and NO are
# described — confirm whether "four" is intended.
prompt = """[INST] You are a human. Select one of four categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])

[INST] You are a human. Select one of four categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST] NO. It is not possible to determine a common topic for the presented word set. The words seem to be unconnected and do not have a strong conn

In [None]:
# Type VI, identical prompt, completion capped at two new tokens.
prompt = """[INST] You are a human. Select one of four categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=2)

print(answer[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST] You are a human. Select one of four categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”. 
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST] NO.


In [None]:
# Type VI with the added "only return YES or NO" sentence and a 100-token budget.
prompt = """[INST] You are a human. Select one of four categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected. Please, make sure you to only return YES or NO and nothing more.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])

[INST] You are a human. Select one of four categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected. Please, make sure you to only return YES or NO and nothing more.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES
anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas // [/INST] NO</s>


#### Type VII

In [None]:
# Type VII: zero-shot variant. A single user message (no labelled examples) is
# rendered into the model's prompt format by the tokenizer's chat template
# instead of writing the [INST] markers by hand.
prompt_temp = [
    {
        "role": "user",
        "content": "You are a helpful, respectful and honest assistant for text classfification. You will receive a TEXT, and you should answer 'YES' if words in the TEXT have a connection between them or 'NO' in the opposite case. Please, make sure you to only return YES or NO and nothing more. TEXT: anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas",
    }
 ]
# tokenize=False returns the rendered prompt as text so it can be passed to
# generate_answer like the hand-written prompts.
prompt = tokenizer.apply_chat_template(prompt_temp, tokenize=False, add_generation_prompt=True)

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)
print(answer[0])

<s> [INST] You are a helpful, respectful and honest assistant for text classfification. You will receive a TEXT, and you should answer 'YES' if words in the TEXT have a connection between them or 'NO' in the opposite case. Please, make sure you to only return YES or NO and nothing more. TEXT: anything juice guess wrong throw picture immediately actual concentrate squeeze strip think photo funny pas [/INST] NO</s>


## Type III Entire Dataset

In [13]:
# Type III instruction and its three labelled examples. few_shot_pipeline
# appends the sample and adds the "<s> [INST] ... [/INST]" wrapper itself, so
# the markers are not part of this string.
INSTRUCTION = """You will receive a list of words, and you should answer "YES" or "NO" to the question: "Is it possible to determine a common topic for the presented word set or at least for the most part of the set?"
bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES"""

# Smoke test on the first dataset row before running the full loop.
SAMPLE = data.iloc[0]["task"]

res = few_shot_pipeline(INSTRUCTION, SAMPLE, model, device="cuda")

res

' YES</s>'

In [14]:
# Shuffle the whole dataset (frac=1) with a fixed seed so the row order is
# reproducible across runs.
test_data = data.sample(frac=1, random_state=42)

In [15]:
test_data

Unnamed: 0,task,agg_label
2424,pack variety com www http href stash pie grey ...,bad
3290,south island ship secret military rutgers brit...,bad
2575,piece beef jerky everything season break pill ...,bad
2042,mix cream ice pour strawberry stir pancake waf...,good
3717,window display screen color server application...,good
...,...,...
1130,engine model rocket band lock foot exhaust pre...,good
1294,font print character page printer vice cell bi...,good
860,delicious small dry hard smaller surprised liv...,bad
3507,thought end else stick cover boil air china bu...,bad


In [None]:
# Load previously saved Type III results for inspection. The file has no
# header row, so column names are supplied here; fields are ';'-separated.
# NOTE(review): this cell only works if the results file already exists — it
# is written by the full-dataset loop further down, so on a fresh run this
# cell must be executed after that loop.
RES_FILE_NAME = "data_qality_estim_mistral_7b_4bit_type3.csv"
RES_FILE = DATA_DIR.joinpath(RES_FILE_NAME)

res_df = pd.read_csv(RES_FILE, names=["text", "true", "model_res"], sep=";")

In [None]:
res_df

Unnamed: 0,text,true,model_res
0,pack variety com www http href stash pie grey ...,bad,NO</s>
1,south island ship secret military rutgers brit...,bad,NO</s>
2,piece beef jerky everything season break pill ...,bad,NO</s>
3,mix cream ice pour strawberry stir pancake waf...,good,NO</s>
4,window display screen color server application...,good,NO</s>
5,magnesium oil olive energy diamond bowel blue ...,good,NO</s>
6,encryption chip clipper security escrow secure...,good,NO</s>
7,dry liver freeze gravy movie walk cube lick st...,bad,NO</s>
8,bone bubble rawh oat gum chewer miso slim chin...,bad,NO</s>
9,year month friend husband gift weight ago lose...,bad,NO</s>


In [None]:
test_data.iloc[1795]

task         launch entry satellite rule remark build europ...
agg_label                                                  bad
Name: 1872, dtype: object

In [18]:
# Run the Type III prompt over the whole shuffled dataset and store one
# ';'-separated row per sample: text;true label;raw model reply.
RES_FILE_NAME = "data_qality_estim_mistral_7b_4bit_type3.csv"
RES_FILE = DATA_DIR.joinpath(RES_FILE_NAME)
# Remove old file so a re-run starts from an empty result set.
if os.path.exists(RES_FILE):
    os.remove(RES_FILE)

with open(RES_FILE, 'a', newline='', encoding='utf-8') as file:
    # csv.writer quotes any field containing ';' or '"', so a free-text model
    # reply cannot shift columns; plain rows are written exactly as before.
    writer = csv.writer(file, delimiter=';', lineterminator='\n')
    for SAMPLE, res_true in tqdm(
        zip(test_data["task"], test_data["agg_label"]),
        total=len(test_data),
    ):
        res_model = few_shot_pipeline(
            INSTRUCTION,
            SAMPLE,
            model,
            device="cuda",
            temp=0.1,
            n_token=200
        )
        writer.writerow([SAMPLE, res_true, res_model])
        # Flush after every row so results survive an interrupted session.
        file.flush()


100%|██████████| 3861/3861 [3:12:10<00:00,  2.99s/it]


## Type V Entire Dataset

In [None]:
# Type V instruction (with the "only return YES or NO" sentence) and its three
# labelled examples; few_shot_pipeline adds the sample and the [INST] wrapper.
INSTRUCTION = """Select one of two categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected. Please, make sure you to only return YES or NO and nothing more.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES"""

# Smoke test on a single dataset row before running the full loop.
SAMPLE = data.iloc[80]["task"]

res = few_shot_pipeline(INSTRUCTION, SAMPLE, model, device="cuda", n_token=100)

res

' YES</s>'

In [None]:
# Same seeded shuffle as in the Type III run, so rows are processed in the
# same order.
test_data = data.sample(frac=1, random_state=42)

In [None]:
test_data

Unnamed: 0,task,agg_label
2424,pack variety com www http href stash pie grey ...,bad
3290,south island ship secret military rutgers brit...,bad
2575,piece beef jerky everything season break pill ...,bad
2042,mix cream ice pour strawberry stir pancake waf...,good
3717,window display screen color server application...,good
...,...,...
1130,engine model rocket band lock foot exhaust pre...,good
1294,font print character page printer vice cell bi...,good
860,delicious small dry hard smaller surprised liv...,bad
3507,thought end else stick cover boil air china bu...,bad


In [None]:
# Run the Type V prompt over the whole shuffled dataset and store one
# ';'-separated row per sample: text;true label;raw model reply.
RES_FILE_NAME = "data_qality_estim_mistral_7b_4bit_type5.csv"
RES_FILE = DATA_DIR.joinpath(RES_FILE_NAME)
# NOTE(review): unlike the other runs, an existing result file is NOT removed
# here; rows are appended, so re-running this cell duplicates results. Delete
# the file manually first if a clean run is wanted.
with open(RES_FILE, 'a', newline='', encoding='utf-8') as file:
    # csv.writer quotes any field containing ';' or '"', so a free-text model
    # reply cannot shift columns; plain rows are written exactly as before.
    writer = csv.writer(file, delimiter=';', lineterminator='\n')
    for SAMPLE, res_true in tqdm(
        zip(test_data["task"], test_data["agg_label"]),
        total=len(test_data),
    ):
        res_model = few_shot_pipeline(
            INSTRUCTION,
            SAMPLE,
            model,
            device="cuda",
            temp=0.1,
            n_token=100
        )
        writer.writerow([SAMPLE, res_true, res_model])
        # Flush after every row so results survive an interrupted session.
        file.flush()


100%|██████████| 3861/3861 [2:04:51<00:00,  1.94s/it]


## Type VI Entire Dataset

In [None]:
# Type VI instruction: the Type V text prefixed with "You are a human.", plus
# the three labelled examples; few_shot_pipeline adds the sample and wrapper.
INSTRUCTION = """You are a human. Select one of two categories-characteristics of the provided set of tokens by answering the question ”Is it possible to determine a common topic for the presented word set or at least for the most part of the set?”.
You could answer one of the following: YES - if you agree with the statement and words have a strong connection between them, NO - when words seem to be unconnected. Please, make sure you to only return YES or NO and nothing more.

bike firearm knife criminal carry death weapon scsi murder smith handgun moral child defense tape // YES
appreciate advance anybody info thank experience hello greatly suggestion wonder edge able email recently thanx // NO
bible christ matthew heaven scripture lord father word passage teach mary spirit doctrine verse faith // YES"""

# Smoke test on the first dataset row before running the full loop.
SAMPLE = data.iloc[0]["task"]

res = few_shot_pipeline(INSTRUCTION, SAMPLE, model, device="cuda", n_token=100)

res

' YES</s>'

In [None]:
# Same seeded shuffle as in the earlier runs, so rows are processed in the
# same order.
test_data = data.sample(frac=1, random_state=42)

In [None]:
# Run the Type VI prompt over the whole shuffled dataset and store one
# ';'-separated row per sample: text;true label;raw model reply.
RES_FILE_NAME = "data_qality_estim_mistral_7b_4bit_type6.csv"
RES_FILE = DATA_DIR.joinpath(RES_FILE_NAME)
# Remove old file so a re-run starts from an empty result set.
if os.path.exists(RES_FILE):
    os.remove(RES_FILE)
with open(RES_FILE, 'a', newline='', encoding='utf-8') as file:
    # csv.writer quotes any field containing ';' or '"', so a free-text model
    # reply cannot shift columns; plain rows are written exactly as before.
    writer = csv.writer(file, delimiter=';', lineterminator='\n')
    for SAMPLE, res_true in tqdm(
        zip(test_data["task"], test_data["agg_label"]),
        total=len(test_data),
    ):
        res_model = few_shot_pipeline(
            INSTRUCTION,
            SAMPLE,
            model,
            device="cuda",
            temp=0.1,
            n_token=100
        )
        writer.writerow([SAMPLE, res_true, res_model])
        # Flush after every row so results survive an interrupted session.
        file.flush()


 94%|█████████▍| 3635/3861 [1:58:47<07:31,  2.00s/it]