In [1]:
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U bitsandbytes

In [2]:
from pathlib import Path
import pandas as pd

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
import re
import os
from tqdm import tqdm

## Read Data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
BASE_DIR = Path("/content/drive/MyDrive/ITMO/RW'23")
DATA_DIR = BASE_DIR.joinpath("data")

FILE_NAME = "data_irrelative_topics.csv"

FILE = DATA_DIR.joinpath(FILE_NAME)

In [5]:
data = pd.read_csv(FILE)
data.head(10)

Unnamed: 0.1,Unnamed: 0,wordset,all_topics,unsuitable_topics,suitable_topics
0,0,islamic muslim ohio kent sandvik cheer islam a...,"['islamic', 'muslim', 'ohio', 'kent', 'sandvik...",[],"['islam', 'tube', 'sandvik', 'ohio', 'upenn', ..."
1,1,chip blend roast com favorite www http href da...,"['chip', 'blend', 'roast', 'com', 'favorite', ...","['k-cups', 'blend', 'chip']","['href', 'pod', 'com', 'favorite', 'roast', 'd..."
2,2,window display color screen manager server app...,"['window', 'display', 'color', 'screen', 'mana...","['client', 'colors', 'colormap', 'manager']","['size', 'application', 'memory', 'screen', 'm..."
3,3,nasa mission launch surface shuttle radar dete...,"['nasa', 'mission', 'launch', 'surface', 'shut...","['april', 'three']","['orbiter', 'probe', 'shuttle', 'field', 'nasa..."
4,4,church bible christ catholic faith matthew lor...,"['church', 'bible', 'christ', 'catholic', 'fai...","['matthew', 'word', 'holy']","['father', 'catholic', 'christ', 'scripture', ..."
5,5,bike firearm knife criminal carry death weapon...,"['bike', 'firearm', 'knife', 'criminal', 'carr...","['scsi', 'bike', 'smith', 'tape', 'child', 'mo...","['criminal', 'firearm', 'carry', 'handgun', 'k..."
6,6,company available care awesome search sent lic...,"['company', 'available', 'care', 'awesome', 's...","['awesome', 'care']","['licorice', 'call', 'company', 'business', 's..."
7,7,firearm bill weapon crime vote handgun public ...,"['firearm', 'bill', 'weapon', 'crime', 'vote',...","['vote', 'washington', 'news', 'kent']","['laws', 'criminal', 'federal', 'firearm', 'bi..."
8,8,clinton mike batf class bill within proper wac...,"['clinton', 'mike', 'batf', 'class', 'bill', '...",[],"['unknown', 'clinton', 'proper', 'bill', 'batf..."
9,9,almond body smooth blue magnesium cause diamon...,"['almond', 'body', 'smooth', 'blue', 'magnesiu...",[],"['blood', 'magnesium', 'rda', 'diamond', 'smoo..."


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2363 entries, 0 to 2362
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         2363 non-null   int64 
 1   wordset            2363 non-null   object
 2   all_topics         2363 non-null   object
 3   unsuitable_topics  2363 non-null   object
 4   suitable_topics    2363 non-null   object
dtypes: int64(1), object(4)
memory usage: 92.4+ KB


##Mistral-7B

In [7]:
!nvidia-smi

Fri Jan  5 21:29:44 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [9]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

##Inference

In [10]:
def generate_answer(prompt, model, device, temp, n_token):
    encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    model_input = encoded
    model_input = model_input.to(device)
    generated_ids = model.generate(
        **model_input, do_sample=True,
        max_new_tokens=n_token,
        temperature=temp,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
        )
    decoded = tokenizer.batch_decode(generated_ids)

    return decoded

In [11]:
def extract_substring(input_string):
    # pattern = r'\[/INST\](.*?)</s>'
    # match = re.search(pattern, input_string)
    # if match:
    #     return match.group(1)
    # else:
    #     return None

    index = input_string.find("[/INST]")
    if index != -1:
        return input_string[index + len("[/INST]"):]
    else:
        return ""

In [22]:
def few_shot_pipeline(instruction, sample, model, device='cpu', temp=0.1, n_token=100):
    # Construct prompt
    prompt = f"""[INST] {instruction}\n
    TEXT: {sample[0]}
    TOPICS: {sample[1]}
    ANSWER:[/INST]"""
    # print(prompt)
    # Generate answer
    answer = generate_answer(prompt, model, device, temp, n_token)
    # Extract the result
    # print(answer[0])
    result = extract_substring(answer[0])
    return result


####Type I

In [None]:
prompt = """[INST] Text: islamic muslim ohio kent sandvik cheer islam apple newton upenn magnus tube activity rice private
Topics: islamic, muslim, ohio, kent, sandvik, cheer, islam, apple, newton, upenn, magnus, tube, activity, rice, private
Answer: All words are related to the topic

Text: chip blend roast com favorite www http href dark keurig k-cups brew bold rich pod
Topics: chip, blend, roast, com, favorite, www, http, href, dark, keurig, k-cups, brew, bold, rich, pod
Answer: k-cups, blend, chip

Text: window display color screen manager server application visual mode memory size client colors colormap default
Topics: window, display, color, screen, manager, server, application, visual, mode, memory, size, client, colors, colormap, default
Answer: client, colors, colormap, manager

Text: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
Topics: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
Answer: [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])


[INST] Text: islamic muslim ohio kent sandvik cheer islam apple newton upenn magnus tube activity rice private
Topics: islamic, muslim, ohio, kent, sandvik, cheer, islam, apple, newton, upenn, magnus, tube, activity, rice, private
Answer: All words are related to the topic

Text: chip blend roast com favorite www http href dark keurig k-cups brew bold rich pod
Topics: chip, blend, roast, com, favorite, www, http, href, dark, keurig, k-cups, brew, bold, rich, pod
Answer: k-cups, blend, chip

Text: window display color screen manager server application visual mode memory size client colors colormap default
Topics: window, display, color, screen, manager, server, application, visual, mode, memory, size, client, colors, colormap, default
Answer: client, colors, colormap, manager

Text: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
Topics: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, prob

####Type II

In [16]:
prompt = """[INST] Mark the words that you think are out of the topic. If you think that all the words describe this set, print the message: “All words are related to the topic”

Text: islamic muslim ohio kent sandvik cheer islam apple newton upenn magnus tube activity rice private
Topics: islamic, muslim, ohio, kent, sandvik, cheer, islam, apple, newton, upenn, magnus, tube, activity, rice, private
Answer: All words are related to the topic

Text: chip blend roast com favorite www http href dark keurig k-cups brew bold rich pod
Topics: chip, blend, roast, com, favorite, www, http, href, dark, keurig, k-cups, brew, bold, rich, pod
Answer: k-cups, blend, chip

Text: window display color screen manager server application visual mode memory size client colors colormap default
Topics: window, display, color, screen, manager, server, application, visual, mode, memory, size, client, colors, colormap, default
Answer: client, colors, colormap, manager

Text: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
Topics: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
Answer: [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=200)

print(answer[0])

[INST] Mark the words that you think are out of the topic. If you think that all the words describe this set, print the message: “All words are related to the topic”

Text: islamic muslim ohio kent sandvik cheer islam apple newton upenn magnus tube activity rice private
Topics: islamic, muslim, ohio, kent, sandvik, cheer, islam, apple, newton, upenn, magnus, tube, activity, rice, private
Answer: All words are related to the topic

Text: chip blend roast com favorite www http href dark keurig k-cups brew bold rich pod
Topics: chip, blend, roast, com, favorite, www, http, href, dark, keurig, k-cups, brew, bold, rich, pod
Answer: k-cups, blend, chip

Text: window display color screen manager server application visual mode memory size client colors colormap default
Topics: window, display, color, screen, manager, server, application, visual, mode, memory, size, client, colors, colormap, default
Answer: client, colors, colormap, manager

Text: nasa mission launch surface shuttle radar detec

In [17]:
prompt = """[INST] Mark the words that you think are out of the topic. If you think that all the words describe this set, print the message: “All words are related to the topic”. Please, make sure you to only return list of topics and nothing more.

Text: islamic muslim ohio kent sandvik cheer islam apple newton upenn magnus tube activity rice private
Topics: islamic, muslim, ohio, kent, sandvik, cheer, islam, apple, newton, upenn, magnus, tube, activity, rice, private
Answer: All words are related to the topic

Text: chip blend roast com favorite www http href dark keurig k-cups brew bold rich pod
Topics: chip, blend, roast, com, favorite, www, http, href, dark, keurig, k-cups, brew, bold, rich, pod
Answer: k-cups, blend, chip

Text: window display color screen manager server application visual mode memory size client colors colormap default
Topics: window, display, color, screen, manager, server, application, visual, mode, memory, size, client, colors, colormap, default
Answer: client, colors, colormap, manager

Text: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
Topics: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
Answer: [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])

[INST] Mark the words that you think are out of the topic. If you think that all the words describe this set, print the message: “All words are related to the topic”. Please, make sure you to only return list of topics and nothing more.

Text: islamic muslim ohio kent sandvik cheer islam apple newton upenn magnus tube activity rice private
Topics: islamic, muslim, ohio, kent, sandvik, cheer, islam, apple, newton, upenn, magnus, tube, activity, rice, private
Answer: All words are related to the topic

Text: chip blend roast com favorite www http href dark keurig k-cups brew bold rich pod
Topics: chip, blend, roast, com, favorite, www, http, href, dark, keurig, k-cups, brew, bold, rich, pod
Answer: k-cups, blend, chip

Text: window display color screen manager server application visual mode memory size client colors colormap default
Topics: window, display, color, screen, manager, server, application, visual, mode, memory, size, client, colors, colormap, default
Answer: client, colors, c

In [39]:
prompt = """[INST] Mark the words that you think are out of the topic. If you think that all the words describe this set, print the message: “All words are related to the topic”

Text: islamic muslim ohio kent sandvik cheer islam apple newton upenn magnus tube activity rice private
Topics: islamic, muslim, ohio, kent, sandvik, cheer, islam, apple, newton, upenn, magnus, tube, activity, rice, private
Answer:[/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.2, n_token=100)

print(answer[0])

[INST] Mark the words that you think are out of the topic. If you think that all the words describe this set, print the message: “All words are related to the topic”

Text: islamic muslim ohio kent sandvik cheer islam apple newton upenn magnus tube activity rice private
Topics: islamic, muslim, ohio, kent, sandvik, cheer, islam, apple, newton, upenn, magnus, tube, activity, rice, private
Answer:[/INST] "islamic", "muslim", "ohio", "kent", "sandvik", "cheer", "islam", "apple", "newton", "upenn", "magnus", "tube", "activity", "rice", "private"

All words are related to the topic.</s>


In [45]:
prompt = """[INST] Mark the words that you think are out of the topic. If you think that all the words describe this set, print the message: “All words are related to the topic”. Please, make sure you to only return the list of unsuitable topics and nothing more.
Text: islamic muslim ohio kent sandvik cheer islam apple newton upenn magnus tube activity rice private
Topics: islamic, muslim, ohio, kent, sandvik, cheer, islam, apple, newton, upenn, magnus, tube, activity, rice, private
Answer:[/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.3, n_token=100)

print(answer[0])

[INST] Mark the words that you think are out of the topic. If you think that all the words describe this set, print the message: “All words are related to the topic”. Please, make sure you to only return the list of unsuitable topics and nothing more.
Text: islamic muslim ohio kent sandvik cheer islam apple newton upenn magnus tube activity rice private
Topics: islamic, muslim, ohio, kent, sandvik, cheer, islam, apple, newton, upenn, magnus, tube, activity, rice, private
Answer:[/INST] The words "islamic", "muslim", "islam", "apple", "newton", "upenn", "magnus", "tube", "activity", and "rice" are all related to the topic. The words "kent", "sandvik", "cheer", and "private" are not related to the topic.</s>


####Type III

In [50]:
prompt = """[INST] You will receive two lists: TEXT and TOPICS. You should mark all the words from the TOPICS list that are out the topic that describe the TEXT. If you think that all the words describe this TEXT, print the message: “All the words relate to the topic.”

TEXT: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
TOPICS: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
ANSWER:[/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.2, n_token=100)

print(answer[0])

[INST] You will receive two lists: TEXT and TOPICS. You should mark all the words from the TOPICS list that are out the topic that describe the TEXT. If you think that all the words describe this TEXT, print the message: “All the words relate to the topic.”

TEXT: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
TOPICS: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
ANSWER:[/INST] The words from the TOPICS list that do not relate to the TEXT are: "three" and "Pasadena".</s>


In [53]:
# +
prompt = """[INST] You will receive two lists: TEXT and TOPICS. You should mark all the words from the TOPICS list that are out the topic that describe the TEXT. If you think that all the words describe this TEXT, print the message: “All the words relate to the topic". Please, make sure you to only return the list of unsuitable topics and nothing more.
TEXT: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
TOPICS: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
ANSWER:[/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.2, n_token=100)

print(answer[0])

[INST] You will receive two lists: TEXT and TOPICS. You should mark all the words from the TOPICS list that are out the topic that describe the TEXT. If you think that all the words describe this TEXT, print the message: “All the words relate to the topic". Please, make sure you to only return the list of unsuitable topics and nothing more.
TEXT: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
TOPICS: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
ANSWER:[/INST] The words from the TOPICS list that are not related to the TEXT are: "three", "pasadena".</s>


####Type IV

In [103]:
prompt = """[INST] You are a helpful, respectful and honest assistant for labeling topics. You will receive two lists: TEXT and TOPICS. You should find all the words from the list of TOPICS that do not describe TEXT. If you think that all the words describe this TEXT, print the message: “All the words relate to the topic”. Please, make sure you to only return the list of unsuitable topics and nothing more.

TEXT: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
TOPICS: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
ANSWER:[/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.2, n_token=100)

print(answer[0])

[INST] You are a helpful, respectful and honest assistant for labeling topics. You will receive two lists: TEXT and TOPICS. You should find all the words from the list of TOPICS that do not describe TEXT. If you think that all the words describe this TEXT, print the message: “All the words relate to the topic”. Please, make sure you to only return the list of unsuitable topics and nothing more.

TEXT: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
TOPICS: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
ANSWER:[/INST] The words from the list of TOPICS that do not describe TEXT are: "three", "pasadena".</s>


In [108]:
prompt = """[INST] You are a helpful, respectful and honest assistant for labeling topics. You will receive two lists: TEXT and TOPICS. You should find all the words from the list of TOPICS that do not describe TEXT. If you think that all the words describe this TEXT, print the message: “All the words relate to the topic.”

TEXT: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
TOPICS: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
ANSWER:[/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.2, n_token=100)

print(answer[0])

[INST] You are a helpful, respectful and honest assistant for labeling topics. You will receive two lists: TEXT and TOPICS. You should find all the words from the list of TOPICS that do not describe TEXT. If you think that all the words describe this TEXT, print the message: “All the words relate to the topic.”

TEXT: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
TOPICS: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
ANSWER:[/INST] "All the words relate to the topic."</s>


####Type V

In [62]:
prompt = """[INST] You are a helpful, respectful and honest assistant for labeling topics. You will receive two lists: TEXT and TOPICS. You should find all the words from the list of TOPICS that describe TEXT. If you think that all the words describe this TEXT, print the message: “All the words relate to the topic.”

TEXT: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
TOPICS: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
ANSWER: [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.2, n_token=100)

print(answer[0])

[INST] You are a helpful, respectful and honest assistant for labeling topics. You will receive two lists: TEXT and TOPICS. You should find all the words from the list of TOPICS that describe TEXT. If you think that all the words describe this TEXT, print the message: “All the words relate to the topic.”

TEXT: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
TOPICS: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
ANSWER: [/INST] "All the words relate to the topic."</s>


In [63]:
prompt = """[INST] You are a helpful, respectful and honest assistant for labeling topics. You will receive two lists: TEXT and TOPICS. You should find all the words from the list of TOPICS that describe TEXT. If you think that all the words describe this TEXT, print the message: “All the words relate to the topic”. Please, make sure you to only return the list of unsuitable topics and nothing more.

TEXT: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
TOPICS: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
ANSWER: [/INST]"""

answer = generate_answer(prompt, model, 'cuda', temp=0.2, n_token=100)

print(answer[0])

[INST] You are a helpful, respectful and honest assistant for labeling topics. You will receive two lists: TEXT and TOPICS. You should find all the words from the list of TOPICS that describe TEXT. If you think that all the words describe this TEXT, print the message: “All the words relate to the topic”. Please, make sure you to only return the list of unsuitable topics and nothing more.

TEXT: nasa mission launch surface shuttle radar detector atmosphere field orbiter probe april ozone three pasadena
TOPICS: nasa, mission, launch, surface, shuttle, radar, detector, atmosphere, field, orbiter, probe, april, ozone, three, pasadena
ANSWER: [/INST] "All the words relate to the topic"</s>


##Entire dataset

In [13]:
data["all_topics"] = [", ".join(data.iloc[i]["all_topics"][2:-2].split("', '")) for i in range(len(data))]

In [83]:
", ".join(data.iloc[0]["all_topics"][2:-2].split("', '"))

'islamic, muslim, ohio, kent, sandvik, cheer, islam, apple, newton, upenn, magnus, tube, activity, rice, private'

In [87]:
data.iloc[2]["all_topics"]

'window, display, color, screen, manager, server, application, visual, mode, memory, size, client, colors, colormap, default'

In [14]:
test_data = data.sample(frac=1, random_state=42)

In [15]:
test_data.head(10)

Unnamed: 0.1,Unnamed: 0,wordset,all_topics,unsuitable_topics,suitable_topics
1138,1138,scsi disk controller card floppy port fast spe...,"scsi, disk, controller, card, floppy, port, fa...","['jumper', 'board', 'floppy', 'mode', 'bios']","['disk', 'scsi', 'port', 'transfer', 'interfac..."
1628,1628,size baby quickly daughter thanks night start ...,"size, baby, quickly, daughter, thanks, night, ...",['since'],"['month', 'size', 'move', 'baby', 'daughter', ..."
1606,1606,snack bar cooky cereal butter fruit peanut nut...,"snack, bar, cooky, cereal, butter, fruit, pean...",['butter'],"['cracker', 'fruit', 'bar', 'cereal', 'strawbe..."
1977,1977,crime criminal kill death self public murder c...,"crime, criminal, kill, death, self, public, mu...","['situation', 'indiv']","['criminal', 'firearm', 'self', 'public', 'com..."
1526,1526,sweet bar crunchy granola yogurt raisin flake ...,"sweet, bar, crunchy, granola, yogurt, raisin, ...",['cluster'],"['raisin', 'wafer', 'bar', 'flake', 'granola',..."
218,218,organic natural real expensive high syrup whea...,"organic, natural, real, expensive, high, syrup...","['pleased', 'corn', 'wheat', 'maple', 'organic...",[]
1398,1398,ground wire connect circuit panel outlet wirin...,"ground, wire, connect, circuit, panel, outlet,...","['math', 'swap', 'electrical', 'duke', 'screw'...","['outlet', 'circuit', 'neutral', 'panel', 'cab..."
252,252,bible church christ faith matthew catholic scr...,"truth, spirit, word, faith, christ, church, sc...","['scripture', 'word']","['father', 'matthew', 'holy', 'heaven', 'catho..."
1922,1922,wire ground outlet wiring neutral math panel c...,"wire, ground, outlet, wiring, neutral, math, p...","['outlet', 'math', 'neutral', 'circuit', 'pane...",['ground']
643,643,review smell bad know i'll since people disapp...,"review, smell, bad, know', ""i'll"", 'since, peo...",['seal'],"['open', 'smell', 'disappointed', 'agree', 'si..."


In [20]:
INSTRUCTION = """You will receive two lists: TEXT and TOPICS. You should find all the words from the list of TOPICS that don't describe TEXT and print this words in one line separated by commas. If you think that all the words from the TOPICS describe TEXT, print the message: “All words are related to the topic”."""

SAMPLE = (test_data.iloc[0]["wordset"], test_data.iloc[10]["all_topics"])

res = few_shot_pipeline(INSTRUCTION, SAMPLE, model, device="cuda", temp=0.2)

# print(SAMPLE)

res

[INST] You will receive two lists: TEXT and TOPICS. You should find all the words from the list of TOPICS that don't describe TEXT and print this words in one line separated by commas. If you think that all the words from the TOPICS describe TEXT, print the message: “All words are related to the topic”.

    TEXT: scsi disk controller card floppy port fast speed interface transfer bios board mode jumper feature
    TOPICS: offer, sale, copy, monitor, sell, xterm, price, machine, sequence, client, title, upgrade, option, asking, running
    ANSWER:[/INST]


' offer, sale, copy, monitor, sell, xterm, price, machine, sequence, client, title, upgrade, option, asking, running</s>'

In [23]:
# Create file
RES_FILE_NAME = "data_irrelative_topics_mistral_7b_4bit.csv"
RES_FILE = DATA_DIR.joinpath(RES_FILE_NAME)
# Remove old file
# if os.path.exists(RES_FILE):
#     os.remove(RES_FILE)

for i in tqdm(range(len(test_data))):
# for i in tqdm(range(5)):
    SAMPLE = (test_data.iloc[i]["wordset"], test_data.iloc[i]["all_topics"])
    res_true = test_data.iloc[i]["unsuitable_topics"]
    res_model = few_shot_pipeline(
        INSTRUCTION,
        SAMPLE,
        model,
        device="cuda",
        temp=0.3,
        n_token=100
    )
    with open(RES_FILE, 'a') as file:
      file.write(f"{SAMPLE[0]};{SAMPLE[1]};{res_true};{res_model}\n")


 69%|██████▊   | 1619/2363 [1:27:34<40:14,  3.25s/it]


FileNotFoundError: [Errno 2] No such file or directory: "/content/drive/MyDrive/ITMO/RW'23/data/data_irrelative_topics_mistral_7b_4bit.csv"