In [None]:
!pip3 install transformers>=4.32.0 optimum>=1.12.0
!pip3 install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  # Use cu117 if on CUDA 11.7

In [1]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import os
import torch
import random
import re

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

## Read Data

In [None]:
# Run this cell if you are using Google Colaboratory

# from google.colab import drive
# drive.mount('/content/drive')

# BASE_DIR = Path("/content/drive/MyDrive/{your_path}")
# DATA_DIR = BASE_DIR.joinpath("data")  # Please upload the data_quality_estim.csv dataset to the data folder 

# FILE_NAME = "data_topic_name_distrib.csv"

# FILE = DATA_DIR.joinpath(FILE_NAME)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
BASE_DIR = Path('Topic_Generation_Llama2_13B_Chat_4bit.ipynb').resolve().parent.parent

DATA_DIR_NAME = 'data/datasets'  # Local directory

DATA_DIR = BASE_DIR.joinpath(DATA_DIR_NAME)
FILE_NAME_QUAL = 'data_topic_name_distrib.csv'

FILE = DATA_DIR.joinpath(FILE_NAME_QUAL)

In [3]:
data = pd.read_csv(FILE)
data.head(10)

Unnamed: 0,INPUT:wordset,OUTPUT:quality,OUTPUT:topic_name,topic_name_num
0,able amaze fill stop glad daily fantastic move...,good,feelings,1
1,absolute mission church nasa mars earth truth ...,good,astrology,1
2,absolute mission church nasa mars earth truth ...,good,nasa,1
3,absolute mission church nasa mars earth truth ...,good,space,1
4,absolute mission church nasa mars earth truth ...,good,space mission,1
5,absolutely fast white rice service noodle terr...,good,customer experience,1
6,absolutely fast white super service customer r...,good,express expedition,1
7,absolutely fast white super service customer r...,good,online shopping,2
8,absolutely fast white super service customer r...,good,review,1
9,absolutely grain there's process terrible expe...,good,comment,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5766 entries, 0 to 5765
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   INPUT:wordset      5766 non-null   object
 1   OUTPUT:quality     5766 non-null   object
 2   OUTPUT:topic_name  5766 non-null   object
 3   topic_name_num     5766 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 180.3+ KB


In [None]:
data_text = list(set(data["INPUT:wordset"]))

In [None]:
!nvidia-smi

Wed Apr  3 23:18:23 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Check if cuda is available
assert torch.cuda.is_available(), "Cuda is not available, you cannot use the quantized version of the Mistral model without a GPU!"

In [None]:
model_name_or_path = "TheBloke/Llama-2-13B-chat-GPTQ"

model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/7.26G [00:00<?, ?B/s]

The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

## Pipeline of the Inference

In [None]:
def generate_answer(prompt, model, device, temp, n_token):
    encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    model_input = encoded
    model_input = model_input.to(device)
    generated_ids = model.generate(
        **model_input, do_sample=True,
        max_new_tokens=n_token,
        temperature=temp,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
        )
    decoded = tokenizer.batch_decode(generated_ids)

    return decoded

In [None]:
def extract_substring(input_string):
    index = input_string.find("[/INST]")
    if index != -1:
        return input_string[index + len("[/INST]"):]
    else:
        return ""

In [None]:
def few_shot_pipeline(
        model,
        system_prompt='',
        instruction='',
        sample='',
        device='cpu',
        temp=0.1,
        n_token=100,
        print_system_prompt=False,
        save_system_prompt=False,
        system_prompt_type="with_system_words"
        ):
    # Construct prompt
    if system_prompt_type == "with_system_words":
        prompt = f"""<s>[INST]<<SYS>> {system_prompt} <</SYS>> {instruction} TEXT: {sample} TOPIC: [/INST]""".strip()
    else:
        prompt = f"""<s> [INST] <<SYS>> {system_prompt} <</SYS>> {instruction} {sample} [/INST]"""
    if print_system_prompt:
        print(prompt)

    # Generate answer
    answer = generate_answer(prompt, model, device, temp, n_token)
    # Extract the result
    if save_system_prompt:
        result = answer[0]
    else:
        result = extract_substring(answer[0]).strip()
    return result.replace("\n", "")


## Prompt Engineering

### Type I. Chain-of-Thought

In [None]:
prompt = """<s> [INST] <<SYS>> You are a helpful, respectful and honest assistant for labeling TEXTS. <<SYS>>
You will receive a TEXT and you should analyze it and LABEL it with one or few words. Please, make sure you to only return the LABEL and nothing more.
TEXT: anonymous public private news crime server religion carry pretty bill cview internet paul hold thought
LABEL: [/INST]""".strip()

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0].strip())




<s> [INST] <<SYS>> You are a helpful, respectful and honest assistant for labeling TEXTS. <<SYS>>
You will receive a TEXT and you should analyze it and LABEL it with one or few words. Please, make sure you to only return the LABEL and nothing more.
TEXT: anonymous public private news crime server religion carry pretty bill cview internet paul hold thought
LABEL: [/INST]  Sure, I'd be happy to help! Based on the text you provided, I would label it as "NEWS".</s>


Conclusion: The model correctly understands the task and provides an answer in the required form. This product can be used for further research

### Type II.  Chain-of-Thought

In [None]:
prompt = """<s> [INST] <<SYS>> You are an assistant for topic modeling. <<SYS>>
You will receive the TEXT and you should identify one or few TOPICS of the TEXT. Provide ONLY TOPICS and nothing more.
TEXT: anonymous public private news crime server religion carry pretty bill cview internet paul hold thought
TOPIC: [/INST]""".strip()

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0].strip())


<s> [INST] <<SYS>> You are an assistant for topic modeling. <<SYS>>
You will receive the TEXT and you should identify one or few TOPICS of the TEXT. Provide ONLY TOPICS and nothing more.
TEXT: anonymous public private news crime server religion carry pretty bill cview internet paul hold thought
TOPIC: [/INST]  Sure! Here are the topics I identified in the text you provided:

1. Anonymity
2. Privacy
3. Crime
4. Religion

Please let me know if you would like me to identify additional topics or if you have any further questions!</s>


Conclusion: The model also successfully highlights topics, but in this case post-processing will be necessary

### Type III. Chain-of-Thought

In [None]:
prompt = """<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: anonymous public private news crime server religion carry pretty bill cview internet paul hold thought [/INST]""".strip()

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])


<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: anonymous public private news crime server religion carry pretty bill cview internet paul hold thought [/INST]  Sure! Based on the list of words provided, the main theme that combines all the words is "privacy."

So, the single word that represents the main theme is:

"Privacy"</s>


In [None]:
prompt = """<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: greek turkish turkey cross armenia henrik turk greece plane azeri link cyprus allocation minority conflict [/INST]""".strip()

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])


<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: greek turkish turkey cross armenia henrik turk greece plane azeri link cyprus allocation minority conflict [/INST]  Sure, I'd be happy to help! Based on the list of words you provided, the main theme that combines all the words is:

"CONFLICT"</s>


In [None]:
prompt = """<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should undersatnd the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: bar sweet crunchy yummy cinnamon granola yogurt crunch cereal raisin flake snack almond combination blueberry [/INST]""".strip()

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])


<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should undersatnd the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: bar sweet crunchy yummy cinnamon granola yogurt crunch cereal raisin flake snack almond combination blueberry [/INST]  Sure, I'd be happy to help! Based on the list of words you provided, the main theme that combines all the words is "breakfast".

So, the single word that I would print in quotes is:

"Breakfast"</s>


In [None]:
prompt = """<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: church bible christ catholic matthew faith lord scripture father word spirit holy paul passage interpretation [/INST]""".strip()

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])


<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: church bible christ catholic matthew faith lord scripture father word spirit holy paul passage interpretation [/INST]  Sure! Based on the list of words provided, the main theme that combines all the words is:

"FAITH"</s>


In [None]:
prompt = """<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: internet research phone center email institute organization technology office voice address department virginia national engineering[/INST]""".strip()

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])


<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: internet research phone center email institute organization technology office voice address department virginia national engineering[/INST]  Sure! Based on the list of words provided, the main theme that combines all the words is "TECHNOLOGY".</s>


In [None]:
prompt = """<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: driver card sound light cards machine utexas diamond gateway mode running latest crash accelerator austin [/INST]""".strip()

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])


<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: driver card sound light cards machine utexas diamond gateway mode running latest crash accelerator austin [/INST]  Sure! Based on the list of words you provided, the main theme that combines all the words is "TECHNOLOGY".</s>


In [None]:
prompt = """<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: star fruit pretty juice gift orange mind soda instant five espresso substitute raspberry decent via [/INST]""".strip()

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])


<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: star fruit pretty juice gift orange mind soda instant five espresso substitute raspberry decent via [/INST]  Sure! Based on the list of words you provided, the main theme that combines all the words is "drinks".

So, the single word that represents the theme is:

"Drinks"</s>


In [None]:
prompt = """<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: quality chicken meat ingredient vegetable purchasing salmon hazelnut pricey base turkey tuna filler steak actually [/INST]""".strip()

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])


<s> [INST] <<SYS>> You are the topic modeling assistant. <<\SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: quality chicken meat ingredient vegetable purchasing salmon hazelnut pricey base turkey tuna filler steak actually [/INST]  Sure! Based on the list of words you provided, the main theme that combines all the words is "FOOD".</s>


Conclusion: The model understands what is required of it and gives the answer in the correct form. This prompt can be used for further research

In [None]:
# Example with EXTRACT
prompt = """<s> [INST] <<SYS>> You are the topic modeling assistant. <</SYS>> You should extract the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: quality chicken meat ingredient vegetable purchasing salmon hazelnut pricey base turkey tuna filler steak actually [/INST]""".strip()

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])


<s> [INST] <<SYS>> You are the topic modeling assistant. <</SYS>> You should extract the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: quality chicken meat ingredient vegetable purchasing salmon hazelnut pricey base turkey tuna filler steak actually [/INST]  Sure! Based on the list of words you provided, the main theme that combines all the words is:

"QUALITY"</s>


Conclusion: Based on the generation results, it can be seen that when using the word extract in the prompt, the model simply extracts some word from the set, while when using understand, it generalizes and outputs a topic

## Inference on the Entire Dataset

### Type II.  Chain-of-Thought

In [None]:
random.seed(42)
random.shuffle(data_text)

In [None]:
SYSTEM_PROMPT = "You are an assistant for topic modeling."
INSTRUCTION = "You will receive the TEXT and you should identify one or few TOPICS of the TEXT. Provide ONLY TOPIC and nothing more."

SAMPLE = data_text[2]
print(SAMPLE)

res = few_shot_pipeline(model, SYSTEM_PROMPT, INSTRUCTION, SAMPLE, device="cuda", n_token=100, print_system_prompt=True)

res.strip()

disk card scsi driver speed fast apple controller machine memory monitor port chip board sound
<s>[INST]<<SYS>> You are an assistant for topic modeling. <</SYS>> You will receive the TEXT and you should identify one or few TOPICS of the TEXT. Provide ONLY TOPIC and nothing more. TEXT: disk card scsi driver speed fast apple controller machine memory monitor port chip board sound TOPIC: [/INST]


"Sure! Here's the topic of the text you provided:TOPIC: Computer Hardware</s>"

In [None]:
# Create file
RES_FILE_NAME = "data_topic_name_distrib_llama_13B_4bit_type1.csv"
RES_FILE = DATA_DIR.joinpath(RES_FILE_NAME)
# Remove old file
if os.path.exists(RES_FILE):
    os.remove(RES_FILE)

for i in tqdm(range(len(data_text))):
    SAMPLE = data_text[i]
    res_model = few_shot_pipeline(
        model,
        SYSTEM_PROMPT,
        INSTRUCTION,
        SAMPLE,
        device="cuda",
        temp=0.1,
        n_token=100
    )
    with open(RES_FILE, 'a') as file:
        file.write(f"{SAMPLE};{res_model}\n")


100%|██████████| 2363/2363 [1:03:39<00:00,  1.62s/it]


### Type III.  Chain-of-Thought

In [None]:
random.seed(42)
random.shuffle(data_text)

In [None]:
def extract_characters(text):
    result = re.findall(r"[A-Za-z0-9]+|[.,!?;:()\"'-]", text)
    return ' '.join(result)

In [None]:
SYSTEM_PROMPT = "You are the topic modeling assistant."
INSTRUCTION = "You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text:"

SAMPLE = data_text[391]

res = few_shot_pipeline(
    model,
    SYSTEM_PROMPT,
    INSTRUCTION,
    SAMPLE,
    device="cuda",
    n_token=100,
    print_system_prompt=True,
    system_prompt_type=None
  )

res

<s> [INST] <<SYS>> You are the topic modeling assistant. <</SYS>> You should understand the main theme that combines all the words from the list. Formulate it with a single word, print only this word in quotes and no additional text: drinking aftertaste mild espresso might weird beverage sip latte brewing guest appeal however sound mellow [/INST]


'Sure! Based on the list of words you provided, the main theme that combines all the words is "coffee." Here\'s the word formulated as a single quote:"Coffee"</s>'

In [None]:
# Create file
RES_FILE_NAME = "data_topic_name_distrib_llama_13B_4bit_new.csv"
RES_FILE = DATA_DIR.joinpath(RES_FILE_NAME)
# Remove old file
if os.path.exists(RES_FILE):
    os.remove(RES_FILE)

for i in tqdm(range(len(data_text))):
    torch.cuda.empty_cache()
    SAMPLE = data_text[i]
    res_model = few_shot_pipeline(
        model,
        SYSTEM_PROMPT,
        INSTRUCTION,
        SAMPLE,
        device="cuda",
        temp=0.1,
        n_token=100
    )
    res_model = extract_characters(res_model.replace("</s>", ""))
    # print(res_model)
    with open(RES_FILE, 'a') as file:
        file.write(f"{SAMPLE};{res_model}\n")


100%|██████████| 1971/1971 [1:10:59<00:00,  2.16s/it]
