In [None]:
!pip3 install transformers>=4.32.0 optimum>=1.12.0
!pip3 install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  # Use cu117 if on CUDA 11.7

In [2]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import os
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer

## Read Data

In [6]:
# Run this cell if you are using Google Colaboratory

# from google.colab import drive
# drive.mount('/content/drive')

# BASE_DIR = Path(f"/content/drive/MyDrive/{your_path}")
# DATA_DIR = BASE_DIR.joinpath("data")  # Please upload the data_quality_estim.csv dataset to the data folder 

# FILE_NAME = "data_quality_estim.csv"

# FILE = DATA_DIR.joinpath(FILE_NAME)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
BASE_DIR = Path('Coherence_of_Words_2classes_Llama_13B_Chat_4bit.ipynb').resolve().parent.parent

DATA_DIR_NAME = 'data/datasets'  # Local directory

DATA_DIR = BASE_DIR.joinpath(DATA_DIR_NAME)
FILE_NAME_QUAL = 'data_quality_estim.csv'

FILE = DATA_DIR.joinpath(FILE_NAME_QUAL)

In [4]:
data = pd.read_csv(FILE)
data.head(10)

Unnamed: 0,task,agg_label
0,able amaze fill glad stop daily fantastic move...,bad
1,able amaze fill stop glad daily fantastic move...,bad
2,able amaze fill stop glad daily fantastic move...,bad
3,able amaze glad fill stop fantastic daily move...,bad
4,absolute church truth doug scripture symbol li...,good
5,absolute church truth mission nasa doug mars e...,bad
6,absolute mission church nasa mars earth truth ...,good
7,absolutely fast white rice service noodle terr...,bad
8,absolutely fast white super service customer r...,good
9,absolutely grain experience soon there's proce...,bad


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3861 entries, 0 to 3860
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   task       3861 non-null   object
 1   agg_label  3861 non-null   object
dtypes: object(2)
memory usage: 60.5+ KB


In [9]:
!nvidia-smi

Mon Mar 25 18:39:13 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [11]:
# Check if cuda is available
assert torch.cuda.is_available(), "Cuda is not available, you cannot use the quantized version of the Llama model without a GPU!"

In [10]:
model_name_or_path = "TheBloke/Llama-2-13B-chat-GPTQ"
# To use a different branch, change revision
# For example: revision="main"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/837 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/7.26G [00:00<?, ?B/s]

The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

## Pipeline of the Inference

In [12]:
def generate_answer(prompt, model, device, temp, n_token):
    encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    model_input = encoded
    model_input = model_input.to(device)
    generated_ids = model.generate(
        **model_input, do_sample=True,
        max_new_tokens=n_token,
        temperature=temp,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        num_beams=4
        )
    decoded = tokenizer.batch_decode(generated_ids)

    return decoded

In [13]:
def extract_substring(input_string):
    index = input_string.find("[/INST]")
    if index != -1:
        return input_string[index + len("[/INST]"):]
    else:
        return ""

In [15]:
def few_shot_pipeline(
        model,
        system_prompt='',
        instruction='',
        sample='',
        device='cpu',
        temp=0.1,
        n_token=100,
        print_system_prompt=False,
        save_system_prompt=False
        ):
    # Construct prompt
    prompt = f"""<s> [INST] <<SYS>> {system_prompt} <</SYS>> {instruction} TEXT: {sample} ANSWER: [/INST]"""
    if print_system_prompt:
        print(prompt)
    # Generate answer
    answer = generate_answer(prompt, model, device, temp, n_token)
    # Extract the result
    if save_system_prompt:
        result = answer[0]
    else:
        result = extract_substring(answer[0]).strip()
    return result.replace("\n", "")


## Prompt Engineering

### Type I. Chain-of-Thought

In [16]:
prompt = """<s> [INST] <<SYS>> You are the assistant for text classification <<SYS>>
You will receive a TEXT, and you should answer "YES" or "NO" to the question: "Is it possible to determine a common topic for the TEXT or at least for the most part of the TEXT?". Please, make sure you to only return YES or NO and nothing more.
TEXT: april nasa star canada bike score always goal maynard area player hockey wings center white
ANSWER:[/INST]""".strip()

answer = generate_answer(prompt, model, 'cuda', temp=0.1, n_token=100)

print(answer[0])




<s> [INST] <<SYS>> You are the assistant for text classification <<SYS>>
You will receive a TEXT, and you should answer "YES" or "NO" to the question: "Is it possible to determine a common topic for the TEXT or at least for the most part of the TEXT?". Please, make sure you to only return YES or NO and nothing more.
TEXT: april nasa star canada bike score always goal maynard area player hockey wings center white
ANSWER:[/INST]  NO.</s>


Conclusion: The model understands the task of describing the data, as well as the role that is required of it, this concept can be used for further research

### Low-level code for text generation

In [23]:
prompt = """<<SYS>> You are the assistant for text classification <</SYS>>
You will receive a TEXT, and you should answer "YES" or "NO" to the question: "Is it possible to determine a common topic for the TEXT or at least for the most part of the TEXT?". Please, make sure you to only return YES or NO and nothing more.

TEXT: april nasa star canada bike score always goal maynard area player hockey wings center white
ANSWER:""".strip()

print(prompt)

voc = tokenizer.get_vocab()
voc_rev = {v:k for k, v in voc.items()}

for i in range(2):
    inputs = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to('cuda')
    logits = model.forward(**inputs).logits[0, -1, :]
    probs = torch.nn.functional.softmax(logits, dim=-1)
    next_token_id = torch.multinomial(probs.flatten(), num_samples=1)

    next_token = tokenizer.decode(next_token_id)
    print("next token", next_token)
    prompt += next_token


    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
    top_tokens = sorted_indices[:5]
    print(f'Step #{i} candidates:')
    for t, p in zip(top_tokens, sorted_probs):
        t = voc_rev[t.item()]
        print(f"{t}: {p:.4f} ")

    print(f"\nChosen token: (next_token)", end="\n\n", flush=True)

<<SYS>> You are the assistant for text classification <</SYS>>
You will receive a TEXT, and you should answer "YES" or "NO" to the question: "Is it possible to determine a common topic for the TEXT or at least for the most part of the TEXT?". Please, make sure you to only return YES or NO and nothing more.

TEXT: april nasa star canada bike score always goal maynard area player hockey wings center white
ANSWER:
next token YES
Step #0 candidates:
▁YES: 0.3445 
▁NO: 0.2856 
<0x0A>: 0.2259 
▁: 0.0445 
▁?: 0.0173 

Chosen token: (next_token)

next token OR
Step #1 candidates:
▁or: 0.5252 
<0x0A>: 0.1801 
</: 0.0825 
▁OR: 0.0489 
</s>: 0.0485 

Chosen token: (next_token)



## Inference on the Entire Dataset

In [24]:
test_data = data.sample(frac=1, random_state=42)

In [25]:
SYSTEM_PROMPT = "You are the assistant for text classification"
INSTRUCTION = "You will receive a TEXT, and you should answer 'YES' or 'NO' to the question: 'Is it possible to determine a common topic for the TEXT or at least for the most part of the TEXT?'. Please, make sure you to only return YES or NO and nothing more."

SAMPLE = test_data.iloc[785]["task"]

print(SAMPLE)

res = few_shot_pipeline(model, SYSTEM_PROMPT, INSTRUCTION, SAMPLE, device="cuda", n_token=30)

res

firearm weapon bill crime vote public laws handgun defense criminal washington federal member carry unite


'NO. There is no common topic that can be determined for the majority of the text. The text mentions various topics such as firearms,'

In [None]:
# Create file
RES_FILE_NAME = "data_qality_estim_llama_13b_4bit_type1.csv"
RES_FILE = DATA_DIR.joinpath(RES_FILE_NAME)

# Remove old file
if os.path.exists(RES_FILE):
    os.remove(RES_FILE)

for i in tqdm(len(test_data)):
    SAMPLE = test_data.iloc[i]["task"]
    res_true = test_data.iloc[i]["agg_label"]
    res_model = few_shot_pipeline(
        model,
        SYSTEM_PROMPT,
        INSTRUCTION,
        SAMPLE,
        device="cuda",
        temp=0.1,
        n_token=30
    )
    with open(RES_FILE, 'a') as file:
        file.write(f"{SAMPLE};{res_true};{res_model}\n")


100%|██████████| 3076/3076 [1:26:57<00:00,  1.70s/it]
