In [1]:
import os
from pathlib import Path

import torch
from gpt_reranker import run_reranker
from openai import OpenAI
from prompts import AMAZON_RANKING_PROMPT
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig

GPT Reranker - Cross Encoder

In [2]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [3]:
product_text = """
mCover Case Compatible ONLY for 2021～2023 14" Dell Latitude 5420 5430 Windows Notebook Computer (NOT Fitting Any Other Dell Models) - Pink

product category: Electronics
"""
query = "apple laptop case"

In [28]:
product_text = """
YMIX Macbook Pro 13" Case Non-Retina,Folio Embroidered Shell Plastic Hard Protective Cover for Old MacBook Pro 13 Inch with CD-ROM Drive,Model A1278(A_Embroidered Floral)

product category: Electronics
"""
query = "apple laptop case"

In [4]:
label, probab = run_reranker(
    client=client,
    prompt=AMAZON_RANKING_PROMPT,
    query=query,
    product_text=product_text,
    logger_level="SUCCESS",
)

[32m2024-09-24 16:50:06.676[0m | [32m[1mSUCCESS [0m | [36mgpt_reranker[0m:[36mrun_reranker[0m:[36m77[0m - [32m[1mLabel: No with Probability: 1.0[0m


In [5]:
label, probab = run_reranker(
    client=client,
    prompt=AMAZON_RANKING_PROMPT,
    query=query,
    product_text=product_text,
    logger_level="SUCCESS",
    logit_bias_value=0,
)

[32m2024-09-24 16:50:14.664[0m | [32m[1mSUCCESS [0m | [36mgpt_reranker[0m:[36mrun_reranker[0m:[36m77[0m - [32m[1mLabel: No with Probability: 1.0[0m


## OS - Cross Encoder

In [2]:
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
cache_dir = Path("../cache")
assert cache_dir.exists(), f"Cache directory {cache_dir} does not exist"

### Llama 3.1 8B

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=cache_dir)
quantization_config = QuantoConfig(weights="int8")
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=MODEL_ID,
    quantization_config=quantization_config,
    cache_dir=cache_dir,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Reranking Pipeline

In [4]:
yes_token_id = tokenizer.convert_tokens_to_ids("Yes")
no_token_id = tokenizer.convert_tokens_to_ids("No")
bias_tokens = [yes_token_id, no_token_id]
print(f"ID for Yes: {yes_token_id}, No: {no_token_id}")

ID for Yes: 9642, No: 2822


In [5]:
product_text = """
mCover Case Compatible ONLY for 2021～2023 14" Dell Latitude 5420 5430 Windows Notebook Computer (NOT Fitting Any Other Dell Models) - Pink

product category: Electronics
"""
query = "laptop case"

In [6]:
prompt = AMAZON_RANKING_PROMPT.format(query=query, product_text=product_text)
# outputs = product_relevance(prompt)

In [7]:
def custom_logits_processor(_, logits):

    for token_id in bias_tokens:
        print(f"Logits value for token ID {token_id}: {logits[0, token_id]}")
        logits[0, token_id] += bias_value
    return logits


bias_value = 1
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
prompt_token_num = inputs.input_ids[0].shape[0]

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=1,
        do_sample=False,  # We're using greedy decoding
        num_beams=1,  # Ensure we're not doing beam search
        temperature=None,
        top_p=None,
        output_scores=True,
        return_dict_in_generate=True,
        # logits_processor=[custom_logits_processor],
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

label_token = outputs[0][0][prompt_token_num:].item()
label = tokenizer.decode(label_token)
print(f"Label: {label}")

scores = outputs.scores[0]
# Maybe do softmax only on yes and no scores
scores = torch.softmax(scores, dim=-1)
yes_score = scores[0, yes_token_id].item()
no_score = scores[0, no_token_id].item()
print(f"Score for Yes: {yes_score}, No: {no_score}")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism

Label: Note
Score for Yes: 0.03128882497549057, No: 0.024755867198109627


In [13]:
torch.softmax(scores[:, [yes_token_id, no_token_id]], dim=-1)

tensor([[0.5016, 0.4984]], device='mps:0')

In [23]:
outputs.scores[0].shape

torch.Size([1, 128256])

Score for Yes: 0.0775807648897171, No: 0.061382267624139786


In [10]:
outputs.scores[0].shape

torch.Size([1, 128256])

### Pipeline

In [13]:
input_ids = tokenizer(
    AMAZON_RANKING_PROMPT.format(query="macbook keyboard", product_text=product_text),
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=5)
print(tokenizer.decode(outputs[0]))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>
You are an Assistant responsible for helping detect whether the retrieved product is relevant to the query. For a given input, you need to output a single token: "Yes" or "No" indicating the retrieved product is relevant to the query.

Query: Younique setting powder
Product: 
"""
Younique Touch Behold Translucent Setting Powder

Touch Behold Translucent Setting Powder. Younique’s Touch Behold Translucent Setting Powder effortlessly locks and loads your look so you’re ready to take on the world. Use as the finishing touch to help keep makeup in place, or wear directly on skin for a softening, matte look.

product category: Beauty & Personal Care
"""
Relevant: Yes

Query: white musk hand cream
Product: 
"""
Braided Hair Clips for Women Girls, Sparkling Crystal Stone Braided Hair Clips Barrette with 3 Small Clips, Triple Hair Clips with Rhinestones for Sectioning,4PCS (4pcs-Type A)

product category: Beauty & Personal Care
"""
Relevant: No

Query: HP Pavilion dm4 replace

In [6]:
yes_token_id = tokenizer.convert_tokens_to_ids("Yes")
no_token_id = tokenizer.convert_tokens_to_ids("No")
print(f"ID for Yes: {yes_token_id}, No: {no_token_id}")

ID for Yes: 9642, No: 2822


In [7]:
input_tensor = tokenizer.encode(
    AMAZON_RANKING_PROMPT.format(query=query, product_text=product_text),
    return_tensors="pt",
).to(model.device)

output = model.forward(input_tensor)
logits = output.logits[0, -1]  # from first batch take last token logits

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

In [8]:
print(f"Probability for Yes: {torch.softmax(logits, dim=-1)[yes_token_id]}")
print(f"Probability for No: {torch.softmax(logits, dim=-1)[no_token_id]}")

Probability for Yes: 0.010473666712641716
Probability for No: 0.19977031648159027


Score for Yes: 0.010473666712641716, No: 0.19977031648159027


In [17]:
print(f"Probability for Yes: {torch.softmax(logits, dim=-1)[yes_token_id]}")
print(f"Probability for No: {torch.softmax(logits, dim=-1)[no_token_id]}")

Probability for Yes: 0.668552815914154
Probability for No: 0.09124583750963211
