### Download llama model

In [1]:
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF",
    filename="Meta-Llama-3-8B-Instruct.Q6_K.gguf",
    cache_dir="llm_cache",
)

llm.create_chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}]
)

llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from llm_cache/models--QuantFactory--Meta-Llama-3-8B-Instruct-GGUF/snapshots/33b3a2a0f06a820b6306ab3aa2020ecb6bcf22da/./Meta-Llama-3-8B-Instruct.Q6_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = models
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attenti

{'id': 'chatcmpl-6048bd52-5974-4d41-bc51-3dbbe8a2b9eb',
 'object': 'chat.completion',
 'created': 1724256030,
 'model': 'llm_cache/models--QuantFactory--Meta-Llama-3-8B-Instruct-GGUF/snapshots/33b3a2a0f06a820b6306ab3aa2020ecb6bcf22da/./Meta-Llama-3-8B-Instruct.Q6_K.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': 'The capital of France is Paris.'},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 17, 'completion_tokens': 7, 'total_tokens': 24}}

### Try Guidence

In [2]:
from guidance import models, gen, select

In [3]:
# load a model (could be Transformers, LlamaCpp, VertexAI, OpenAI...)
lm = models.LlamaCpp(
    "llm_cache/models--QuantFactory--Meta-Llama-3-8B-Instruct-GGUF/snapshots/33b3a2a0f06a820b6306ab3aa2020ecb6bcf22da/Meta-Llama-3-8B-Instruct.Q6_K.gguf",
    n_gpu_layers=-1,
    n_ctx=4096,
)

In [40]:
prompt = """This product: {product}\n\n is best categorized as: """

In [5]:
(
    lm
    + prompt.format(product="toilet brush")
    + select(
        ["sports equipment", "home", "food", "electronics", "art"], name="category"
    )
)

### Test on known products

In [6]:
import pandas as pd

In [7]:
df = pd.read_json(
    "c4-raw-meta-filtered_2024-Aug-20_20-44-50/sampled_item_metadata_1M_filtered.jsonl",
    lines=True,
)

In [86]:
df.sample(100).to_csv("test_categories_sample.csv")

In [44]:
df = pd.read_csv("test_categories_sample.csv", index_col=[0])

In [45]:
df = df.dropna()

In [46]:
categories = df["file_name"].unique().tolist()

In [47]:
categories

['Home & Kitchen',
 'Health & Household',
 'Clothing Shoes & Jewelry',
 'Tools & Home Improvement',
 'Books',
 'Toys & Games',
 'Beauty & Personal Care',
 'Electronics',
 'Pet Supplies',
 'Grocery & Gourmet Food',
 'Arts Crafts & Sewing',
 'Sports & Outdoors',
 'Automotive',
 'Office Products',
 'Patio Lawn & Garden']

In [48]:
len(df)

60

In [13]:
df.head()

Unnamed: 0,item_id,title,description,file_name
376666,B0073RM9PC,Focus Foodservice Commercial Bakeware 17 by 25...,Focus Foodservice LLC is committed to providin...,Home & Kitchen
761113,B0BPXBGP61,"Sanrio Kuromi Lighter, Kawaii Hello Kitty Anim...",❗️PLEASE NOTE: Due to strict transportation re...,Health & Household
174091,B000I0UW0K,Pleaser Women's 7 inch Sandal (Black/Glitter;7),"TIPJAR-709-5\n7"" Heel\nSize: 5-14\n7"" (178mm) ...",Clothing Shoes & Jewelry
460099,B00ECJ8JBM,Sanrio Hello Kitty Blackout Window Panel Drape...,Sanrio Hello Kitty Blackout Window Panel Drape...,Home & Kitchen
666812,B0141PFHY8,3dRose lsp_217993_1 Gold Butterflies and Flour...,Gold butterflies and flourishes on a two-tone ...,Tools & Home Improvement


In [14]:
prompt = """
You are an AI model trained to assist in categorizing products in an e-commerce dataset. 
Your task is to assign the most appropriate category to each product based on its description, title, and other available attributes. 
Follow these guidelines:

Understand the Product:

Carefully read the product's title, description, and any additional details (e.g., brand, material, color).
Consider the primary use or function of the product.
Select the Most Accurate Category:

Choose the category that best matches the product's main purpose.
If a product could fit into multiple categories, prioritize the category that a typical customer would most likely search under.
Handle Ambiguities:

If the product information is vague or incomplete, select the category that seems most appropriate based on the available details.
If a product fits equally into two categories, choose the broader or more general category.
Consistency:

Apply the same reasoning across similar products to ensure consistency in categorization.
Special Considerations:

Be mindful of products that may have specific subcategories (e.g., electronics, apparel, home goods).
Consider seasonal or contextual relevance (e.g., holiday decorations vs. home decor).

This product: {product}\n\n is best categorized as: """

In [15]:
index = 376666
out = (
    lm
    + prompt.format(product="\n".join([df["title"][index], df["description"][index]]))
    + select(categories, name="category")
)["category"]
print(out)

Home & Kitchen


In [41]:
def get_category(title: str, description: str):
    CATEGORIES = [
        "Home & Kitchen",
        "Health & Household",
        "Clothing Shoes & Jewelry",
        "Tools & Home Improvement",
        "Books",
        "Toys & Games",
        "Beauty & Personal Care",
        "Electronics",
        "Pet Supplies",
        "Grocery & Gourmet Food",
        "Arts Crafts & Sewing",
        "Sports & Outdoors",
        "Automotive",
        "Office Products",
        "Patio Lawn & Garden",
    ]
    return (
        lm
        + prompt.format(product="\n".join([title, description]))
        + select(CATEGORIES, name="category")
    )["category"]

In [49]:
from tqdm.auto import tqdm

In [50]:
results = []
for index, item_id, title, description, file_name in tqdm(
    df.itertuples(), total=len(df)
):
    predicted_category = get_category(title, description)
    results.append(
        {"item_id": item_id, "expected": file_name, "predicted": predicted_category}
    )

In [51]:
df["category"] = [result["predicted"] for result in results]

In [52]:
df["prediction"] = df["category"] == df["file_name"]

In [53]:
df["prediction"].value_counts()

prediction
True     34
False    26
Name: count, dtype: int64

prediction
True     38
False    22
Name: count, dtype: int64

In [35]:
df.head()

Unnamed: 0,item_id,title,description,file_name,category,prediction
376666,B0073RM9PC,Focus Foodservice Commercial Bakeware 17 by 25...,Focus Foodservice LLC is committed to providin...,Home & Kitchen,Home & Kitchen,True
761113,B0BPXBGP61,"Sanrio Kuromi Lighter, Kawaii Hello Kitty Anim...",❗️PLEASE NOTE: Due to strict transportation re...,Health & Household,Electronics,False
174091,B000I0UW0K,Pleaser Women's 7 inch Sandal (Black/Glitter;7),"TIPJAR-709-5\n7"" Heel\nSize: 5-14\n7"" (178mm) ...",Clothing Shoes & Jewelry,Clothing Shoes & Jewelry,True
460099,B00ECJ8JBM,Sanrio Hello Kitty Blackout Window Panel Drape...,Sanrio Hello Kitty Blackout Window Panel Drape...,Home & Kitchen,Home & Kitchen,True
666812,B0141PFHY8,3dRose lsp_217993_1 Gold Butterflies and Flour...,Gold butterflies and flourishes on a two-tone ...,Tools & Home Improvement,Home & Kitchen,False


Let's check these manually

In [39]:
wrong_ones = df[df["prediction"] == False]

from IPython.display import clear_output

for index, entry in wrong_ones.drop(columns=["prediction"]).iterrows():
    print(entry["title"])
    print(entry["description"])
    print("expected:", entry["file_name"])
    print("predicted", entry["category"])
    input()
    clear_output(wait=True)

ClimaTek Upgraded Furnace Blower Motor fits Rheem 51-22858-01
This is a brand new ClimaTek Furnace Blower Motor
expected: Tools & Home Improvement
predicted Home & Kitchen


KeyboardInterrupt: Interrupted by user

In [60]:
results

[{'expected': 'Home & Kitchen', 'predicted': 'Toys & Games'},
 {'expected': 'Movies & TV', 'predicted': 'Sports & Outdoors'},
 {'expected': 'Health & Household', 'predicted': 'Health & Household'},
 {'expected': 'Clothing Shoes & Jewelry', 'predicted': 'Books'},
 {'expected': 'Clothing Shoes & Jewelry',
  'predicted': 'Clothing Shoes & Jewelry'},
 {'expected': 'Health & Household', 'predicted': 'Health & Household'},
 {'expected': 'Beauty & Personal Care', 'predicted': 'Beauty & Personal Care'},
 {'expected': 'Office Products', 'predicted': 'Home & Kitchen'},
 {'expected': 'Grocery & Gourmet Food',
  'predicted': 'Clothing Shoes & Jewelry'},
 {'expected': 'Pet Supplies', 'predicted': 'Pet Supplies'},
 {'expected': 'Pet Supplies', 'predicted': 'Pet Supplies'},
 {'expected': 'Tools & Home Improvement', 'predicted': 'Home & Kitchen'},
 {'expected': 'Home & Kitchen', 'predicted': 'Home & Kitchen'},
 {'expected': 'Baby Products', 'predicted': 'Baby Products'},
 {'expected': 'Home & Kitchen'

In [75]:
n_correct = list(map(lambda x: x["expected"] == x["predicted"], results)).count(True)
n_correct

15