In [1]:

!pip install pandas sentence_transformers
!pip install -U transformers
!pip install hnswlib
# Install below if using GPU
!pip install accelerate

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.2
    Uninstalling transformers-4.46.2:
      Successfully uninstalled transformers-4.46.2
Successfully installed transformers-4.46.3
Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l

In [2]:

from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

import pandas as pd
import numpy as np

import hnswlib
import torch

In [5]:

df_purchases = pd.read_csv('data.csv', encoding='unicode_escape')
print("Row Count:",df_purchases.shape[0])
df_purchases.head()

Row Count: 541909


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [6]:

# elimination of NaN values
df_purchases.dropna(inplace=True)
# elimination of duplicate rows
df_purchases.drop_duplicates(inplace=True)
# elimination of cancelled orders
df_purchases = df_purchases[~df_purchases['InvoiceNo'].str.startswith('C')]

In [7]:

# User purchase history
customer_history_dict = df_purchases.groupby("CustomerID")['StockCode'].apply(lambda x: sorted(list(set(x)))).to_dict()

# product to description dictionary
df_product_descriptions = df_purchases[["StockCode", "Description"]]
# Multiple transaction of same products are removed.
df_product_descriptions.drop_duplicates(inplace=True)
# dictionary generation
product_to_description_dict = dict(zip(df_product_descriptions['StockCode'], df_product_descriptions['Description']))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_product_descriptions.drop_duplicates(inplace=True)


In [8]:
def get_previous_purchases(user_id, k=3):
  """Gets previous purchases of the user"""
  product_list = customer_history_dict.get(user_id, [])
  purchase_descriptions = ""
  for i, product in enumerate(product_list[:k]):
    product_description = product_to_description_dict.get(product, "")
    purchase_descriptions += f"{i+1}. {product_description}\n"

  return purchase_descriptions

In [9]:
# Sequence Transformer
embedding_model = SentenceTransformer("thenlper/gte-small")

def get_embedding(text: str) -> list[float]:
    if not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    return embedding.tolist()

df_product_descriptions["embedding"] = df_product_descriptions["Description"].apply(get_embedding)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/66.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_product_descriptions["embedding"] = df_product_descriptions["Description"].apply(get_embedding)


In [19]:
df_product_descriptions.head()

Unnamed: 0,StockCode,Description,embedding
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,"[-0.055711179971694946, -0.006865353789180517,..."
1,71053,WHITE METAL LANTERN,"[-0.05453517660498619, 0.007003140170127153, 0..."
2,84406B,CREAM CUPID HEARTS COAT HANGER,"[-0.029888639226555824, -0.00969802588224411, ..."
3,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,"[-0.03226792439818382, 0.024518396705389023, 0..."
4,84029E,RED WOOLLY HOTTIE WHITE HEART.,"[-0.05853551626205444, -0.012758949771523476, ..."


In [10]:
# Embedding model dimension
dim = embedding_model.get_sentence_embedding_dimension()

num_elements = df_product_descriptions.shape[0]
# hnswlib initialization with cosine similarity
p = hnswlib.Index(space='cosine', dim=dim)

p.init_index(max_elements=num_elements, ef_construction=100, M=16)

p.set_ef(10)

embeddings = np.vstack(df_product_descriptions["embedding"].values)
p.add_items(embeddings)

In [11]:
def vector_search(user_query, k):
    """Gets user input query and return top k similar items"""

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    if query_embedding is None:
        return "Invalid query or embedding generation failed."


    labels, distances = p.knn_query(query_embedding, k=k)
    results = df_product_descriptions.iloc[list(labels[0])].to_dict('records')
    return results

In [12]:
def get_search_result(query, k):
    """Aggregate similar product descriptions into one string"""
    get_knowledge = vector_search(query, k)

    search_result = ""
    for i, result in enumerate(get_knowledge):
        search_result += f"{i+1}. {result.get('Description', 'N/A')}\n"

    return search_result

In [13]:

# Gets top k similar products w.r.t provided query
k = 3
query = "lantern"
source_information = get_search_result(query, k)
combined_information = f"Similar Results:\n{source_information}"

print(combined_information)

Similar Results:
1. WHITE METAL LANTERN
2. WHITE MOROCCAN METAL LANTERN
3. FRENCH CARRIAGE LANTERN



In [14]:

from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
pip install -U transformers



In [17]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device == 'cpu':
  model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
else:
  model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", device_map="auto")

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [18]:
# User id to check result of personalized recommendation
user_id = 15781

query = "BLUE LEAVES AND BEADS PHONE CHARM"
k = 3
# get similar items
source_information = get_search_result(query, k)
previous_purchases = get_previous_purchases(user_id)

# Providing example prompts (few-shot learning) to get the desired output
example_prompt = f"""Given a customer's 'Previous Purchases', rerank a list of 'Recommended Products' from most to least relevant to the customer's preferences. Only recommend products from latest'Recommended Products' section The relevance should be determined by considering the types and themes of products the customer has bought before.

Example 1:
- User Input:
Previous Purchases:
1. BLUE CALCULATOR RULER
2. DOORMAT TOPIARY
3. PARTY BUNTING
Recommended Products:
1. CRYSTAL FROG PHONE CHARM
2. PINK CRYSTAL SKULL PHONE CHARM
3. BLUE LEAVES AND BEADS PHONE CHARM

- Model Output:
Reranked Recommendations:
1. BLUE LEAVES AND BEADS PHONE CHARM
2. CRYSTAL FROG PHONE CHARM
3. PINK CRYSTAL SKULL PHONE CHARM

Example 2:
- User Input:
Previous Purchases:
1. PANTRY HOOK SPATULA
2. BIRDCAGE DECORATION TEALIGHT HOLDER
3. REGENCY TEA PLATE PINK
Recommended Products:
1. SWEETHEART CAKESTAND 3 TIER
2. CAKESTAND, 3 TIER, LOVEHEART
3. REGENCY CAKESTAND 3 TIER

- Model Output:
Reranked Recommendations:
1. REGENCY CAKESTAND 3 TIER
2. SWEETHEART CAKESTAND 3 TIER
3. CAKESTAND, 3 TIER, LOVEHEART

"""

combined_information = f"""{example_prompt}

Your Turn:
- User Input:
Previous Purchases:
{previous_purchases}
Recommended Products:
{source_information}
- Model Output:
"""

# will be used to extract last prompt
key_text = 'Your Turn:'

# input ids
input_ids = tokenizer(combined_information, return_tensors="pt").to(device)
response = model.generate(**input_ids, max_new_tokens=500)
output_text = tokenizer.decode(response[0])
output_text = output_text[output_text.index(key_text) + len(key_text):]

print(f"Query: {query}")
print(output_text)

Query: BLUE LEAVES AND BEADS PHONE CHARM

- User Input:
Previous Purchases:
1. EDWARDIAN PARASOL NATURAL
2. BLUE STRIPE CERAMIC DRAWER KNOB
3. WHITE LOVEBIRD LANTERN

Recommended Products:
1. BLUE LEAVES AND BEADS PHONE CHARM
2. CRYSTAL FROG PHONE CHARM
3. BLUE GLASS CHUNKY CHARM BRACELET

- Model Output:
Reranked Recommendations:
1. BLUE LEAVES AND BEADS PHONE CHARM
2. BLUE GLASS CHUNKY CHARM BRACELET
3. CRYSTAL FROG PHONE CHARM<eos>


In [20]:
# User id to check result of personalized recommendation
user_id = 15781

query = "BAG CHARM"
k = 3
# get similar items
source_information = get_search_result(query, k)
previous_purchases = get_previous_purchases(user_id)

# Providing example prompts (few-shot learning) to get the desired output
example_prompt = f"""Given a customer's 'Previous Purchases', rerank a list of 'Recommended Products' from most to least relevant to the customer's preferences. Only recommend products from latest 'Recommended Products' section The relevance should be determined by considering the types and themes of products the customer has bought before. Also give brief explanation about reranking reason.

Example 1:
- User Input:
Previous Purchases:
1. BLUE CALCULATOR RULER
2. DOORMAT TOPIARY
3. PARTY BUNTING
Recommended Products:
1. CRYSTAL FROG PHONE CHARM
2. PINK CRYSTAL SKULL PHONE CHARM
3. BLUE LEAVES AND BEADS PHONE CHARM

- Model Output:
Reranked Recommendations:
1. BLUE LEAVES AND BEADS PHONE CHARM - Matches blue theme; visually appealing.
2. CRYSTAL FROG PHONE CHARM - Playful, aligns with fun items.
3. PINK CRYSTAL SKULL PHONE CHARM - Decorative, less color relevance noted.

Example 2:
- User Input:
Previous Purchases:
1. PANTRY HOOK SPATULA
2. BIRDCAGE DECORATION TEALIGHT HOLDER
3. REGENCY TEA PLATE PINK
Recommended Products:
1. SWEETHEART CAKESTAND 3 TIER
2. CAKESTAND, 3 TIER, LOVEHEART
3. REGENCY CAKESTAND 3 TIER

- Model Output:
Reranked Recommendations:
1. REGENCY CAKESTAND 3 TIER - Matches Regency style; highly relevant.
2. SWEETHEART CAKESTAND 3 TIER - Elegant, complements table setting decor.
3. CAKESTAND, 3 TIER, LOVEHEART - Decorative, thematic but less specific.
"""

combined_information = f"""{example_prompt}

Your Turn:
- User Input:
Previous Purchases:
{previous_purchases}
Recommended Products:
{source_information}
- Model Output:
"""

# will be used to extract last prompt
key_text = 'Your Turn:'

# input ids
input_ids = tokenizer(combined_information, return_tensors="pt").to(device)
response = model.generate(**input_ids, max_new_tokens=500)
output_text = tokenizer.decode(response[0])
output_text = output_text[output_text.index(key_text) + len(key_text):]

print(f"Query: {query}")
print(output_text)

Query: BAG CHARM

- User Input:
Previous Purchases:
1. EDWARDIAN PARASOL NATURAL
2. BLUE STRIPE CERAMIC DRAWER KNOB
3. WHITE LOVEBIRD LANTERN

Recommended Products:
1. COPPER AND BRASS BAG CHARM
2. IVORY GOLD METAL BAG CHARM
3. WHITE WITH METAL BAG CHARM

- Model Output:
Reranked Recommendations:
1. WHITE WITH METAL BAG CHARM - Matches white theme; complements previous item.
2. COPPER AND BRASS BAG CHARM - Matches Edwardian style; complements previous item.
3. IVORY GOLD METAL BAG CHARM - Less relevant to the customer's previous purchases.<eos>
