In [2]:
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import LlamaCpp
from langchain.chains import RetrievalQA
import pandas as pd
import re
import html

In [3]:
CSV_PATH = r"C:\Users\kau75421\LLMprojects\Marketing_campaginer\Recommender_Systems\data\inputs\walmart-products.csv"
EMBEDDING_MODEL = 'BAAI/bge-small-en-v1.5'
BATCH_SIZE = 64
FAISS_INDEX_PATH = "faiss.index"
CHUNKS_MAPPING_PATH = "faiss_data.pkl"
NORMALIZE = True

In [4]:
df = pd.read_csv(CSV_PATH)
df.fillna("", inplace=True)

# Drop purely technical columns (non-semantic fields)
non_semantic_fields = [
    'product_id', 'sku', 'image_url', 'asin', 'upc', 'model_number',
    'internal_id', 'timestamp', 'created_at', 'updated_at'
]
existing_to_drop = [col for col in non_semantic_fields if col in df.columns]
df.drop(columns=existing_to_drop, inplace=True)

  df.fillna("", inplace=True)


In [5]:
def clean_text(text):
    text = str(text).lower()
    text = html.unescape(text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

for col in df.columns:
    df[col] = df[col].apply(clean_text)

In [6]:
def row_to_chunks(row):
    chunks = []
    for col in df.columns:
        value = str(row[col]).strip()
        if value:
            chunks.append(f"{col}: {value}")
    return chunks

df['chunks'] = df.apply(row_to_chunks, axis=1)
df['chunks']

0      [url: https://www.walmart.com/ip/laura-mercier...
1      [url: https://www.walmart.com/ip/exultantex-gr...
2      [url: https://www.walmart.com/ip/jessica-londo...
3      [url: https://www.walmart.com/ip/100-cotton-ki...
4      [url: https://www.walmart.com/ip/disney-boys-g...
                             ...                        
995    [url: https://www.walmart.com/ip/great-value-s...
996    [url: https://www.walmart.com/ip/thelovely-men...
997    [url: https://www.walmart.com/ip/alpine-swiss-...
998    [url: https://www.walmart.com/ip/sunnycome-ret...
999    [url: https://www.walmart.com/ip/summer-toddle...
Name: chunks, Length: 1000, dtype: object

In [7]:
# Flatten all chunks
all_chunks = []
for chunk_list in df['chunks'].tolist():
    all_chunks.extend(chunk_list)
all_chunks

['url: https://www.walmart.com/ip/laura-mercier-caviar-stick-eye-color-sugar-frost-1-64g-0-05oz/173530386',
 'final_price: 22.9',
 'currency: usd',
 'gtin: 736150109415.0',
 'specifications: [{"name":"brand","value":"laura mercier"},{"name":"assembled product dimensions (l x w x h)","value":"0.20 x 0.20 x 5.10 inches"}]',
 'image_urls: ["https://i5.walmartimages.com/seo/laura-mercier-caviar-stick-eye-color-sugar-frost-1-64g-0-05oz_55297223-7af5-4a30-83c2-4d74d08969e3.8ca12dd7578ff564b6e01923e85ffd11.jpeg","https://i5.walmartimages.com/asr/75b494dc-66b5-420e-829f-7d27ef56ebe0.4a47ae19dbd3851a88a2174d3c736d78.jpeg","https://i5.walmartimages.com/asr/96df8a6b-c641-44d0-bf1f-4433f1b3140a.4b1b5bfc9c015d738cee5cd6ad9b79f3.jpeg"]',
 'top_reviews: {"negative":{},"positive":{}}',
 'rating_stars: {"five_stars":2,"four_stars":3,"two_stars":1}',
 'related_pages: ["https://www.walmart.com/browse/premium-beauty/stila-eyeshadow/7924299_9882925_9097420_5183244","https://www.walmart.com/browse/beauty/bl

In [8]:
def generate_embeddings(chunks, model_name, normalize=True, batch_size=64):
    model = SentenceTransformer(model_name)
    all_embeddings = []

    for i in tqdm(range(0, len(chunks), batch_size), desc=f"Generating Embeddings with {model_name}"):
        batch = chunks[i:i + batch_size]
        embeddings = model.encode(
            batch,
            batch_size=batch_size,
            convert_to_numpy=True,
            normalize_embeddings=normalize,
            show_progress_bar=False
        )
        all_embeddings.append(embeddings)

    return np.vstack(all_embeddings).astype("float32")


Generating Embeddings:  20%|██        | 111/546 [07:38<29:57,  4.13s/it]


KeyboardInterrupt: 

In [None]:
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)

# Batch insertion into FAISS for efficiency
for i in tqdm(range(0, len(embeddings), 10000), desc="Adding to FAISS"):
    batch_embeddings = embeddings[i:i+10000]
    index.add(batch_embeddings)

# Save FAISS index
faiss.write_index(index, FAISS_INDEX_PATH)

# Save chunk mapping
with open(CHUNKS_MAPPING_PATH, "wb") as f:
    pickle.dump(all_chunks, f)

print("FAISS index and mapping saved successfully.")

Adding to FAISS: 100%|██████████| 4/4 [00:00<00:00, 101.02it/s]

FAISS index and mapping saved successfully.





In [None]:
index = faiss.read_index(FAISS_INDEX_PATH)

with open(CHUNKS_MAPPING_PATH, "rb") as f:
    chunk_mapping = pickle.load(f)

In [None]:
# Example query
query = "I want pants cheaper than 10 dollars with XL size and blue color"
instruction_query = f"Retrieve product info: {query}"

# Embed query
query_embedding = model.encode(
    [instruction_query],
    normalize_embeddings=NORMALIZE
).astype("float32")

# Search
k = 10
distances, indices = index.search(query_embedding, k)

# Show results
for i in range(k):
    idx = indices[0][i]
    print(f"Result {i+1}:")
    print(f"Chunk: {chunk_mapping[idx]}")
    print(f"Score: {distances[0][i]}")
    print("-" * 30)

Result 1:
Chunk: category_name: mens pants
Score: 0.7701139450073242
------------------------------
Result 2:
Chunk: category_name: mens pants
Score: 0.7701139450073242
------------------------------
Result 3:
Chunk: category_name: mens pants
Score: 0.7701139450073242
------------------------------
Result 4:
Chunk: category_name: mens pants
Score: 0.7701139450073242
------------------------------
Result 5:
Chunk: category_url: https://www.walmart.com/browse/clothing/mens-pants/5438_133197_6041621_1224675
Score: 0.7687225341796875
------------------------------
Result 6:
Chunk: category_url: https://www.walmart.com/browse/clothing/mens-pants/5438_133197_6041621_1224675
Score: 0.7687225341796875
------------------------------
Result 7:
Chunk: category_url: https://www.walmart.com/browse/clothing/mens-pants/5438_133197_6041621_1224675
Score: 0.7687225341796875
------------------------------
Result 8:
Chunk: category_url: https://www.walmart.com/browse/clothing/mens-pants/5438_133197_60416