## Fetching Data from CSV

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import openai
import json
from tqdm import tqdm
from dotenv import load_dotenv
import os

In [2]:
data = pd.read_json('./raw_data/JSON/Amazon_full_data.json')
data.head()

Unnamed: 0,Data ID,Product URL,Brand Name,Product Name,Rating,Rating Count,Price (INR),Original Price (INR),Discount,Badge,Date of Extraction,Product Details,About This Item,All Bullet Points,Additional Details,Brand Snapshot,Product and Seller Details,Product Description
0,B0DCZ25LBC,https://www.amazon.in/Nermosa-Block-Printed-Ku...,Nermosa,Women Cotton Block Printed Kurta Pant With Dup...,4.1 out of 5 stars,1438,₹599.0,₹4999.0,88% off,Amazon's,2025-06-05 15:44:42,"{'Material composition': '100% Viscose', 'Leng...",{'Style Info': '★ Style :- Straight Kurta || S...,[★ Fit Type: Straight; Ethnic Sets : Straight ...,"{'Manufacturer': 'Nermosa Fashion', 'Packer': ...","{'Brand Name': 'Nermosa', 'Top Brand Heading':...",{},
1,B0D2TZDWH5,https://www.amazon.in/Yashika-Fashionably-Trad...,Yashika,"Women's Kurta Fashionably Traditional, Comfort...",3.7 out of 5 stars,476,₹649.0,₹2599.0,75% off,,2025-06-05 15:44:42,"{'Material composition': '100% Cotton Blend', ...",{'Style Info': 'Quality Craftsmanship: Crafted...,[Elegant Ensemble: A curated trio comprising a...,"{'Country of Origin': 'India', 'Manufacturer':...","{'Brand Name': 'Yashika', 'Top Brand Heading':...",{},Introducing Yashika Women's Ethnic Trio: a del...
2,B0DG5XZRZM,https://www.amazon.in/KLOSIA-Women-Printed-Kur...,KLOSIA,Women Printed Kurta and Pant Set with Dupatta,4.0 out of 5 stars,750,₹699.0,₹4999.0,86% off,,2025-06-05 15:44:42,"{'Material composition': '100% Viscose', 'Slee...",{'Style Info': 'Style :- Anarkali Kurta || Sle...,[Fit Type: Straight; Ethnic Sets: Anarkali Kur...,"{'Manufacturer': 'KLOSIA', 'Packer': 'Klosia E...","{'Brand Name': 'KLOSIA', 'Top Brand Heading': ...",{},
3,B0DDXX1HS5,https://www.amazon.in/MEERA-FAB-Straight-Print...,MEERA FAB,Women's Cotton Blend Straight Printed Kurta wi...,3.5 out of 5 stars,98,₹789.0,₹2999.0,74% off,,2025-06-05 15:44:42,"{'Material composition': 'PURE COTTON', 'Sleev...","{'Fabric Info': 'Fabric Kurta :- Cotton , Pala...","[Size Chart- Kurta- XS- 34 in , S-36 in , M-38...","{'Country of Origin': 'India', 'Manufacturer':...","{'Brand Name': 'MEERA FAB', 'Top Brand Heading...",{},
4,B0DQCL9XGC,https://www.amazon.in/Rangnavi-Anarkali-Printe...,Rangnavi,Women's Cotton Anarkali Printed Kurta with Pal...,3.4 out of 5 stars,145,₹798.0,₹2499.0,68% off,,2025-06-05 15:44:42,"{'Material composition': 'Pure', 'Sleeve type'...","{'Size Chart': 'Size Chart- Kurta- XS- 34 in ,...","[Size Chart- Kurta- XS- 34 in , S-36 in , M-38...","{'Country of Origin': 'India', 'Manufacturer':...","{'Brand Name': 'Rangnavi', 'Top Brand Heading'...",{},


## STEP 1: Dropping a few columns
#### Note: Only specific for this data, could be optional for other sets

In [3]:
# Specify the columns you want to drop
columns_to_drop = [
    'Badge', 
    'Date of Extraction', 
    'Product and Seller Details'
]

# Drop them in place (modify df)
data.drop(columns=columns_to_drop, inplace=True)

len(data)

1930

## STEP 2: Handling Missing Values

In [4]:
# Check for missing values
print(data.isnull().sum())

Data ID                 0
Product URL             0
Brand Name              0
Product Name            0
Rating                  0
Rating Count            0
Price (INR)             0
Original Price (INR)    0
Discount                0
Product Details         0
About This Item         0
All Bullet Points       0
Additional Details      0
Brand Snapshot          0
Product Description     0
dtype: int64


In [5]:
# Fill or drop missing values depending on context:
# - For text columns, fill with an empty string
# - For numerical columns, fill with 0 or median

text_columns = ['Brand Name','Product Name', 'Product Details', 'About This Item',
                'All Bullet Points', 'Additional Details', 
                'Brand Snapshot', 
                'Product Description']
for col in text_columns:
    data[col] = data[col].fillna('')

# For numerical columns:
num_columns = ['Rating', 'Rating Count', 'Price (INR)', 
               'Original Price (INR)', 'Discount']
for col in num_columns:
    data[col] = data[col].fillna(0)

In [6]:
data.head()
len(data)

1930

## STEP 3: Clean Text Columns

In [7]:
def clean_text(text):
    text = re.sub(r'\s+', ' ', str(text))  # Remove extra whitespace
    text = text.strip()  # Remove leading/trailing spaces
    return text

for col in text_columns:
    data[col] = data[col].apply(clean_text)

## STEP 4: Encode Categorical Columns

In [8]:
le_brand = LabelEncoder()
data['Brand Name'] = le_brand.fit_transform(data['Brand Name'].astype(str))

## STEP 5: Removing Duplicates


In [9]:
# Check number of duplicate rows
print("Number of duplicate rows:", data.duplicated().sum())


Number of duplicate rows: 0


## STEP 5: Save Preprocessed data


In [10]:
data.to_csv("./preprocessed_data/amazon_preproc_data_1930.csv", index=False)


In [11]:
data.head()

Unnamed: 0,Data ID,Product URL,Brand Name,Product Name,Rating,Rating Count,Price (INR),Original Price (INR),Discount,Product Details,About This Item,All Bullet Points,Additional Details,Brand Snapshot,Product Description
0,B0DCZ25LBC,https://www.amazon.in/Nermosa-Block-Printed-Ku...,139,Women Cotton Block Printed Kurta Pant With Dup...,4.1 out of 5 stars,1438,₹599.0,₹4999.0,88% off,"{'Material composition': '100% Viscose', 'Leng...",{'Style Info': '★ Style :- Straight Kurta || S...,['★ Fit Type: Straight; Ethnic Sets : Straight...,"{'Manufacturer': 'Nermosa Fashion', 'Packer': ...","{'Brand Name': 'Nermosa', 'Top Brand Heading':...",
1,B0D2TZDWH5,https://www.amazon.in/Yashika-Fashionably-Trad...,211,"Women's Kurta Fashionably Traditional, Comfort...",3.7 out of 5 stars,476,₹649.0,₹2599.0,75% off,"{'Material composition': '100% Cotton Blend', ...",{'Style Info': 'Quality Craftsmanship: Crafted...,['Elegant Ensemble: A curated trio comprising ...,"{'Country of Origin': 'India', 'Manufacturer':...","{'Brand Name': 'Yashika', 'Top Brand Heading':...",Introducing Yashika Women's Ethnic Trio: a del...
2,B0DG5XZRZM,https://www.amazon.in/KLOSIA-Women-Printed-Kur...,99,Women Printed Kurta and Pant Set with Dupatta,4.0 out of 5 stars,750,₹699.0,₹4999.0,86% off,"{'Material composition': '100% Viscose', 'Slee...",{'Style Info': 'Style :- Anarkali Kurta || Sle...,['Fit Type: Straight; Ethnic Sets: Anarkali Ku...,"{'Manufacturer': 'KLOSIA', 'Packer': 'Klosia E...","{'Brand Name': 'KLOSIA', 'Top Brand Heading': ...",
3,B0DDXX1HS5,https://www.amazon.in/MEERA-FAB-Straight-Print...,119,Women's Cotton Blend Straight Printed Kurta wi...,3.5 out of 5 stars,98,₹789.0,₹2999.0,74% off,"{'Material composition': 'PURE COTTON', 'Sleev...","{'Fabric Info': 'Fabric Kurta :- Cotton , Pala...","['Size Chart- Kurta- XS- 34 in , S-36 in , M-3...","{'Country of Origin': 'India', 'Manufacturer':...","{'Brand Name': 'MEERA FAB', 'Top Brand Heading...",
4,B0DQCL9XGC,https://www.amazon.in/Rangnavi-Anarkali-Printe...,160,Women's Cotton Anarkali Printed Kurta with Pal...,3.4 out of 5 stars,145,₹798.0,₹2499.0,68% off,"{'Material composition': 'Pure', 'Sleeve type'...","{'Size Chart': 'Size Chart- Kurta- XS- 34 in ,...","['Size Chart- Kurta- XS- 34 in , S-36 in , M-3...","{'Country of Origin': 'India', 'Manufacturer':...","{'Brand Name': 'Rangnavi', 'Top Brand Heading'...",


## STEP 6: Loading API_Key

In [12]:
# Load .env variables
load_dotenv()

# Access the key
openai_api_key = os.getenv('OPENAI_API_KEY')

print("Loaded API Key:", openai_api_key[:4] + "****")  # print partial key to confirm

# Now set the key for openai
openai.api_key = openai_api_key

Loaded API Key: sk-p****


## STEP 7: Creating Embeddings

In [13]:
clean_data = pd.read_csv('./preprocessed_data/amazon_preproc_data_1930.csv')
clean_data.head()
len(clean_data)

1930

In [14]:
text_columns = [
    'Product Name',
    'Product Details',
    'About This Item',
    'All Bullet Points',
    'Additional Details',
    'Product Description'
]

def concat_text(row):
    texts = [str(row[col]) for col in text_columns if pd.notna(row[col])]
    return " ".join(texts)

clean_data['combined_text'] = clean_data.apply(concat_text, axis=1)

In [15]:
clean_data.head()['combined_text'][0]

"Women Cotton Block Printed Kurta Pant With Dupatta {'Material composition': '100% Viscose', 'Length': 'Calf Length', 'Sleeve type': '3/4 Sleeve', 'Neck style': 'Round Neck', 'Style': 'Regular', 'Material type': 'Cotton', 'Manufacturer': 'Nermosa Fashion', 'Packer': 'Nermosa Fashion', 'Item Weight': '300 g', 'Item Dimensions LxWxH': '30 x 10 x 3 Centimeters', 'Generic Name': 'Kurta Set'} {'Style Info': '★ Style :- Straight Kurta || Sleeve Length :- 3/4 Sleeve || Bottom :- Pant || Dupatta :- Chanderi Cotton ||', 'Length Info': '★ Style :- Straight Kurta || Sleeve Length :- 3/4 Sleeve || Bottom :- Pant || Dupatta :- Chanderi Cotton ||', 'Occasion / Usage': '★ Item Length :- Casual || Occasion :- Casual ,Office wear ,Party ,Wedding , Ragular & Festive'} ['★ Fit Type: Straight; Ethnic Sets : Straight Kurta and pant Set with Dupatta', '★ Product Material :- Viscose || Colour :- White|| Pattern :- Block Printed ||', '★ Style :- Straight Kurta || Sleeve Length :- 3/4 Sleeve || Bottom :- Pant 

In [18]:
# Metadata columns to keep
metadata_columns = [
    'Product Name',
    'Brand Name',
    'Rating',
    'Rating Count',
    'Price (INR)',
    'Original Price (INR)',
    'Discount'
]

# Function to get embedding from OpenAI
def get_embedding(text, model="text-embedding-3-large"):
    response = openai.embeddings.create(
        input=text,
        model=model
    )
    return response.data[0].embedding
    
# Generate embeddings for all rows
embeddings = []
for text in tqdm(clean_data['combined_text'], desc="Generating Embeddings"):
    emb = get_embedding(text)
    embeddings.append(emb)

clean_data['embedding'] = embeddings

Generating Embeddings:  11%|█▉               | 214/1930 [03:49<30:37,  1.07s/it]


KeyboardInterrupt: 

In [20]:
import json
import datetime

metadata_columns

# Prepare data to save
output_data = []
for _, row in clean_data.iterrows():
    item = {
        'embedding': row['embedding'],
        'metadata': {col: row[col] for col in metadata_columns},
        'createdAt': datetime.datetime.utcnow().isoformat() + 'Z'  # ISO 8601 format with UTC indicator
    }
    output_data.append(item)

# Save to JSON file
with open('./product_embeddings/product_embeddings2.json', 'w') as f:
    json.dump(output_data, f, indent=4)  # Optional: pretty print

print("Embeddings, metadata, and createdAt saved to product_embeddings2.json")


  'createdAt': datetime.datetime.utcnow().isoformat() + 'Z'  # ISO 8601 format with UTC indicator


Embeddings, metadata, and createdAt saved to product_embeddings2.json


In [21]:
import json

with open('./product_embeddings/product_embeddings2.json', 'r') as f:
    data = json.load(f)

# Print the first item for a quick check
print(data[0])


{'embedding': [-0.02540482021868229, -0.004101242404431105, -0.00835502427071333, 0.04687134176492691, 0.026125919073820114, 0.0019327494082972407, -0.0427388958632946, -0.0034685484133660793, -0.002698915544897318, 0.008687838912010193, 0.0413799062371254, -0.05252918228507042, 0.021022766828536987, -0.07943166792392731, -0.018082907423377037, 0.015850279480218887, 0.008611569181084633, 0.01489343773573637, 0.026528069749474525, 0.017015129327774048, 0.009429736994206905, 0.036997850984334946, 0.0027803857810795307, -0.0008424360421486199, 0.023491138592362404, 0.014948906376957893, -0.029259920120239258, -0.005342361982911825, 0.010192436166107655, 0.0035534854978322983, 0.014123804867267609, 0.0004398521559778601, 0.010830330662429333, 0.010192436166107655, -0.07571524381637573, 0.01655750907957554, 0.01060845423489809, 0.04093615338206291, 0.037885356694459915, -0.04845220968127251, -0.02550189197063446, 0.005869317799806595, -0.026222988963127136, 0.00018016605463344604, -0.008576