In [39]:
import io

import pandas as pd
import sklearn
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, TargetEncoder

# import numpy as np

In [40]:
# Load the raw product table from disk.
product_info_file_path = '../data/product_info.csv'
data = pd.read_csv(product_info_file_path)

# DataFrame.info() writes its report to a stream and returns None, so
# assigning its return value stores nothing. Capture the report in a text
# buffer so `data_info` really holds the summary, then echo it as before.
_info_buffer = io.StringIO()
data.info(buf=_info_buffer)
data_info = _info_buffer.getvalue()
print(data_info, end='')

# First rows of the raw frame, kept for quick inspection.
data_head = data.head()

# Last expression of the cell: show the column index.
data.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8494 entries, 0 to 8493
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          8494 non-null   object 
 1   product_name        8494 non-null   object 
 2   brand_id            8494 non-null   int64  
 3   brand_name          8494 non-null   object 
 4   loves_count         8494 non-null   int64  
 5   rating              8216 non-null   float64
 6   reviews             8216 non-null   float64
 7   size                6863 non-null   object 
 8   variation_type      7050 non-null   object 
 9   variation_value     6896 non-null   object 
 10  variation_desc      1250 non-null   object 
 11  ingredients         7549 non-null   object 
 12  price_usd           8494 non-null   float64
 13  value_price_usd     451 non-null    float64
 14  sale_price_usd      270 non-null    float64
 15  limited_edition     8494 non-null   int64  
 16  new   

Index(['product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count',
       'rating', 'reviews', 'size', 'variation_type', 'variation_value',
       'variation_desc', 'ingredients', 'price_usd', 'value_price_usd',
       'sale_price_usd', 'limited_edition', 'new', 'online_only',
       'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category',
       'secondary_category', 'tertiary_category', 'child_count',
       'child_max_price', 'child_min_price'],
      dtype='object')

In [41]:
# Show the raw frame; the rendered output elides the middle rows and columns.
data

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count,child_max_price,child_min_price
0,P473671,Fragrance Discovery Set,6342,19-69,6320,3.6364,11.0,,,,...,1,0,0,"['Unisex/ Genderless Scent', 'Warm &Spicy Scen...",Fragrance,Value & Gift Sets,Perfume Gift Sets,0,,
1,P473668,La Habana Eau de Parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,85.0,30.0
2,P473662,Rainbow Bar Eau de Parfum,6342,19-69,3253,4.2500,16.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0
3,P473660,Kasbah Eau de Parfum,6342,19-69,3018,4.4762,21.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0
4,P473658,Purple Haze Eau de Parfum,6342,19-69,2691,3.2308,13.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8489,P467659,Couture Clutch Eyeshadow Palette,1070,Yves Saint Laurent,2790,4.4286,7.0,,,,...,0,0,0,,Makeup,Eye,Eye Palettes,0,,
8490,P500874,L'Homme Eau de Parfum,1070,Yves Saint Laurent,2319,4.6367,556.0,2 oz / 60 mL,Size + Concentration + Formulation,2 oz / 60 mL eau de parfum spray,...,0,0,0,"['Layerable Scent', 'Woody & Earthy Scent']",Fragrance,Men,Cologne,1,129.0,129.0
8491,P504428,Mon Paris Eau de Parfum Gift Set,1070,Yves Saint Laurent,1475,5.0000,2.0,,,,...,1,1,0,,Fragrance,Value & Gift Sets,Perfume Gift Sets,0,,
8492,P504448,Y Eau de Parfum Gift Set,1070,Yves Saint Laurent,840,,,,,,...,1,0,0,,Fragrance,Value & Gift Sets,Cologne Gift Sets,0,,


In [42]:
# Columns left out of the working frame.
dropped_cols = [
    'size', 'variation_type', 'variation_value', 'variation_desc',
    'value_price_usd', 'sale_price_usd',
    'limited_edition', 'new', 'online_only', 'out_of_stock', 'sephora_exclusive',
    'child_count', 'child_max_price', 'child_min_price',
]

# Build the working frame from the raw data without touching `data`:
# remove the unused columns and rename 'rating' to 'overall_rating'.
product_info_df = (
    data
    .drop(columns=dropped_cols)
    .rename(columns={'rating': 'overall_rating'})
)

# 1. Handle Missing Values
# Numerical columns ('overall_rating', 'reviews', 'price_usd'): fill gaps
# with the column median.
numerical_cols = ['overall_rating', 'reviews', 'price_usd']
num_imputer = SimpleImputer(strategy='median')
imputed_numeric = num_imputer.fit_transform(product_info_df[numerical_cols])
product_info_df[numerical_cols] = imputed_numeric

In [43]:
# Category columns: fill gaps with each column's most frequent value.
categorical_cols = [
    'primary_category',
    'secondary_category',
    'tertiary_category',
]
cat_imputer = SimpleImputer(strategy='most_frequent')
imputed_categories = cat_imputer.fit_transform(product_info_df[categorical_cols])
product_info_df[categorical_cols] = imputed_categories

In [44]:
# # Encoding binary categorical columns
# binary_cols = ['limited_edition', 'new', 'online_only', 'out_of_stock', 'sephora_exclusive']
# product_info_df[binary_cols] = product_info_df[binary_cols].astype(int)

In [45]:
# Replace brand and category strings with integer codes.
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`
# under the column name.
label_encoder_cols = ['brand_name', 'primary_category', 'secondary_category', 'tertiary_category']
label_encoders = {}
for col in label_encoder_cols:
    encoder = LabelEncoder()
    # Values are cast to str before fitting, as the encoder is fed text labels.
    product_info_df[col] = encoder.fit_transform(product_info_df[col].astype(str))
    label_encoders[col] = encoder

In [46]:
# # Feature Engineering
# product_info_df['discount_amount'] = product_info_df['price_usd'] - product_info_df['sale_price_usd']
# product_info_df['discount_amount'].fillna(0, inplace=True)

In [47]:
# Standardize the numerical columns with StandardScaler; the fitted
# `scaler` is kept for later use.
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(product_info_df[numerical_cols])
product_info_df[numerical_cols] = scaled_numeric

In [48]:
# Text Processing: TF-IDF features for 'highlights' and 'ingredients'.

def _fit_tfidf(texts, max_features=50):
    """Fit a TF-IDF vectorizer on `texts`.

    Returns a tuple (vectorizer, sparse_matrix, dense_frame), where
    `dense_frame` has one column per vocabulary term.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    matrix = vectorizer.fit_transform(texts)
    frame = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())
    return vectorizer, matrix, frame


# Missing text becomes an empty string so the vectorizer receives only str values.
for text_col in ('highlights', 'ingredients'):
    product_info_df[text_col] = product_info_df[text_col].fillna('')

# Vocabulary is capped at 50 terms per column to simplify analysis.
# NOTE(review): the two vocabularies are fitted independently and may share
# tokens; if they do, concatenating both frames side by side yields duplicate
# column names. Confirm and consider prefixing the columns.
tfidf_highlights, highlights_tfidf_matrix, highlights_df = _fit_tfidf(product_info_df['highlights'])
tfidf_ingredients, ingredients_tfidf_matrix, ingredients_df = _fit_tfidf(product_info_df['ingredients'])

In [49]:

# Merge TF-IDF features into processed data.
# The raw text columns are removed by rebinding with errors='ignore' rather
# than an in-place drop, so running this cell a second time no longer raises
# KeyError once the columns are already gone.
product_info_df = product_info_df.drop(columns=['highlights', 'ingredients'], errors='ignore')
processed_data = pd.concat(
    [product_info_df.reset_index(drop=True), highlights_df, ingredients_df],
    axis=1,
)

# The three frames are joined positionally; a row-count mismatch would
# otherwise introduce NaN-padded rows without any error.
assert len(processed_data) == len(product_info_df), "TF-IDF frames are not row-aligned with product_info_df"

# Display a sample of the processed data
processed_data_sample = processed_data.head()
processed_data_sample


Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,overall_rating,reviews,price_usd,primary_category,secondary_category,...,potassium,red,root,seed,silica,sodium,stearate,titanium,tocopherol,water
0,P473671,Fragrance Discovery Set,6342,0,6320,-1.10385,-0.393313,-0.310356,1,38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,P473668,La Habana Eau de Parfum,6342,0,3827,-0.086183,-0.39147,2.671043,1,40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,P473662,Rainbow Bar Eau de Parfum,6342,0,3253,0.103032,-0.388706,2.671043,1,40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,P473660,Kasbah Eau de Parfum,6342,0,3018,0.547941,-0.384098,2.671043,1,40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,P473658,Purple Haze Eau de Parfum,6342,0,2691,-1.901619,-0.39147,2.671043,1,40,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Encode product_name as a cluster id.
# Names are read from product_info_df, whose rows are in the same order as
# processed_data and whose 'product_name' column still holds the original
# strings. Reading them from processed_data would break on a re-run, because
# this cell overwrites that column with integer cluster ids.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(product_info_df["product_name"])

# KMeans clustering. A fixed random_state makes the cluster ids reproducible
# across kernel restarts.
kmeans = KMeans(n_clusters=50, random_state=42)
processed_data['product_name'] = kmeans.fit_predict(X)


In [51]:
# Show the processed frame; 'product_name' now holds KMeans cluster ids.
processed_data

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,overall_rating,reviews,price_usd,primary_category,secondary_category,...,potassium,red,root,seed,silica,sodium,stearate,titanium,tocopherol,water
0,P473671,39,6342,0,6320,-1.103850,-0.393313,-0.310356,1,38,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,P473668,8,6342,0,3827,-0.086183,-0.391470,2.671043,1,40,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
2,P473662,8,6342,0,3253,0.103032,-0.388706,2.671043,1,40,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
3,P473660,8,6342,0,3018,0.547941,-0.384098,2.671043,1,40,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
4,P473658,8,6342,0,2691,-1.901619,-0.391470,2.671043,1,40,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8489,P467659,30,1070,281,2790,0.454318,-0.396998,1.832525,4,12,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.232352,0.224967,0.000000,0.000000
8490,P500874,8,1070,281,2319,0.863627,0.108858,1.012640,1,25,...,0.0,0.287901,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.139044
8491,P504428,48,1070,281,1475,1.578197,-0.401605,1.534385,1,38,...,0.0,0.222931,0.000000,0.153491,0.0,0.127742,0.000000,0.000000,0.000000,0.107666
8492,P504448,48,1070,281,840,0.180429,-0.291036,2.149298,1,38,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.179353
