In [None]:
# Importing relevant libraries
import json
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
import seaborn as sns
from scipy.sparse import csr_matrix
from scipy.stats import skew
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Mount Google Drive at /content/drive so files under it can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the dataset from the mounted Drive folder (the drive.mount cell must have run first).
new_data = pd.read_csv('/content/drive/MyDrive/new-data.csv')

In [None]:
# (rows, columns) of the freshly loaded frame
new_data.shape

(630808, 22)

In [None]:
# Number of missing values in each column
new_data.isna().sum()

Unnamed: 0            0
author_id             0
product_id            0
product_name_x        0
brand_name_x          0
loves_count           0
rating_x              0
reviews               0
size                  0
ingredients           0
price_usd_x           0
highlights            0
primary_category      0
secondary_category    0
tertiary_category     0
is_recommended        0
helpfulness           0
review_text           0
skin_tone             0
eye_color             0
skin_type             0
hair_color            0
dtype: int64

In [None]:
# Number of rows that are exact duplicates of an earlier row
new_data.duplicated().sum()

0

In [None]:
# Column names available in the loaded frame
new_data.columns

Index(['Unnamed: 0', 'author_id', 'product_id', 'product_name_x',
       'brand_name_x', 'loves_count', 'rating_x', 'reviews', 'size',
       'ingredients', 'price_usd_x', 'highlights', 'primary_category',
       'secondary_category', 'tertiary_category', 'is_recommended',
       'helpfulness', 'review_text', 'skin_tone', 'eye_color', 'skin_type',
       'hair_color'],
      dtype='object')

In [None]:
# Drop the 'Unnamed: 0' column (presumably a leftover CSV index column — confirm).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
new_data = new_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# (rows, columns) after removing 'Unnamed: 0'
new_data.shape

(630808, 21)

### Data Preprocessing

In [None]:
# Build the modelling frame by removing the columns listed below.
# drop() returns a new frame, so new_data itself is left unchanged.
new_data_1 = new_data.drop(
    columns=[
        'product_name_x',
        'author_id',
        'size',
        'ingredients',
        'highlights',
        'primary_category',
        'secondary_category',
        'review_text',
        'brand_name_x',
        'tertiary_category',
    ]
)

In [None]:
# (rows, columns) of the reduced frame
new_data_1.shape

(630808, 11)

### One-Hot Encoding (OHE)

In [None]:
# Columns that get expanded into one indicator column per distinct value;
# every other column of new_data_1 is carried over as-is.
categorical_cols = [
    'product_id',
    'skin_tone',
    'eye_color',
    'hair_color',
    'skin_type',
]
df_encoded = pd.get_dummies(data=new_data_1, columns=categorical_cols)

In [None]:
# (rows, columns) after one-hot encoding
df_encoded.shape

(630808, 1554)

### Checking for Skewness & Log Transformation

In [None]:
def _interpret_skewness(value):
    """Map a skewness coefficient to a human-readable label."""
    # NaN fails every ordering comparison, so it must be handled explicitly;
    # otherwise it would fall through to the last label.
    if np.isnan(value):
        return "Undefined (skewness is NaN)"
    if value < -1:
        return "Highly left-skewed"
    if value < -0.5:
        return "Moderately left-skewed"
    if value <= 0.5:
        return "Approximately symmetric"
    if value < 1:
        return "Moderately right-skewed"
    return "Highly right-skewed"


def check_skewness(data):
    """Compute and label the skewness of every column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are passed one by one to ``scipy.stats.skew``.

    Returns
    -------
    skewness : dict
        Column name -> skewness coefficient returned by ``scipy.stats.skew``.
    interpretation : dict
        Column name -> label. Thresholds: below -1 "Highly left-skewed",
        [-1, -0.5) "Moderately left-skewed", [-0.5, 0.5] "Approximately
        symmetric", (0.5, 1) "Moderately right-skewed", 1 and above
        "Highly right-skewed". A NaN coefficient is labelled
        "Undefined (skewness is NaN)" instead of being reported as skewed.
    """
    skewness = {}
    interpretation = {}
    for column in data.columns:
        # Convert the column to a NumPy array before handing it to scipy
        column_skewness = skew(np.asarray(data[column]))
        skewness[column] = column_skewness
        interpretation[column] = _interpret_skewness(column_skewness)
    return skewness, interpretation

# Numeric columns whose distribution shape is inspected
columns = ["loves_count", "rating_x", "reviews", "price_usd_x"]
skewness, interpretation = check_skewness(df_encoded[columns])

# Two lines per column followed by a blank separator line
for column in columns:
    print(f"Skewness for {column} : {skewness[column]}")
    print(f"Interpretation for {column} : {interpretation[column]}", end="\n\n")

Skewness for loves_count : 2.3711238911442045
Interpretation for loves_count : Highly right-skewed

Skewness for rating_x : -0.9422155184528447
Interpretation for rating_x : Moderately left-skewed

Skewness for reviews : 1.5276858861870186
Interpretation for reviews : Highly right-skewed

Skewness for price_usd_x : 3.4259434143105993
Interpretation for price_usd_x : Highly right-skewed



In [None]:
# Log-transform the two highly right-skewed columns, overwriting them in df_encoded.
#
# - np.log1p (log(1 + x)) is used instead of np.log so that a value of 0 maps
#   to 0 rather than -inf.
# - The input is read from new_data_1 (the untransformed frame df_encoded was
#   built from, sharing the same index), so re-running this cell does not apply
#   the logarithm a second time.
for column in ['loves_count', 'price_usd_x']:
    df_encoded[column] = np.log1p(new_data_1[column])

### Scaling the data

In [None]:
# Numerical columns to rescale to the [0, 1] range
numerical_cols = ['loves_count', 'price_usd_x', 'rating_x']

# MinMaxScaler rejects infinite values (which a logarithm of 0 would produce).
# Clipping them to a huge sentinel such as -1e9 would make that sentinel the
# column minimum and squash every genuine value to ~1 after scaling, so any
# infinity is instead pulled back to the column's own finite minimum / maximum.
# When the columns contain no infinities this step leaves them unchanged.
numeric_block = df_encoded[numerical_cols].astype(float)
finite_only = numeric_block.replace([np.inf, -np.inf], np.nan)
df_encoded[numerical_cols] = numeric_block.clip(
    lower=finite_only.min(), upper=finite_only.max(), axis=1
)

# Scale each column independently to [0, 1]
scaler = MinMaxScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

### Nearest Neighbours

In [None]:
# Map every distinct author ID / product name to a row / column position
author_ids = new_data['author_id'].unique()
author_id_to_index = {author_id: index for index, author_id in enumerate(author_ids)}
product_names = new_data['product_name_x'].unique()
product_name_to_index = {name: index for index, name in enumerate(product_names)}

# csr_matrix adds together entries that share the same (row, col) position, so
# an author who reviewed the same product more than once would end up with the
# SUM of their ratings. Collapse such repeats to their mean rating first so
# every stored value is a single rating for that author/product pair.
pair_ratings = new_data.groupby(
    ['author_id', 'product_name_x'], as_index=False, sort=False
)['rating_x'].mean()

# Coordinates and values of the non-empty cells (one per author/product pair)
row_indices = pair_ratings['author_id'].map(author_id_to_index).to_numpy()
col_indices = pair_ratings['product_name_x'].map(product_name_to_index).to_numpy()
ratings = pair_ratings['rating_x'].to_numpy(dtype=float)

# Author x product rating matrix: rows follow author_ids, columns follow product_names
sparse_matrix = csr_matrix(
    (ratings, (row_indices, col_indices)),
    shape=(len(author_ids), len(product_names)),
)

In [None]:
# Show the matrix summary (shape, dtype, number of stored elements)
sparse_matrix

<364274x1513 sparse matrix of type '<class 'numpy.float64'>'
	with 629695 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search using cosine distance
nearest_neighbor_model = NearestNeighbors(algorithm='brute', metric='cosine')

# Fit on sparse_matrix: each of its rows (one per author_id) is a sample.
# fit() returns the estimator itself, so `model` and `nearest_neighbor_model`
# refer to the same fitted object.
model = nearest_neighbor_model.fit(sparse_matrix)

In [None]:
def recommend_product(product_name, n_neighbors=5):
    """Return the products whose rating patterns are closest to `product_name`.

    Similarity is the cosine distance between product columns of the
    author x product `sparse_matrix`, i.e. products rated similarly by the
    same authors are considered close.

    Parameters
    ----------
    product_name : str
        A product name present in `product_name_to_index`.
    n_neighbors : int, default 5
        Number of recommendations to return.

    Returns
    -------
    list
        Up to `n_neighbors` product names, nearest first, excluding the
        queried product itself.

    Raises
    ------
    KeyError
        If `product_name` is not a known product.
    """
    if product_name not in product_name_to_index:
        raise KeyError(f"Unknown product name: {product_name!r}")
    product_index = product_name_to_index[product_name]

    # sparse_matrix has one row per author and one column per product.
    # Transpose it so every product becomes a row vector; querying the
    # untransposed matrix with a product index would fetch an author's row and
    # return author positions, which are meaningless as product indices.
    item_matrix = sparse_matrix.T.tocsr()

    # A dedicated item-level model is fitted here rather than reading the
    # global `model`, which is fitted on author rows and is reassigned later
    # in the notebook.
    item_model = NearestNeighbors(metric='cosine', algorithm='brute').fit(item_matrix)

    # Request one extra neighbour: the queried product is normally returned as
    # its own nearest neighbour and is filtered out below.
    k = min(n_neighbors + 1, item_matrix.shape[0])
    distances, indices = item_model.kneighbors(
        item_matrix.getrow(product_index), n_neighbors=k
    )

    recommended_products = [
        product_names[index] for index in indices.flatten() if index != product_index
    ]
    return recommended_products[:n_neighbors]


In [None]:
# Example query; the name must be a key of product_name_to_index
recommend_product('1% Vitamin A Retinol Serum')

['The Moisturizing Matte Lotion',
 'The Eye Balm Intense',
 'The Moisturizing Soft Lotion',
 'The Cleansing Foam',
 'The Lifting and Firming Mask']

### Model 2: Nearest Neighbours on the Encoded Features

In [None]:
# (rows, columns) of the encoded frame that is split into samples below
df_encoded.shape

(630808, 1554)

In [None]:
sample_size = len(df_encoded) // 4  # rows per sample: one quarter of the frame

# Draw four equally sized samples one after another. After each of the first
# three draws the drawn index labels are dropped from the pool, so later draws
# cannot pick those rows again (assuming index labels are unique).
remaining_df = df_encoded
quarter_samples = []
for draw in range(4):
    quarter = remaining_df.sample(n=sample_size, random_state=42)
    quarter_samples.append(quarter)
    if draw < 3:
        # the rows of the final draw are left in remaining_df
        remaining_df = remaining_df.drop(quarter.index)

sample_1, sample_2, sample_3, sample_4 = quarter_samples

In [None]:
# (rows, columns) of the first sample
sample_1.shape

(157702, 1554)

In [None]:
# (rows, columns) of the second sample
sample_2.shape

(157702, 1554)

In [None]:
# (rows, columns) of the third sample
sample_3.shape

(157702, 1554)

In [None]:
# (rows, columns) of the fourth sample
sample_4.shape

(157702, 1554)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search using cosine distance.
# NOTE: this rebinds `nearest_neighbor_model` and `model`, names that the
# "Nearest Neighbours" section also assigns; any code reading the global
# `model` after this cell sees this fit instead of the earlier one.
nearest_neighbor_model = NearestNeighbors(algorithm='brute', metric='cosine')

# Fit on sample_1, the first quarter-sample of the encoded feature frame
# (a DataFrame, not the author x product sparse matrix).
model = nearest_neighbor_model.fit(sample_1)