In [None]:
#importing relevant libraries
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.subplots as sp
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")
from scipy.stats import skew
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
new_data = pd.read_csv('/content/drive/MyDrive/new-data.csv')

In [None]:
new_data.shape

(630808, 22)

In [None]:
new_data.isna().sum()

Unnamed: 0            0
author_id             0
product_id            0
product_name_x        0
brand_name_x          0
loves_count           0
rating_x              0
reviews               0
size                  0
ingredients           0
price_usd_x           0
highlights            0
primary_category      0
secondary_category    0
tertiary_category     0
is_recommended        0
helpfulness           0
review_text           0
skin_tone             0
eye_color             0
skin_type             0
hair_color            0
dtype: int64

In [None]:
new_data.head()

Unnamed: 0.1,Unnamed: 0,author_id,product_id,product_name_x,brand_name_x,loves_count,rating_x,reviews,size,ingredients,...,primary_category,secondary_category,tertiary_category,is_recommended,helpfulness,review_text,skin_tone,eye_color,skin_type,hair_color
0,0,5880814443,P439055,GENIUS Sleeping Collagen Moisturizer,Algenist,33910.0,4.5413,1321.0,2 oz/ 60 mL,"['Collagen (Vegan)*, Water (Aqua, Eau), Ethylh...",...,Skincare,Moisturizers,Moisturizers,1.0,1.0,"Ever since I bought this, I noticed that my sk...",medium,brown,oily,black
1,3,8222942765,P439055,GENIUS Sleeping Collagen Moisturizer,Algenist,33910.0,4.5413,1321.0,2 oz/ 60 mL,"['Collagen (Vegan)*, Water (Aqua, Eau), Ethylh...",...,Skincare,Moisturizers,Moisturizers,1.0,0.8,I’ve been using this for 2 months now and I ca...,light,brown,combination,blonde
2,4,2403670662,P439055,GENIUS Sleeping Collagen Moisturizer,Algenist,33910.0,4.5413,1321.0,2 oz/ 60 mL,"['Collagen (Vegan)*, Water (Aqua, Eau), Ethylh...",...,Skincare,Moisturizers,Moisturizers,0.0,0.111111,I don’t like the smell. It pills on your skin ...,fair,green,combination,blonde
3,6,25102053273,P439055,GENIUS Sleeping Collagen Moisturizer,Algenist,33910.0,4.5413,1321.0,2 oz/ 60 mL,"['Collagen (Vegan)*, Water (Aqua, Eau), Ethylh...",...,Skincare,Moisturizers,Moisturizers,1.0,0.833333,Love this product. I use it every night. Goes ...,fair,hazel,combination,brown
4,7,1117898902,P439055,GENIUS Sleeping Collagen Moisturizer,Algenist,33910.0,4.5413,1321.0,2 oz/ 60 mL,"['Collagen (Vegan)*, Water (Aqua, Eau), Ethylh...",...,Skincare,Moisturizers,Moisturizers,1.0,0.333333,"Great product. It is so moisturizing. However,...",lightMedium,green,dry,brown


In [None]:
new_data.duplicated().sum()

0

In [None]:
new_data.columns

Index(['Unnamed: 0', 'author_id', 'product_id', 'product_name_x',
       'brand_name_x', 'loves_count', 'rating_x', 'reviews', 'size',
       'ingredients', 'price_usd_x', 'highlights', 'primary_category',
       'secondary_category', 'tertiary_category', 'is_recommended',
       'helpfulness', 'review_text', 'skin_tone', 'eye_color', 'skin_type',
       'hair_color'],
      dtype='object')

In [None]:
new_data = new_data.drop(['Unnamed: 0'], axis=1)

In [None]:
new_data.shape

(630808, 21)

### Data Preprocessing

In [None]:
new_data_1= new_data.drop(['product_id','author_id','size', 'ingredients', 'highlights', 'primary_category',
                           'secondary_category', 'review_text','brand_name_x', 'tertiary_category', 'reviews'], axis=1)

In [None]:
new_data_1.shape

(630808, 10)

In [None]:
new_data_1.columns

Index(['product_name_x', 'loves_count', 'rating_x', 'price_usd_x',
       'is_recommended', 'helpfulness', 'skin_tone', 'eye_color', 'skin_type',
       'hair_color'],
      dtype='object')

### OHE

In [None]:
# Categorical columns to one-hot encode
categorical_cols = ['product_name_x', 'skin_tone', 'eye_color', 'hair_color', 'skin_type']
df_encoded = pd.get_dummies(new_data_1, columns=categorical_cols)

In [None]:
df_encoded.shape

(630808, 1549)

### Checking for Skewness & Log Transformation

In [None]:
def check_skewness(data):
    skewness = {}
    interpretation = {}
    for column in data.columns:
        # Convert the column data to a NumPy array
        column_data = np.array(data[column])
        # Calculate the skewness of the column data
        column_skewness = skew(column_data)
        skewness[column] = column_skewness
        # Interpret the skewness value
        if column_skewness < -1:
            interpretation[column] = "Highly left-skewed"
        elif -1 <= column_skewness < -0.5:
            interpretation[column] = "Moderately left-skewed"
        elif -0.5 <= column_skewness <= 0.5:
            interpretation[column] = "Approximately symmetric"
        elif 0.5 < column_skewness < 1:
            interpretation[column] = "Moderately right-skewed"
        else:
            interpretation[column] = "Highly right-skewed"
    return skewness, interpretation

columns = ["loves_count", "rating_x", "price_usd_x"]
skewness, interpretation = check_skewness(df_encoded[columns])
for column in columns:
    print("Skewness for", column, ":", skewness[column])
    print("Interpretation for", column, ":", interpretation[column])
    print()

Skewness for loves_count : 2.3711238911442045
Interpretation for loves_count : Highly right-skewed

Skewness for rating_x : -0.9422155184528447
Interpretation for rating_x : Moderately left-skewed

Skewness for price_usd_x : 3.4259434143105993
Interpretation for price_usd_x : Highly right-skewed



In [None]:
# Create a new DataFrame for log-transformed values

# Perform log transformation for each column
for column in ['loves_count','price_usd_x' ]:
    df_encoded[column] = np.log(df_encoded[column])

### Scaling the data

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Numerical columns to scale the data
numerical_cols = ['loves_count', 'price_usd_x', 'rating_x']

# Clip the values to a reasonable range
clip_min = -1e9  # Define the minimum value to clip
clip_max = 1e9   # Define the maximum value to clip
df_encoded[numerical_cols] = np.clip(df_encoded[numerical_cols], clip_min, clip_max)

# Scale the data
scaler = MinMaxScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

### Nearest Neighbours- Baseline Mode

In [None]:
# create a mapping from unique author IDs and product names to indices
author_ids = new_data['author_id'].unique()
author_id_to_index = {id: index for index, id in enumerate(author_ids)}
product_names = new_data['product_name_x'].unique()
product_name_to_index = {id: index for index, id in enumerate(product_names)}
# get the indices for the sparse matrix
row_indices = np.array([author_id_to_index[id] for id in new_data['author_id']])
col_indices = np.array([product_name_to_index[id] for id in new_data['product_name_x']])
ratings = np.array(new_data['rating_x'])
# create the sparse matrix
sparse_matrix = csr_matrix((ratings, (row_indices, col_indices)), shape=(len(author_ids), len(product_names)))

In [None]:
sparse_matrix

<364274x1513 sparse matrix of type '<class 'numpy.float64'>'
	with 629695 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.neighbors import NearestNeighbors
# instantiating Nearest Neighbors
nearest_neighbor_model = NearestNeighbors(metric='cosine', algorithm='brute')

#fitting the model to the sparse matrix
model = nearest_neighbor_model.fit(sparse_matrix)

In [None]:
def recommend_product(product_name):
    # Finding the index of the product in the pivot table
    product_index = product_name_to_index[product_name]

    # Finding the nearest neighbors of the given product
    distances, indices = model.kneighbors(sparse_matrix.getrow(product_index), n_neighbors=5)

    # Creating a list of recommended products
    recommended_products = []
    for index in indices.flatten():
        recommended_products.append(product_names[index])

    return recommended_products


In [None]:
recommend_product('1% Vitamin A Retinol Serum')

['The Moisturizing Matte Lotion',
 'The Eye Balm Intense',
 'The Moisturizing Soft Lotion',
 'The Cleansing Foam',
 'The Lifting and Firming Mask']

### Checking for the Accuracy of the model

In [None]:
nn_data = new_data[['author_id','product_name_x', 'rating_x']]

In [None]:
nn_data.columns

Index(['author_id', 'product_name_x', 'rating_x'], dtype='object')

In [None]:
# Split the dataset into training and testing sets

train_data, test_data = train_test_split(new_data, test_size=0.2, random_state=42)

In [None]:
from sklearn.metrics import accuracy_score

# Assuming you have a test set with the same structure as `new_data`

# Create the test sparse matrix
test_row_indices = np.array([author_id_to_index[id] for id in test_data['author_id']])
test_col_indices = np.array([product_name_to_index[id] for id in test_data['product_name_x']])
test_ratings = np.array(test_data['rating_x'])
test_sparse_matrix = csr_matrix((test_ratings, (test_row_indices, test_col_indices)), shape=(len(author_ids), len(product_names)))


In [None]:
def recommend_product(product_name):
    # Finding the index of the product in the pivot table
    product_index = product_name_to_index[product_name]

    # Finding the nearest neighbors of the given product
    distances, indices = model.kneighbors(sparse_matrix.getrow(product_index), n_neighbors=5)

    # Creating a list of recommended products
    recommended_products = []
    for index in indices.flatten():
        recommended_products.append(product_names[index])

    return recommended_products

In [None]:
from sklearn.metrics import accuracy_score

test_sparse_matrix = csr_matrix((test_ratings, (test_row_indices, test_col_indices)), shape=(len(author_ids), len(product_names)))

# Evaluate the model's accuracy on the test set
def evaluate_accuracy(test_sparse_matrix, product_names):
    true_labels = []
    predicted_labels = []

    for product_name in test_data['product_name_x']:
        # Get the true recommendation from the 'is_recommended' column
        true_label = test_data.loc[test_data['product_name_x'] == product_name, 'is_recommended'].values
        if len(true_label) > 0:
            true_labels.append(true_label[0])

            # Get the predicted recommendation from the recommend_product function
            recommended_products = recommend_product(product_name)
            if product_name in recommended_products:
                predicted_label = 1
            else:
                predicted_label = 0
            predicted_labels.append(predicted_label)

    # Check if either true_labels or predicted_labels is empty
    if len(true_labels) == 0 or len(predicted_labels) == 0:
        return 0.0  # Return 0.0 accuracy if there are no valid labels

    accuracy = accuracy_score(true_labels, predicted_labels)
    return accuracy

# Calculate the accuracy on the test set
accuracy = evaluate_accuracy(test_sparse_matrix, product_names)
print(f"Accuracy: {accuracy}")


TypeError: ignored

### Model 2-Cold start

In [None]:
df_encoded.shape

(630808, 1549)

In [None]:
sample_size = len(df_encoded) // 4  # Calculate the sample size

sample_1 = df_encoded.sample(n=sample_size, random_state=42)  # First sample
remaining_df = df_encoded.drop(sample_1.index)  # Remove the selected rows from the DataFrame

sample_2 = remaining_df.sample(n=sample_size, random_state=42)  # Second sample
remaining_df = remaining_df.drop(sample_2.index)

sample_3 = remaining_df.sample(n=sample_size, random_state=42)  # Third sample
remaining_df = remaining_df.drop(sample_3.index)

sample_4 = remaining_df.sample(n=sample_size, random_state=42)  # Fourth sample

In [None]:
sample_1.shape

In [None]:
# sample_2.shape

In [None]:
# sample_3.shape

In [None]:
# sample_4.shape

In [None]:
from sklearn.neighbors import NearestNeighbors
# instantiating Nearest Neighbors
nearest_neighbor_model = NearestNeighbors(metric='cosine', algorithm='brute')

#fitting the model to the sparse matrix
model_1 = nearest_neighbor_model.fit(sample_1)

In [None]:
# Create a DataFrame with user attributes
user_data = pd.DataFrame({
    'product_name_x': [''],
    'skin_tone': ['medium'],
    'eye_color': ['blue'],
    'skin_type': ['combination'],
    'hair_color': ['black'],
    'rating_x': [4.5413],
    'price_usd_x': [50],
    'is_recommended': [1],
    'helpfulness': [1]
})

# Apply one-hot encoding to user attributes
user_encoded = pd.get_dummies(user_data, columns=['skin_tone', 'eye_color', 'skin_type', 'hair_color'])


In [None]:
user_encoded.columns

Index(['product_name_x', 'rating_x', 'price_usd_x', 'is_recommended',
       'helpfulness', 'skin_tone_medium', 'eye_color_blue',
       'skin_type_combination', 'hair_color_black'],
      dtype='object')

In [None]:
# Reindex the user_encoded DataFrame to match the training data's columns
user_encoded = user_encoded.reindex(columns=sample_1.columns, fill_value=0)

In [None]:
# Find the nearest neighbors of the user data
distances, indices = model_1.kneighbors(user_encoded)

In [None]:
# Find the nearest neighbors of the user data
distances, indices = model_1.kneighbors(user_encoded)

# Retrieve the recommended product name x from the nearest neighbors
recommended_product_name_x = sample_1.index[indices[0][0]]
recommended_product_name_x

316064

In [None]:
recommended_product_row = sample_1.loc[recommended_product_name_x]
recommended_product_row

loves_count              1.00000
rating_x                 0.84739
price_usd_x              1.00000
is_recommended           0.00000
helpfulness              0.00000
                          ...   
hair_color_red           0.00000
skin_type_combination    1.00000
skin_type_dry            0.00000
skin_type_normal         0.00000
skin_type_oily           0.00000
Name: 316064, Length: 1549, dtype: float64

In [None]:
column_name = recommended_product_row.index[recommended_product_row.index.str.startswith('product_name_x')][0]


In [None]:
column_name

'product_name_x_"B" Oil'

In [None]:
recommended_product_names