<a href="https://colab.research.google.com/github/bryaanabraham/flipkart_personalized_prod_recommendation/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [119]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import hashlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.metrics import mean_squared_error

from scipy.sparse.linalg import svds # for sparse matrices

In [120]:
#Import the data set
df = pd.read_csv('/content/events (1).csv', header=None) #There are no headers in the data file

df.columns = ['event_time', 'event_type', 'prod_id', 'cat_id','cat_code','brand','price','user_id','user_sess'] #Adding column names

df = df.drop(['event_time','cat_code','user_sess','brand'], axis=1) #Dropping timestamp

df_copy = df.copy(deep=True) #Copying the data to another dataframe

In [121]:
def hash_to_float(input_str, min_value, max_value):
    # Use SHA-256 hash function
    sha256_hash = hashlib.sha256(repr(input_str).encode()).hexdigest()

    # Convert the hash to a floating-point value and map it to the desired range
    hash_float = int(sha256_hash, 16) / float(int('F'*64, 16))  # Convert hex hash to float
    mapped_float = min_value + (hash_float * (max_value - min_value))  # Map to the desired float range
    return mapped_float

# Define the range for float assignment
min_float = 1.5
max_float = 5.0

# Generate hash-based floats for each product
df['ratings'] = df['prod_id'].apply(lambda x: hash_to_float(x, min_float, max_float))

df.head()

Unnamed: 0,event_type,prod_id,cat_id,price,user_id,ratings
0,event_type,product_id,category_id,price,user_id,4.03267
1,view,1996170,2144415922528452715,31.90,1515915625519388267,2.252158
2,view,139905,2144415926932472027,17.16,1515915625519380411,2.976226
3,view,215454,2144415927158964449,9.81,1515915625513238515,2.613014
4,view,635807,2144415923107266682,113.81,1515915625519014356,4.878066


In [122]:
rows, columns = df.shape
print("No of rows = ", rows)
print("No of columns = ", columns)


No of rows =  61979
No of columns =  6


In [123]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61979 entries, 0 to 61978
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   event_type  61979 non-null  object 
 1   prod_id     61979 non-null  object 
 2   cat_id      61978 non-null  object 
 3   price       61978 non-null  object 
 4   user_id     61978 non-null  object 
 5   ratings     61979 non-null  float64
dtypes: float64(1), object(5)
memory usage: 2.8+ MB


In [124]:
df.isna().sum()

event_type    0
prod_id       0
cat_id        1
price         1
user_id       1
ratings       0
dtype: int64

In [125]:
print('Number of unique USERS in Raw data = ', df['user_id'].nunique())
print('Number of unique ITEMS in Raw data = ', df['prod_id'].nunique())
print('Number of unique ratings in Raw data = ', df['ratings'].nunique())

Number of unique USERS in Raw data =  32683
Number of unique ITEMS in Raw data =  16486
Number of unique ratings in Raw data =  16486


In [126]:
most_active = df.groupby('user_id').size().sort_values(ascending=False)[:10]
most_active

user_id
1515915625444050265    100
1515915625517787550    100
1515915625353514897     86
1515915625522463421     81
1515915625522779180     76
1515915625360775587     66
1515915625520712323     65
1515915625522082569     64
1515915625520191157     63
1515915625425059411     61
dtype: int64

In [127]:
counts = df['user_id'].value_counts()
df_final = df[df['user_id'].isin(counts[counts >= 60].index)]

In [128]:
print('The number of observations in the final data =', len(df_final))
print('Number of unique USERS in the final data = ', df_final['user_id'].nunique())
print('Number of unique PRODUCTS in the final data = ', df_final['prod_id'].nunique())

The number of observations in the final data = 883
Number of unique USERS in the final data =  12
Number of unique PRODUCTS in the final data =  296


In [129]:
best_prod_cat = df_final.groupby('cat_id').size().sort_values(ascending=False)[:10]
best_prod_cat
print('Number of unique ITEMS in final data = ', df_final['prod_id'].nunique())


Number of unique ITEMS in final data =  296


In [130]:
duplicates = df_final[df_final.duplicated(['user_id', 'prod_id'], keep=False)]
# Display duplicates
print(duplicates)

      event_type  prod_id               cat_id   price              user_id  \
331         view  1507368  2144415922016747613  172.86  1515915625360775587   
389         view  1803694  2144415923694469257  240.98  1515915625360775587   
428         view  1803694  2144415923694469257  240.98  1515915625360775587   
458         view  1803694  2144415923694469257  240.98  1515915625360775587   
747         view   910547  2144415923535085701   96.41  1515915625425059411   
...          ...      ...                  ...     ...                  ...   
60509       view   483337  2144415923744800906  365.25  1515915625392598666   
60510       view   669304  2144415923744800906  478.59  1515915625392598666   
60511       view   847412  2144415923744800906  120.00  1515915625392598666   
60512       view    37774  2144415923744800906  167.62  1515915625392598666   
60513       view   247085  2144415923744800906  262.48  1515915625392598666   

        ratings  
331    3.782181  
389    3.290230

In [131]:
df_final_agg = df_final.groupby(['user_id', 'prod_id'], as_index=False)['ratings'].mean()  # Or 'sum()' instead of 'mean()'
df_final_agg

Unnamed: 0,user_id,prod_id,ratings
0,1515915625353514897,247085,1.671026
1,1515915625353514897,322353,4.864812
2,1515915625353514897,37774,1.962064
3,1515915625353514897,3978887,3.955855
4,1515915625353514897,3979025,4.123565
...,...,...,...
305,1515915625522779180,903858,1.932618
306,1515915625523250254,367123,3.764566
307,1515915625523250254,602271,1.546291
308,1515915625523250254,602272,2.146806


In [132]:
mapping = {'view': 1, 'cart': 2, 'others': 3}
df_final_agg['encoded_event'] = df['event_type'].map(mapping)
df_final_agg.head()


Unnamed: 0,user_id,prod_id,ratings,encoded_event
0,1515915625353514897,247085,1.671026,
1,1515915625353514897,322353,4.864812,1.0
2,1515915625353514897,37774,1.962064,1.0
3,1515915625353514897,3978887,3.955855,1.0
4,1515915625353514897,3979025,4.123565,1.0


In [133]:
#Creating the interaction matrix of products and users based on ratings and replacing NaN value with 0
final_ratings_matrix = df_final_agg.pivot(index = 'user_id', columns ='prod_id', values = 'encoded_event').fillna(0)
print('Shape of final_ratings_matrix: ', final_ratings_matrix.shape)

#Finding the number of non-zero entries in the interaction matrix
given_num_of_ratings = np.count_nonzero(final_ratings_matrix)
print('given_num_of_ratings = ', given_num_of_ratings)

#Finding the possible number of ratings as per the number of users and products
possible_num_of_ratings = final_ratings_matrix.shape[0] * final_ratings_matrix.shape[1]
print('possible_num_of_ratings = ', possible_num_of_ratings)

#Density of ratings
density = (given_num_of_ratings/possible_num_of_ratings)
density *= 100
print ('density: {:4.2f}%'.format(density))

final_ratings_matrix.head()

Shape of final_ratings_matrix:  (12, 296)
given_num_of_ratings =  291
possible_num_of_ratings =  3552
density: 8.19%


prod_id,1004850,1004851,1009727,1012952,1013041,1013042,1013047,1013048,1013051,1013052,...,917352,917356,919584,937789,938118,940654,947169,951881,951884,952888
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1515915625353514897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1515915625360775587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1515915625392598666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1515915625425059411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1515915625444050265,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
