# Create Product Data + Time as A Part of Input

Using data from product data historical as a benchmark of target variable. Thereby, the prediction use recommendation with time series.

In [118]:
import time


# Memulai waktu
start_time = time.time()

In [221]:
#Data Importing
import pandas as pd
import numpy as np

#Exploratory Data Analysis (EDA)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go

#Preprocessing
import math


#Modelling
from surprise import Reader
from surprise import Dataset
import datetime
from surprise import SVD, NMF, KNNBasic, BaselineOnly, NormalPredictor
from surprise.model_selection import cross_validate
from surprise import BaselineOnly

from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split
from surprise.model_selection import GridSearchCV




# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer

#Evaluation
from sklearn.metrics import classification_report, accuracy_score, make_scorer, mean_squared_error, f1_score, confusion_matrix, r2_score

#Additional
import warnings
warnings.filterwarnings("ignore") 
from tqdm import tqdm
import os 

# Open Data

In [120]:
df = pd.read_csv('purchase_history.csv', sep = ';')

In [121]:
df = df[['customer_id', 'product_id', 'purchase_date']]

In [122]:
df_prod = pd.read_csv('product_details.csv', sep = ';')

In [123]:
df_prod = df_prod[['product_id', 'category', 'price', 'ratings']]

In [124]:
df_prod

Unnamed: 0,product_id,category,price,ratings
0,101,Electronics,500,4.5
1,102,Clothing,50,3.8
2,103,Home & Kitchen,200,4.2
3,104,Beauty,30,4.0
4,105,Electronics,800,4.8


In [125]:
df_cus = pd.read_csv('customer_interactions.csv', sep = ',')

In [126]:
df_cus

Unnamed: 0,customer_id,page_views,time_spent
0,1,25,120
1,2,20,90
2,3,30,150
3,4,15,80
4,5,22,110


In [127]:
df['purchase_date'] = pd.to_datetime(df['purchase_date'])

In [128]:
df = pd.merge(df, df_cus, on = ['customer_id'])
df = pd.merge(df, df_prod, on = ['product_id'])

In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   customer_id    6 non-null      int64         
 1   product_id     6 non-null      int64         
 2   purchase_date  6 non-null      datetime64[ns]
 3   page_views     6 non-null      int64         
 4   time_spent     6 non-null      int64         
 5   category       6 non-null      object        
 6   price          6 non-null      int64         
 7   ratings        6 non-null      float64       
dtypes: datetime64[ns](1), float64(1), int64(5), object(1)
memory usage: 516.0+ bytes


In [172]:
import plotly.express as px
df_period = df.groupby('purchase_date')['customer_id'].count().reset_index()


# Assuming df_period is your DataFrame
# Extract only the date part
df_period['purchase_date'] = df_period['purchase_date'].dt.date.astype(str)

# Plot the data using Plotly Express
fig = px.line(df_period, x='purchase_date', y='customer_id')

# Update the layout
fig.update_layout(
    title='Customer Count Trends',
    xaxis_title='Date',
    yaxis_title='Count'
)

# Set the x-axis type to 'category' to ensure correct ordering
fig.update_xaxes(type='category')
fig.update_yaxes(type='category')


# Show the plot
fig.show()


# Cleansing Data

### Duplicated Data

In [131]:
# Periksa duplikat
print("Jumlah duplikat:", df.duplicated().sum())

Jumlah duplikat: 0


In [132]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   customer_id    6 non-null      int64         
 1   product_id     6 non-null      int64         
 2   purchase_date  6 non-null      datetime64[ns]
 3   page_views     6 non-null      int64         
 4   time_spent     6 non-null      int64         
 5   category       6 non-null      object        
 6   price          6 non-null      int64         
 7   ratings        6 non-null      float64       
dtypes: datetime64[ns](1), float64(1), int64(5), object(1)
memory usage: 516.0+ bytes


### Outlier

In [133]:
# Check outlier
# No any data that far away from others.

def create_box_plot(column):
    trace = go.Box(y=df[column], name=column, boxpoints='all')
    return trace

box_plot_data = [create_box_plot(col) for col in df.columns if df[col].dtype not in ['object', 'datetime64[ns]']]

layout = go.Layout(title='Historical Data Distribution Summary')
fig = go.Figure(data=box_plot_data, layout=layout)

fig.show()


In [134]:
#Berdasarkan analisa saya, semua data bisa dimaklumi outliernya.
# Namun, hal ini harus saya buktikan dengan modified z-score
def get_mad(s):
    median = np.median(s)
    diff = abs(s-median)
    MAD = np.median(diff)
    return MAD
MAD = get_mad(df['product_id'])
median = np.median(df['product_id'])
MAD, median
def get_modified_z_score(x, median, MAD):
    return 0.6745*(x-median)/MAD
df['mod_z_score'] = df['product_id'].apply(lambda x:get_modified_z_score(x, median, MAD))
df.head()

Unnamed: 0,customer_id,product_id,purchase_date,page_views,time_spent,category,price,ratings,mod_z_score
0,1,101,2023-01-01,25,120,Electronics,500,4.5,-0.6745
1,5,101,2023-01-05,22,110,Electronics,500,4.5,-0.6745
2,1,105,2023-01-05,25,120,Electronics,800,4.8,1.124167
3,2,102,2023-01-02,20,90,Clothing,50,3.8,-0.224833
4,3,103,2023-01-03,30,150,Home & Kitchen,200,4.2,0.224833


In [135]:
# Standar umum data disebut outlier jika modified z scorenya lebih dari 3.5
# Terbukti bahwa tidak ada data outlier
df[df.mod_z_score>3.5]

Unnamed: 0,customer_id,product_id,purchase_date,page_views,time_spent,category,price,ratings,mod_z_score


In [136]:
df.drop(columns = ['mod_z_score'], inplace = True)

### Missing Values

In [137]:
df.isnull().sum()

customer_id      0
product_id       0
purchase_date    0
page_views       0
time_spent       0
category         0
price            0
ratings          0
dtype: int64

# Exploratory Data Analysis (EDA)

For getting insight from data.

In [138]:
# product apa yang memiliki rating tertinggi? 105 dengan rating 4.8 where 105 is in electronic category which the price is 800. Ini berarti pelanggannya adalah kelas atas
# 

In [139]:
import plotly.express as px

# Assuming 'product_id' is the column containing product IDs and 'ratings' is the column containing ratings data in your DataFrame 'df'

# Group by product ID and get the mean rating for each product
df_product_per_rating = df.groupby('product_id')['ratings'].mean().reset_index()

# Sort the DataFrame by ratings
df_product_per_rating = df_product_per_rating.sort_values(by='ratings', ascending=False)

# Plot the data using Plotly Express
fig = px.bar(df_product_per_rating, y='ratings', x='product_id', title='Number of Products per Rating')

# Update axis labels
fig.update_xaxes(title='Product ID')
fig.update_yaxes(title='Ratings')

# Show the plot
fig.show()


In [140]:
import plotly.express as px

# Assuming 'product_id' is the column containing product IDs and 'ratings' is the column containing ratings data in your DataFrame 'df'

# Group by product ID and get the mean rating for each product
df_product_per_rating = df.groupby('product_id')['price'].mean().reset_index()

# Sort the DataFrame by ratings
df_product_per_rating = df_product_per_rating.sort_values(by='price', ascending=False)

# Plot the data using Plotly Express
fig = px.bar(df_product_per_rating, y='price', x='product_id', title='Number of Products per Rating')

# Update axis labels
fig.update_xaxes(title='Product ID')
fig.update_yaxes(title='price')

# Show the plot
fig.show()


In [141]:
# dimana terbukti produk apa yang paling lama dipertimbangkan? prod 103 paling tinggi dipertimbangkan dimana terlihat ratingnya tinggi dengan harga yang tidak terllau mahal

In [142]:
import plotly.express as px

# Assuming 'product_id' is the column containing product IDs and 'ratings' is the column containing ratings data in your DataFrame 'df'

# Group by product ID and get the mean rating for each product
df_product_per_rating = df.groupby('product_id')['time_spent'].mean().reset_index()

# Sort the DataFrame by ratings
df_product_per_rating = df_product_per_rating.sort_values(by='time_spent', ascending=False)

# Plot the data using Plotly Express
fig = px.bar(df_product_per_rating, y='time_spent', x='product_id', title='Number of Products per Time Spent')

# Update axis labels
fig.update_xaxes(title='Product ID')
fig.update_yaxes(title='Time Spent')

# Show the plot
fig.show()


In [143]:
# tgl berapa yang paling banyak dilihat? semakin tinggi tanggalnya semakin lama dilihat sehingga ada potensi untuk meningkatkan produk dengan harga dan kualitas tinggi untuk kelas atas.


In [144]:
import plotly.express as px
df_period = df.groupby('purchase_date')['page_views'].sum().reset_index()


# Assuming df_period is your DataFrame
# Extract only the date part
df_period['purchase_date'] = df_period['purchase_date'].dt.date.astype(str)

# Plot the data using Plotly Express
fig = px.line(df_period, x='purchase_date', y='page_views')

# Update the layout
fig.update_layout(
    title='Page Views Trends',
    xaxis_title='Date',
    yaxis_title='Sum of page'
)

# Set the x-axis type to 'category' to ensure correct ordering
fig.update_xaxes(type='category')
fig.update_yaxes(type='category')


# Show the plot
fig.show()


# Modelling Recommendation Menggunakan Surprise

In [196]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df[['customer_id','product_id','ratings']], reader)

In [219]:
import datetime

classes = (SVD, NMF, KNNBasic, BaselineOnly, NormalPredictor)

results = []

for model in classes:


    start = datetime.datetime.now()
    out = cross_validate(model(), data, ['rmse', 'mae'])
    mean_rmse = '%.3f' % np.mean(out['test_rmse'])
    mean_mae = '%.3f' % np.mean(out['test_mae'])
    results.append({'Model': model.__name__, 'RMSE': mean_rmse, 'MAE': mean_mae})


print('All models have run. Call the CVResults dataframe for results.')

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
All models have run. Call the CVResults dataframe for results.


In [220]:
results

[{'Model': 'SVD', 'RMSE': '0.306', 'MAE': '0.301'},
 {'Model': 'NMF', 'RMSE': '0.509', 'MAE': '0.485'},
 {'Model': 'KNNBasic', 'RMSE': '0.386', 'MAE': '0.378'},
 {'Model': 'BaselineOnly', 'RMSE': '0.382', 'MAE': '0.379'},
 {'Model': 'NormalPredictor', 'RMSE': '0.388', 'MAE': '0.360'}]

In [222]:
trainset, testset = train_test_split(data, test_size=0.25,random_state=0)

In [223]:
algo = SVD()

algo.fit(trainset)
predictions = algo.test(testset)

In [226]:
accuracy.rmse(predictions)

RMSE: 0.0967


0.09668919357933879

# Hyperparameter Tuning

In [227]:
param_grid = {'n_epochs': [5, 10, 20], 'lr_all': [0.002, 0.005],
              'reg_all': [0.02, 0.4, 0.6]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

In [228]:
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.4229493793569321
{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4}


In [229]:
# nilai lebih baik menggunakan parameter default SVD

# Save Model

In [230]:
from joblib import dump, load

# Simpan model
dump(algo, 'svd.joblib')

['svd.joblib']