<a href="https://colab.research.google.com/github/Yassaadi/e_commerce/blob/main/Assaadi_Yassine_3_notebook_simulation_112022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#GOOGLE COLAB
from google.colab import drive
drive.mount('/content/drive')
from IPython.display import Image

Mounted at /content/drive


In [None]:
#BASIC LIBs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm

#STAT TOOLS
import math
from math import sqrt
from scipy import stats #BoxCox
import random
from numpy.lib.function_base import percentile
from pandas.core.algorithms import quantile

#GRAPHS
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

#TRANSFORMATION
from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, PolynomialFeatures
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import OrdinalEncoder

from sklearn.decomposition import PCA

#Classifiers
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.neighbors import NearestNeighbors

#METRICS
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import silhouette_score, silhouette_samples, adjusted_rand_score,accuracy_score

#DATA SPLITING 
from sklearn.model_selection import cross_val_score, KFold, train_test_split

#IMPUTERS
from sklearn.impute import SimpleImputer, KNNImputer # IterativeImputer

#GRID SEARCH
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

#PIPELINES
from sklearn.pipeline import Pipeline, make_pipeline 

# Display settings: passing None removes pandas' limits, so every row and
# column is shown and cell contents are never truncated.
pd.set_option('display.max_rows', None) 
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None) 


# Prepared data loading

In [None]:
Path = '/content/drive/MyDrive/Colab Notebooks/P5-Ecom /P5-Maintenance_Frequency.csv'

# Read the prepared table and discard the column named 'index'.
df = pd.read_csv(Path, encoding="utf-8").drop(columns='index')
# Parse purchase timestamps so date arithmetic is possible further down.
df = df.assign(order_purchase_timestamp=pd.to_datetime(df["order_purchase_timestamp"]))
# Keep only rows without any missing value.
df = df.dropna()

# Period definition

In [None]:
# Reference instant for all period indices: the earliest purchase in the data.
period_reference = df["order_purchase_timestamp"].min() # Maybe adjust to periods of year

# A calendar month has no fixed duration, and recent pandas versions refuse to
# convert the ambiguous numpy 'M' unit into a Timedelta. Use an explicit mean
# month length instead (365.2425 / 12 days, the value numpy uses for 'M').
_AVG_MONTH = pd.Timedelta(days=365.2425 / 12)
_ONE_DAY = pd.Timedelta(days=1)

# 1-based month / day index of each order, counted from period_reference.
_elapsed = df["order_purchase_timestamp"] - period_reference
df["Month"] = (_elapsed / _AVG_MONTH).astype(int) + 1
df["Day"] = (_elapsed / _ONE_DAY).astype(int) + 1

Data_recentDate = df["order_purchase_timestamp"].max()
# NOTE: despite its name, Data_LatestDate holds the EARLIEST purchase date
# (the minimum). The name is kept because other cells may reference it.
Data_LatestDate = period_reference

# Total span of the data, in (mean) months and in days, both rounded up by +1.
Data_PeriodM = int((Data_recentDate - Data_LatestDate) / _AVG_MONTH) + 1
Data_PeriodD = int((Data_recentDate - Data_LatestDate) / _ONE_DAY) + 1

print(f" Data_recentDate: {Data_recentDate} \n Data_LatestDate: {Data_LatestDate} \n Data_Period: {Data_PeriodD}")

 Data_recentDate: 2018-08-29 15:00:37 
 Data_LatestDate: 2016-10-03 09:44:50 
 Data_Period: 696


# Feature transformation functions

## 1.1. Period data generator 

In [None]:
def df_periodicity(df, p, periodicity): 
  """Build one row of customer features for the p-th observation period.

  Rows of ``df`` whose ``Day`` value lies in the half-open window
  ``(periodicity*(p-1), periodicity*p]`` are selected and aggregated per
  (customer_unique_id, longitude, latitude) triple.

  Parameters
  ----------
  df : DataFrame holding the order lines, including a ``Day`` column.
  p : 1-based index of the period to extract.
  periodicity : length of one period, expressed in the unit of ``Day``.

  Returns
  -------
  DataFrame with the grouping keys, the aggregated features, the
  freight/price ratio (``freight_per``) and ``recency`` (whole days between
  each customer's last order and the last order seen in the window).
  """
  window_start = periodicity * (p - 1)
  window_end = periodicity * p
  in_window = (df["Day"] > window_start) & (df["Day"] <= window_end)
  period_rows = df[in_window]

  # output column -> (source column, aggregation)
  feature_spec = {
      "frequency": ("order_id", "nunique"),
      "monetary": ("payment_value", "sum"),
      "delivery_delay": ("delivery_delay", "mean"),
      "score": ("review_score", "mean"),
      "n_payments": ("payment_installments", "max"),
      "n_items": ("order_item_id", "count"),
      "n_sellers": ("seller_id", "count"),
      "price": ("price", "sum"),
      "freight": ("freight_value", "sum"),
      "n_categories": ("product_category_name_english", "nunique"),
      "recent_order": ("order_purchase_timestamp", "max"),
  }
  customer_keys = ["customer_unique_id", "longitude", "latitude"]
  dfX = period_rows.groupby(customer_keys, as_index=False).agg(**feature_spec)

  # Share of freight cost relative to the price of the goods.
  dfX["freight_per"] = dfX["freight"] / dfX["price"]

  # Recency is measured against the most recent order inside this window.
  window_last_order = dfX["recent_order"].max()
  dfX["recency"] = (window_last_order - dfX["recent_order"]).dt.days

  return dfX

## 2. Data transformation

### Making the data strictly positive

In [None]:
def positive_df(df_XP, num_variables ):
  """Shift the given columns so that they hold only strictly positive values.

  Columns with a single distinct value are left untouched. The DataFrame is
  modified in place and the same object is returned.

  Parameters
  ----------
  df_XP : DataFrame to modify.
  num_variables : iterable of column names to process.
  """
  for column in num_variables:
    if len(df_XP[column].unique()) <= 1:
      # Constant column: nothing to shift.
      continue

    lowest = df_XP[column].min()
    highest = df_XP[column].max()

    if lowest <= 0.0 and highest > 0.0:
      # Mixed signs: move the minimum onto the smallest positive value present.
      smallest_positive = df_XP.loc[df_XP[column] > 0, column].min()
      df_XP[column] = df_XP[column] - lowest + smallest_positive
    elif highest <= 0.0:
      # No positive value: move the minimum onto the magnitude of the
      # negative value closest to zero.
      closest_negative = df_XP.loc[df_XP[column] < 0, column].max()
      df_XP[column] = df_XP[column] - lowest - closest_negative

  return df_XP   

### Logarithm/ BoxCox

In [None]:
# Box-Cox lambdas fitted so far, appended in call order (one per transformed column).
LMBDAS = []
def BoxCox_df(df_XB, BoxCox_list):  
  """Apply a Box-Cox transform to each listed column that is not constant.

  The lambda is estimated by ``scipy.stats.boxcox`` for every column and
  recorded in the module-level ``LMBDAS`` list. The DataFrame is modified in
  place and the same object is returned.

  Parameters
  ----------
  df_XB : DataFrame to modify.
  BoxCox_list : iterable of column names to transform.
  """
  for name in BoxCox_list:
    values = df_XB[name]
    if len(values.unique()) <= 1:
      # Constant columns are skipped.
      continue
    transformed, fitted_lambda = stats.boxcox(values)
    df_XB[name] = transformed
    LMBDAS.append(fitted_lambda)
  return df_XB

### Normalization

In [None]:
def Normalization_df(df_XN, num_variables):
    """Standardize the given columns with a StandardScaler.

    A DataFrame with at most one row is returned unchanged. Otherwise the
    listed columns are overwritten with their scaled values; the DataFrame is
    modified in place and the same object is returned.

    Parameters
    ----------
    df_XN : DataFrame to modify.
    num_variables : column names to scale.
    """
    scaler = ColumnTransformer([("Scal", StandardScaler(), num_variables)])
    if df_XN.shape[0] <= 1:
        return df_XN

    scaled = pd.DataFrame(
        scaler.fit_transform(df_XN),
        index=df_XN.index,
        columns=num_variables,
    )
    # Copy the scaled values back column by column, aligned on the index.
    for column in num_variables:
        df_XN[column] = scaled[column]

    return df_XN

In [None]:
# Smoke test: build the customer features for period 1 with a 1-day periodicity.
dfX = df_periodicity(df, 1, 1)
dfX.shape

(9, 16)

In [None]:
# Feature columns kept for the model: recency, frequency and monetary value.
RFM_variables = ["recency", "frequency", "monetary"]

# Optional additional feature columns; none are used here.
Extra_variables =[]   

In [None]:
# Restrict the sample to the RFM columns and display it.
dfX = dfX.loc[:, RFM_variables]
dfX

Unnamed: 0,recency,frequency,monetary
0,0,1,92.27
1,0,1,44.23
2,0,1,39.09
3,0,1,45.46
4,0,1,53.73
5,0,1,154.57
6,0,1,40.95
7,0,1,133.46
8,0,1,35.61


In [None]:
# Columns that must not go through the numeric transformations.
non_num = ['customer_unique_id', 'recent_order'] 
# Every remaining column of dfX is treated as numeric.
is_numeric = ~dfX.columns.isin(non_num)
num_variables = dfX.columns[is_numeric]

In [None]:
  # Run the whole transformation chain on the sample. Each helper modifies the
  # DataFrame it receives and returns that same object, so df_XP, df_XB and
  # df_XN all refer to dfX; displaying df_XB therefore shows the final,
  # normalized values.
  df_XP = positive_df(dfX, num_variables) 
  df_XB = BoxCox_df(df_XP, num_variables)
  df_XN = Normalization_df(df_XB, num_variables)
  df_XB

Unnamed: 0,recency,frequency,monetary
0,0.0,0.0,1.031435
1,0.0,0.0,-0.512699
2,0.0,0.0,-0.947656
3,0.0,0.0,-0.425153
4,0.0,0.0,0.046389
5,0.0,0.0,1.506656
6,0.0,0.0,-0.775822
7,0.0,0.0,1.401777
8,0.0,0.0,-1.324927


## Clustering

In [None]:
# Feature columns fed to the clustering model (same values as defined earlier).
RFM_variables = ["recency", "frequency", "monetary"]

# Optional additional feature columns; none are used here.
Extra_variables =[]        

In [None]:
# Model matrix: the selected features, with any categorical column one-hot encoded.
model_columns = RFM_variables + Extra_variables
X = pd.get_dummies(df_XN[model_columns])

# Project the features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

print(f'X shape: {X.shape}')
print(f'X_pca shape: {X_pca.shape}')
X.head(2)

X shape: (9, 3)
X_pca shape: (9, 2)


Unnamed: 0,recency,frequency,monetary
0,0.0,0.0,1.031435
1,0.0,0.0,-0.512699


Accuracy between two clusterings whose label names may be permuted

In [None]:
def accuracy_label(label_a, label_b):
  """Return the accuracy between two clusterings, ignoring how clusters are named.

  Cluster ids produced by two different models are arbitrary, so the labels
  of ``label_b`` are first matched one-to-one with the labels of ``label_a``
  in the way that maximises the number of agreeing samples (optimal
  assignment on the contingency table). The returned value is the fraction
  of samples that agree under that best matching.

  Parameters
  ----------
  label_a : sequence or array of reference cluster labels.
  label_b : sequence or array of cluster labels to compare, same length.

  Returns
  -------
  float in [0, 1].

  Raises
  ------
  ValueError
      If the inputs are empty or have different lengths.
  """
  # Local import: only scipy.stats is imported at file level.
  from scipy.optimize import linear_sum_assignment

  # Work on arrays so that plain Python lists behave like numpy label vectors.
  a = np.asarray(label_a).ravel()
  b = np.asarray(label_b).ravel()
  if a.shape[0] != b.shape[0]:
    raise ValueError(
        f"label_a and label_b must have the same length, got {a.shape[0]} and {b.shape[0]}")
  if a.shape[0] == 0:
    raise ValueError("label_a and label_b must not be empty")

  # contingency[i, j] = number of samples with the i-th label of a and the
  # j-th label of b.
  classes_a, idx_a = np.unique(a, return_inverse=True)
  classes_b, idx_b = np.unique(b, return_inverse=True)
  contingency = np.zeros((classes_a.size, classes_b.size), dtype=np.int64)
  np.add.at(contingency, (idx_a, idx_b), 1)

  # linear_sum_assignment minimises the total cost, so negate the counts to
  # maximise agreement. Rectangular tables are supported: labels left
  # unmatched simply contribute no agreeing sample.
  rows, cols = linear_sum_assignment(-contingency)
  return float(contingency[rows, cols].sum() / a.shape[0])

In [None]:
# Sanity check: b is a with the names of clusters 1 and 3 exchanged, so an
# accuracy that ignores label names is expected to be 1.0.
a=[1,2,3,4,4,3]
b=[3,2,1,4,4,1]
accuracy_label(a, b)

[2, 3, 4]
[1 2 3 4 4 3]
[3, 4]
[1 2 3 4 4 3]
[3]
[1 2 3 4 4 3]
[1 2 3 4 4 3]


1.0

In [None]:
frequency=[90, 120, 150, 180] # Maintenance frequencies to simulate, in days (3 to 6 months)
periodicity = [30]            # Length of one observation period, in days

# Results, one entry per (maintenance frequency, periodicity) pair in loop
# order. Each entry is a tuple (list of per-period values, fm), except P0,
# which stores the list of periods at which the reference model was replaced.
SHT1=[]
ARSP=[]
ARS1=[]
LABELS=[]
ACCURACY=[]
P0=[]

# NOTE: KMeans is created without random_state, so results vary between runs.
for fm in frequency:
  for F in periodicity:    
    ACC = []
    SHT_1 = []
    ARS_P = [] 
    ARS_1 = []
    LABELS_P = []
    p0=[]
    #************************First period model**************************
    #************************Features engineering**************************
    dfX = df_periodicity(df, 1, F)
    dfX = dfX.dropna(axis=0)
    #************************Features transformation**************************
    df_XP= positive_df(dfX, num_variables) 
    df_XB = BoxCox_df(df_XP, num_variables)
    df_XN = Normalization_df(df_XB, num_variables)
      
    #************************Creation of model features dataframe**************************
    X = pd.get_dummies(df_XN.loc[:, RFM_variables + Extra_variables])
    
    #************************Model**************************
    P0_clusters = 4
    p0.append(1)  # the reference model is initially the period-1 model
    Pf=0          # number of maintenances performed so far
    kmeans_1 = KMeans(n_clusters = P0_clusters)
    labels = kmeans_1.fit_predict(X) 
    
    # Until the first maintenance, the reference model IS the period-1 model.
    # Fitting a second, independently initialised KMeans on the same data
    # would give a reference that differs from kmeans_1 by chance only.
    kmeans_P0 = kmeans_1
    acc=1
    sht = silhouette_score(X, labels)
    ars_P0 = 1
    ars_1 = 1
    print(f'\n------------------Periodicity = {F}days, Maintenance frequency = {int(fm/30)}months----------------------')
    print(f'\nPriodicity = {F} days, period n°1')
    # The value shown for period 1 is the silhouette score, so label it as such.
    print(f'Data shape : {X.shape}, Silhouette : {round(sht,2)}')
    LABELS_P.append(labels)
    ACC.append(acc)
    SHT_1.append(sht)
    ARS_P.append(ars_P0)
    ARS_1.append(ars_1)

    # Periods 2..n_periods. Period 1 was handled above; iterating from 1 would
    # evaluate period 1's window a second time and never reach the last period.
    n_periods = math.ceil(Data_PeriodD/F)
    for p in range(2, n_periods + 1):
      #************************Features engineering**************************
      dfX = df_periodicity(df, p, F)    
      #************************Features transformation**************************
      df_XP= positive_df(dfX, num_variables) 
      df_XB = BoxCox_df(df_XP, num_variables)
      df_XN = Normalization_df(df_XB, num_variables)    
      #************************Creation of model features dataframe**************************
      X = pd.get_dummies(df_XN.loc[:, RFM_variables + Extra_variables])
    
      #************************Model prediction**************************     
      P_clusters = 4
      if X.shape[0]>P_clusters :
        kmeans_P = KMeans(n_clusters = P_clusters)
        #************************Predictions**************************
        # (p-1)*F days have elapsed when period p starts; each time a further
        # multiple of fm is crossed, the reference model is replaced by the
        # model of the current period (fitted just below).
        if int(((p-1)*F)/fm)>Pf:
          Pf=Pf+1  
          p0.append(p)  
          kmeans_P0=kmeans_P 

        labels_P = kmeans_P.fit_predict(X) #Clustering by the model fitted on the CURRENT period
        labels_P0 = kmeans_P0.predict(X) #Clustering by the model of the LAST MAINTENANCE period
        labels_1 = kmeans_1.predict(X) #Clustering by the FIRST period model
        
        #************************Metrics**************************
        acc = accuracy_label(labels_P0,labels_P)
        sht = silhouette_score(X, labels_P0)
        ars_P0 = adjusted_rand_score(labels_P0, labels_P)
        ars_1 = adjusted_rand_score(labels_1, labels_P)

        print(f'Periodicity = {F} days, period n°{p}')
        print(f'Data shape : {X.shape}, Accuracy  : {round(acc,2)}, ARS(KMeans_{p0[-1]}, KMeans_{p})={round(ars_P0,2)}, ARS(KMeans_{1}, KMeans_{p})={round(ars_1,2)}')
        ACC.append(acc)
        SHT_1.append(sht)
        ARS_1.append(ars_1)
        ARS_P.append(ars_P0)
        LABELS_P.append(labels_P)
      else:
        print(f'Frequence = {F} days, period n°{p}')
        print(f'Data shape : {X.shape}')
        #Too few rows to cluster: repeat the previous metrics for this period
        ACC.append(acc)
        SHT_1.append(sht) 
        ARS_P.append(ars_P0)
        ARS_1.append(ars_1)
        LABELS_P.append([])
      
  #************************************************************************  
    ACCURACY.append((ACC,fm))
    SHT1.append((SHT_1, fm))
    ARSP.append((ARS_P, fm))
    ARS1.append((ARS_1, fm)) 
    LABELS.append((LABELS_P,fm))
    P0.append(p0)


------------------Periodicity = 30days, Maintenance frequency = 3months----------------------

Priodicity = 30 days, period n°1
Data shape : (263, 3), Accuracy : 0.39
Periodicity = 30 days, period n°2
Data shape : (263, 3), Accuracy  : 0.98, ARS(KMeans_2, KMeans_2)=0.96, ARS(KMeans_1, KMeans_2)=1.0
Frequence = 30 days, period n°3
Data shape : (0, 3)
Frequence = 30 days, period n°4
Data shape : (1, 3)
Periodicity = 30 days, period n°5
Data shape : (648, 3), Accuracy  : 1.0, ARS(KMeans_6, KMeans_5)=1.0, ARS(KMeans_1, KMeans_5)=0.33
Periodicity = 30 days, period n°6
Data shape : (1726, 3), Accuracy  : 0.88, ARS(KMeans_6, KMeans_6)=0.7, ARS(KMeans_1, KMeans_6)=0.57
Periodicity = 30 days, period n°7
Data shape : (2336, 3), Accuracy  : 1.0, ARS(KMeans_8, KMeans_7)=1.0, ARS(KMeans_1, KMeans_7)=0.77
Periodicity = 30 days, period n°8
Data shape : (2193, 3), Accuracy  : 0.21, ARS(KMeans_8, KMeans_8)=0.53, ARS(KMeans_1, KMeans_8)=0.41
Periodicity = 30 days, period n°9
Data shape : (3261, 3), Acc

Periods

In [None]:
# Names of the curves to draw; they pair up, in order, with the series in `y`.
graph = ['ARI-1', 'ARI-P0']#, 'Accuracy', 'Silhouette']


def _random_rgb():
  """Return a random CSS 'rgb(r, g, b)' colour string."""
  return f'rgb({random.randint(0,255)}, {random.randint(0,255)}, {random.randint(0,255)})'


for fm in range(len(frequency)):
  for f in range(len(periodicity)):
    # Position of this (frequency, periodicity) pair in the result lists.
    t=f+fm*len(periodicity)
    p = np.arange(1, math.ceil(Data_PeriodD/periodicity[f])+1)
    x = p
    # Every series must come from run t (the silhouette series previously
    # always read run 1, whatever the run being plotted).
    y = [ARS1[t][0], ARSP[t][0], ACCURACY[t][0], SHT1[t][0]]

    fig = go.Figure()
    for curve_name, curve_values in zip(graph, y):
      fig.add_trace(go.Scatter(x=x, y=curve_values,
                        mode='lines+markers',
                        name=curve_name
                        ))
    #Horizontal threshold lines
    for level, line_color in ((0.8, "green"), (0.7, 'orange'), (0.5, 'red')):
      fig.add_hline(y=level, line_dash="dot", line_color=line_color,
                annotation_text='limit',
                annotation_font_size=10)

    #Vertical rectangles: one per interval between two consecutive model replacements
    for i in range(0,len(P0[t])-1):
      fig.add_vrect(x0=P0[t][i], x1=P0[t][i+1],
                annotation_text="decline", annotation_position="top left",
                fillcolor=_random_rgb(), opacity=0.25, line_width=0)

    #Last interval: from the latest model replacement to the final period
    fig.add_vrect(x0=P0[t][-1], x1=p[-1],
              annotation_text="decline", annotation_position="top left",
              fillcolor=_random_rgb(), opacity=0.25, line_width=0)

    annotations = []
    # Graph title
    annotations.append(dict(xref='paper', yref='paper', y=1.1, x=0.5,
                                  xanchor='center', yanchor='top',
                                  text=f'Accuracy and ARI scores VS period, Period={periodicity[f]}days, Frequency={int(frequency[fm]/30)} months',                                
                                  font=dict(family='Arial', size=20, color='rgb(37,37,37)'),
                                  showarrow=False))

    fig.update_layout(annotations=annotations)

    fig.show()