In [1]:
# Mount Google Drive at /content/drive so the CSV files read later in this
# notebook (under /content/drive/MyDrive) are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from pandas.core.common import random_state

In [3]:
def Data_Standardising(X):
  """Standardise ``X`` with a freshly fitted ``StandardScaler``.

  Parameters
  ----------
  X : array-like or DataFrame
      Feature matrix handed straight to ``StandardScaler.fit_transform``.

  Returns
  -------
  The scaled data exactly as returned by ``fit_transform``.
  """
  # A new scaler is created on every call, so nothing is shared between calls.
  return StandardScaler().fit_transform(X)

In [4]:
def KMeans_Clustering(n_clusters, scores_pca):
  """Fit a k-means model on ``scores_pca`` and return the fitted estimator.

  Parameters
  ----------
  n_clusters : int
      Number of clusters requested from ``KMeans``.
  scores_pca : array-like
      Data to cluster (the callers in this notebook pass PCA scores).

  Returns
  -------
  The object returned by ``KMeans.fit``.
  """
  # k-means++ initialisation with a fixed seed (42) for repeatable runs.
  model = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
  return model.fit(scores_pca)

In [5]:
def PCA_Function(X_scaled, n_components):
  """Fit a PCA on ``X_scaled`` and return its projection onto the components.

  Parameters
  ----------
  X_scaled : array-like
      Data to decompose (the caller in this notebook passes standardised data).
  n_components : int
      Passed to ``PCA`` as the number of components to keep.

  Returns
  -------
  The result of ``pca.transform(X_scaled)`` for the fitted PCA.
  """
  pca = PCA(n_components)
  pca.fit(X_scaled)

  # Project once. The previous version called transform twice and threw the
  # first result away, doing the same work for nothing.
  return pca.transform(X_scaled)

In [6]:
def Cluster_Evolution(frequency, df_kmeans, df, day, month, year,
                      n_components=3, n_clusters=4):
  """Re-cluster the customers known before a cut-off date and score the result.

  The customers in ``df`` whose last purchase is strictly before the cut-off
  are standardised, projected with PCA and clustered with k-means. The new
  labels of the customers with ``frequency >= frequency`` are then compared,
  via the adjusted Rand index, with the ``Cluster`` labels stored in
  ``df_kmeans`` for the rows passing the same frequency and date filters.

  Parameters
  ----------
  frequency : int
      Minimum value of the ``frequency`` column for a customer to be scored.
  df_kmeans : DataFrame
      Reference clustering; must contain ``frequency``,
      ``Date_of_last_purchase`` (datetime) and ``Cluster``.
  df : DataFrame
      Customer data to re-cluster; must contain ``Date_of_last_purchase``
      (datetime), ``Seconds_Since_Last_Order``, ``Sum_of_Prices`` and
      ``frequency``.
  day, month, year : int
      Cut-off date; only purchases strictly before it are kept.
  n_components : int, default 3
      Number of PCA components.
  n_clusters : int, default 4
      Number of k-means clusters.

  Returns
  -------
  float
      The adjusted Rand index, which is also printed.

  Raises
  ------
  ValueError
      If the reference and re-computed label sets differ in length.

  Notes
  -----
  The two label arrays are compared position by position, so this assumes
  ``df_kmeans`` and ``df`` describe the same customers in the same row order
  (TODO confirm against how the two CSV files were produced).
  """
  cutoff = dt.datetime(year, month, day)

  # Reference labels: returning customers of the original clustering whose
  # last purchase falls before the cut-off.
  reference = df_kmeans.loc[
      (df_kmeans["frequency"] >= frequency)
      & (df_kmeans["Date_of_last_purchase"] < cutoff)
  ]
  labels_true = np.array(reference["Cluster"])

  # Use the frame passed in as `df`; the previous version silently read the
  # notebook-level Order_Dataset and ignored this argument.
  selected = df.loc[df["Date_of_last_purchase"] < cutoff].reset_index(drop=True)
  features = selected[["Seconds_Since_Last_Order", "Sum_of_Prices", "frequency"]]

  # Standardise -> PCA -> k-means on the truncated data set.
  scores_pca = PCA_Function(Data_Standardising(features), n_components)
  kmeans_pca = KMeans_Clustering(n_clusters, scores_pca)

  # Name the component columns when the frame is built instead of writing into
  # `columns.values`, which mutates the backing array of an immutable Index.
  component_names = [
      f"Component {position + 1}" for position in range(scores_pca.shape[1])
  ]
  clustered = pd.concat(
      [selected, pd.DataFrame(scores_pca, columns=component_names)], axis=1
  )
  clustered["Cluster"] = kmeans_pca.labels_

  # New labels restricted to the same frequency threshold as the reference.
  labels_pred = np.array(
      clustered.loc[clustered["frequency"] >= frequency, "Cluster"]
  )

  if len(labels_true) != len(labels_pred):
    raise ValueError(
        "Reference and re-clustered label sets differ in length "
        f"({len(labels_true)} vs {len(labels_pred)}); "
        "df_kmeans and df must describe the same customers."
    )

  score = adjusted_rand_score(labels_true, labels_pred)
  print(score)
  # Return the score as well so callers can store it.
  return score

In [7]:
# Load the saved clustering result and the order-level data set from Drive.
df_kmeans, Order_Dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/df_kmeans.csv",
        "/content/drive/MyDrive/Order_Dataset.csv",
    )
)

In [8]:
# Print the column names, non-null counts and dtypes of Order_Dataset; the
# output below shows Date_of_last_purchase loaded as object rather than datetime.
Order_Dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95963 entries, 0 to 95962
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     95963 non-null  int64  
 1   customer_unique_id             95963 non-null  object 
 2   Date_of_last_purchase          95963 non-null  object 
 3   last_date                      95963 non-null  object 
 4   Difference                     95963 non-null  object 
 5   Seconds_Since_Last_Order       95963 non-null  float64
 6   Sum_of_Prices                  95963 non-null  float64
 7   price                          95963 non-null  float64
 8   payment_value                  95963 non-null  float64
 9   payment_sequential             95963 non-null  float64
 10  payment_type                   95963 non-null  object 
 11  payment_installments           95963 non-null  float64
 12  review_score                   95963 non-null 

In [9]:
# A CSV round trip stores datetimes as text, so the purchase dates come back as
# object dtype and have to be parsed again before any date comparison.
purchase_dates = pd.to_datetime(Order_Dataset['Date_of_last_purchase'])
Order_Dataset['Date_of_last_purchase'] = purchase_dates

# Copy the parsed dates onto df_kmeans too (pandas aligns them on the row index).
df_kmeans['Date_of_last_purchase'] = purchase_dates

In [10]:
# Settings for this run: customers with frequency >= 1, last purchase before 12/8/2018.
frequency, year, month, day = 1, 2018, 8, 12

Adjusted_Rand_Score = Cluster_Evolution(
    frequency=frequency,
    df_kmeans=df_kmeans,
    df=Order_Dataset,
    day=day,
    month=month,
    year=year,
)

0.7281672851343137


Cut-off 12/8/2018: adjusted Rand index = 0.73, 91946 customers

Cut-off 13/8/2018: adjusted Rand index = 0.91, 92143 customers

Last customer purchase = 3/9/2018

Therefore a new clustering run is required roughly every 3 weeks

In [11]:
# Settings for this run: customers with frequency >= 2, last purchase before 14/2/2018.
frequency, year, month, day = 2, 2018, 2, 14

Adjusted_Rand_Score = Cluster_Evolution(
    frequency=frequency,
    df_kmeans=df_kmeans,
    df=Order_Dataset,
    day=day,
    month=month,
    year=year,
)

0.7777915898422755


For returning customers (frequency >= 2), the adjusted Rand index drops below 0.8 for cut-off dates between 7/7/2017 - 18/7/2017 and between 3/1/2018 - 14/2/2018; therefore retraining is not needed.