In [1]:
# Importing necessary libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.cluster.hierarchy as sch
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler

from functions import optimize_kmeans, transform_columns, plot_correlation_circle, optimize_dbscan, optimize_agglomerative_clustering, split_dataframe, split_dataframe_by_time

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Apply Seaborn's default theme to every Matplotlib figure
sns.set()

# Show up to 50 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 50)

# Data directory. It can be overridden with the P4_DATA_PATH environment
# variable so the notebook also runs on machines other than the author's;
# without the variable the original local path is used unchanged.
# os.path.join(<dir>, "") guarantees a trailing separator, so plain string
# concatenation (DATA_PATH + "<file>") keeps working with an override that
# omits the final slash.
DATA_PATH = os.path.join(
    os.environ.get(
        "P4_DATA_PATH",
        "/Users/typhaine/Documents/Doc_Gorilla/OpenClassroom--Machine-Learning-Engineer/P4/data/",
    ),
    "",
)
DATA_LOAD_FILE = DATA_PATH + "data_cleaned_tagged.csv.gz"
SAMPLE_SIZE = 10_000
SPLITS = 9
RANDOM_STATE = 42  # seed passed to KMeans in the stability check
COMPONENTS = 3

# Feature columns fed to the clustering models.
VARIABLE_COLUMNS = ["Recency", "Frequency", "PaymentInstallments", "TotalPaymentValue", "MeanProductPrice", "NumberOfProductsInOrder"]

# Load the dataset; it already carries the "K-Mean Label" and
# "AgglomerativeLabel" columns that the stability check compares against.
df_original = pd.read_csv(DATA_LOAD_FILE)

df_original

Unnamed: 0,Recency,Frequency,PaymentInstallments,TotalPaymentValue,MeanProductPrice,NumberOfProductsInOrder,OrderTimestamp,K-Mean Label,AgglomerativeLabel
0,1.310017,-5.199338,0.075345,0.418728,0.478025,-5.199338,2017-05-16 15:05:35,1,6
1,0.048948,-5.199338,1.355761,1.378171,1.431971,-5.199338,2018-01-12 20:48:24,1,1
2,-0.798769,-5.199338,1.189522,0.511236,0.718603,-5.199338,2018-05-19 16:07:45,1,5
3,-0.308427,-5.199338,-5.199338,0.640817,0.883727,-5.199338,2018-03-13 16:06:38,0,4
4,-1.464186,-5.199338,1.355761,1.096560,1.260030,-5.199338,2018-07-29 09:51:30,1,1
...,...,...,...,...,...,...,...,...,...
98644,-0.475214,-5.199338,1.059682,-0.199965,0.005182,-5.199338,2018-04-07 15:48:17,1,1
98645,-0.457033,-5.199338,0.389783,0.253758,0.354078,-5.199338,2018-04-04 08:20:22,1,3
98646,-0.485069,-5.199338,0.871679,-0.812153,-0.798769,-5.199338,2018-04-08 20:11:50,1,6
98647,0.450080,-5.199338,0.075345,2.007797,2.030984,-5.199338,2017-11-03 21:08:33,1,3


In [13]:
# Split df_original on its "OrderTimestamp" column with the project helper.
# The result is iterated with `.items()` by the stability check, i.e. it is a
# mapping of split key -> DataFrame.
# NOTE(review): the config cell defines SPLITS = 9 but 8 is hard-coded here;
# confirm which value is intended.
df_splits = split_dataframe_by_time(
    df=df_original,
    datetime_col="OrderTimestamp",
    n_splits=8,
)

In [14]:
def print_split_stability(splits, estimator, label_col):
    """Re-fit ``estimator`` on every split and print its agreement with stored labels.

    For each DataFrame in ``splits`` the estimator is fitted from scratch on
    the VARIABLE_COLUMNS features, and the Adjusted Rand Index (ARI) between
    the labels already stored in ``label_col`` and the freshly computed labels
    is printed. An ARI of 1.0 means both partitions are identical; values
    near 0 mean chance-level agreement.

    Parameters
    ----------
    splits : mapping of key -> pandas.DataFrame
        Splits to evaluate, iterated in mapping order.
    estimator : scikit-learn clusterer
        Must expose ``n_clusters`` and ``fit_predict``.
    label_col : str
        Column of each split holding the reference labels.
    """
    for split_key, split_df in splits.items():
        # Both estimators raise ValueError when asked for more clusters than
        # there are samples, so report and skip splits that are too small
        # instead of crashing mid-loop.
        if len(split_df) < estimator.n_clusters:
            print(f"{split_key}: skipped ({len(split_df)} rows < {estimator.n_clusters} clusters)")
            continue
        new_labels = estimator.fit_predict(split_df[VARIABLE_COLUMNS].values)
        print(adjusted_rand_score(split_df[label_col].values, new_labels))


# Agglomerative clustering: stored labels vs. a fresh fit per time split.
# NOTE(review): the recorded ARI values for this model are all below 0.1 -
# confirm that linkage="single" and n_clusters=8 match the settings used to
# produce the stored "AgglomerativeLabel" column.
print_split_stability(
    df_splits,
    AgglomerativeClustering(n_clusters=8, linkage="single"),
    "AgglomerativeLabel",
)

# K-Means: same comparison against the stored "K-Mean Label" column. The
# integer random_state makes every re-fit start from the same seed.
print_split_stability(
    df_splits,
    KMeans(n_clusters=3, random_state=RANDOM_STATE, n_init="auto"),
    "K-Mean Label",
)

0.06590213212292442
0.08934791298785469
0.0801144873347538
0.07852579688435954
0.08230026544699962
0.07890688502343272
0.0774809358409169
0.07766953347207407
1.0
0.7075589393045463
1.0
1.0
1.0
1.0
0.739441070465763
1.0
