
![alt text](https://i.imgur.com/HRhd2Y0.png)

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from functions import find_csv_files, load_csvs_to_dict, sort_and_classify_column, transform_to_days

pd.set_option('display.max_columns', 50)

# NOTE(review): absolute, machine-specific path; consider making it configurable
# (e.g. a relative pathlib.Path) so the notebook runs on other machines.
DATA_PATH = "/workspaces/OpenClassroom--Machine-Learning-Engineer/P4/data/"

csv_files = find_csv_files(DATA_PATH)

# `dfs` is indexed below by dataset name (e.g. "olist_orders_dataset").
dfs = load_csvs_to_dict(csv_files)

### RFM Method ###
# The helpers come from the local `functions` module, which is not visible here.
# Judging by the displayed result, `sort_and_classify_column` presumably adds a
# "<column_name>_class" column (Low / Medium / High) -- TODO confirm in functions.py.

# Recency DF
recency_df = sort_and_classify_column(
    dfs["olist_orders_dataset"],
    column_name="order_purchase_timestamp",
    datetime=True,
)
recency_df["Recency"] = transform_to_days(recency_df["order_purchase_timestamp"])
# Rebind instead of `inplace=True` so that re-running the cell stays predictable.
recency_df = recency_df.drop(
    columns=[
        "order_approved_at",
        "order_id",
        "order_delivered_carrier_date",
        "order_purchase_timestamp",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ]
)

# Frequency DF: number of (non-null) order ids per unique customer.
# The column name "Frequecy" is misspelled, but it is kept as-is because later
# cells select the column by exactly this name.
frequency_df = (
    dfs["olist_orders_dataset"]
    .merge(dfs["olist_customers_dataset"], on="customer_id")
    .groupby(by="customer_unique_id")
    .count()
    .reset_index()
    .loc[:, ["customer_unique_id", "order_id"]]
    .rename(columns={"order_id": "Frequecy"})
)
frequency_df = sort_and_classify_column(frequency_df, column_name="Frequecy")

# Monetary DF
# `numeric_only=True` makes explicit what the original call relied on implicitly:
# only numeric payment columns are summed per order. Without it, older pandas emits
# a FutureWarning and pandas >= 2.0 attempts to sum the non-numeric columns as well.
per_order_payment = (
    dfs["olist_order_payments_dataset"]
    .groupby(by="order_id")
    .sum(numeric_only=True)
    .reset_index()
)
monetary_df = dfs["olist_orders_dataset"].merge(per_order_payment, on="order_id")
monetary_df = sort_and_classify_column(monetary_df, column_name="payment_value")
monetary_df = monetary_df.drop(
    columns=[
        "order_approved_at",
        "order_status",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ]
)

# RFM DF: start from the customers table and left-join the three feature frames so
# that every customer row is kept even when a feature is missing.
df = (
    dfs["olist_customers_dataset"]
    .merge(recency_df, how="left", on="customer_id")
    .merge(frequency_df, how="left", on="customer_unique_id")
    .merge(monetary_df, how="left", on="customer_id")
)
df

  per_order_payment = dfs["olist_order_payments_dataset"].groupby(by = "order_id").sum().reset_index()


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_status,order_purchase_timestamp_class,Recency,Frequecy,Frequecy_class,order_id,order_purchase_timestamp,payment_sequential,payment_installments,payment_value,payment_value_class
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,delivered,Low,2348,1,Low,00e7ee1b050b8499577073aeb2a297a1,2017-05-16 15:05:35,1.0,2.0,146.87,Low
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,delivered,Medium,2107,1,Low,29150127e6685892b6eab3eec79f59c7,2018-01-12 20:48:24,1.0,8.0,335.48,Low
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,delivered,High,1980,1,Low,b2059ed67ce144a36e2aa97d2c9e9ad2,2018-05-19 16:07:45,1.0,7.0,157.73,Low
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,delivered,High,2047,1,Low,951670f92359f4fe4a63112aa7306eba,2018-03-13 16:06:38,1.0,1.0,173.30,Low
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,delivered,High,1909,1,Low,6b7d50bd145f6fc7f33cebabd7e49d0f,2018-07-29 09:51:30,1.0,8.0,252.25,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99436,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,3937,sao paulo,SP,delivered,High,2022,1,Low,6760e20addcf0121e9d58f2f1ff14298,2018-04-07 15:48:17,1.0,6.0,88.78,Low
99437,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,6764,taboao da serra,SP,delivered,High,2025,1,Low,9ec0c8947d973db4f4e8dcf1fbfa8f1b,2018-04-04 08:20:22,1.0,3.0,129.06,Low
99438,5e28dfe12db7fb50a4b2f691faecea5e,e9f50caf99f032f0bf3c55141f019d99,60115,fortaleza,CE,delivered,High,2021,1,Low,fed4434add09a6f332ea398efd656a5c,2018-04-08 20:11:50,1.0,5.0,56.04,Low
99439,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120,canoas,RS,delivered,Medium,2177,1,Low,e31ec91cea1ecf97797787471f98a8c2,2017-11-03 21:08:33,1.0,2.0,711.07,Low


In [3]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

def optimize_kmeans(df, k_range=(2, 11), batch_size=100, silhouette_sample_size=None):
    """
    Fit MiniBatchKMeans for several cluster counts and report quality metrics.

    Parameters:
    - df: DataFrame (or array-like) of numeric features to cluster
    - k_range: tuple (start, stop) of cluster counts to try; as with ``range``,
      ``start`` is included and ``stop`` is excluded, so the default tries k = 2..10
    - batch_size: int, mini-batch size forwarded to MiniBatchKMeans
    - silhouette_sample_size: int or None, number of rows sampled when computing
      the silhouette score. None (default) uses every row, which requires pairwise
      distances and becomes very slow on large inputs.

    Returns:
    - dict with the keys 'k_values', 'inertias', 'silhouette_scores',
      'davies_bouldin_scores' (parallel lists, one entry per k) and 'optimal_k'
      (the k with the highest silhouette score)

    Raises:
    - ValueError: if k_range describes an empty range
    """
    k_values = list(range(k_range[0], k_range[1]))
    if not k_values:
        raise ValueError(
            f"k_range={k_range!r} contains no values; expected (start, stop) with start < stop"
        )

    # One entry per candidate k, kept in the same order as k_values
    inertias = []
    silhouette_scores = []
    davies_bouldin_scores = []

    for k in k_values:
        # Fixed random_state keeps the clustering reproducible between runs
        kmeans = MiniBatchKMeans(n_clusters=k, batch_size=batch_size, random_state=42).fit(df)
        labels = kmeans.labels_

        inertias.append(kmeans.inertia_)
        silhouette_scores.append(
            silhouette_score(
                df,
                labels,
                sample_size=silhouette_sample_size,
                random_state=42,  # only used when a sample size is given
            )
        )
        davies_bouldin_scores.append(davies_bouldin_score(df, labels))

    # Lower inertia and Davies-Bouldin score is better. Higher silhouette score is better.
    # Inertia generally keeps shrinking as k grows, so taking its minimum would in
    # practice always select the largest k tried. The silhouette score has a
    # meaningful maximum, so it is used to pick the optimum instead.
    optimal_k = k_values[int(np.argmax(silhouette_scores))]

    metrics = {
        'k_values': k_values,
        'inertias': inertias,
        'silhouette_scores': silhouette_scores,
        'davies_bouldin_scores': davies_bouldin_scores,
        'optimal_k': optimal_k
    }

    return metrics

In [5]:
# Keep only the three RFM feature columns and discard every row with a missing value.
tmp_df = df.loc[:, ["Recency", "Frequecy", "payment_value"]].dropna()

#optimize_kmeans(tmp_df)

In [4]:
# For every order, count its non-null product_id entries in the order-items table
# and expose that count as "number_of_products".
df_items_per_order = (
    dfs["olist_order_items_dataset"][["order_id", "product_id"]]
    .groupby("order_id")
    .count()
    .rename(columns={"product_id": "number_of_products"})
    .reset_index()
)

In [5]:
# Mean price and freight value of each product across all of its order-item rows.
df_mean_product_price = (
    dfs["olist_order_items_dataset"][["product_id", "price", "freight_value"]]
    .groupby("product_id")
    .mean()
    .reset_index()
)

# Weight and dimensions of each product, taken from the products table.
df_product_measurements = dfs["olist_products_dataset"][
    [
        "product_id",
        "product_weight_g",
        "product_length_cm",
        "product_height_cm",
        "product_width_cm",
    ]
]

# Per-product averages enriched with the measurements (products without
# measurements are kept, with missing values).
df_product_price_measures = pd.merge(
    df_mean_product_price,
    df_product_measurements,
    how="left",
    on="product_id",
)

# Attach those per-product figures to every (order, product) item row.
df_product_order_info = pd.merge(
    dfs["olist_order_items_dataset"][["order_id", "product_id"]],
    df_product_price_measures,
    how="left",
    on="product_id",
)

# Average the product figures within each order, then add the item count.
# The right join keeps every order present in df_items_per_order.
df_basket_info = pd.merge(
    df_product_order_info.drop(columns=["product_id"])
    .groupby("order_id")
    .mean()
    .reset_index(),
    df_items_per_order,
    how="right",
    on="order_id",
)
df_basket_info

Unnamed: 0,order_id,price,freight_value,product_weight_g,product_length_cm,product_height_cm,product_width_cm,number_of_products
0,00010242fe8c5a6d1ba2dd792cb16214,59.233333,22.033333,650.0,28.0,9.0,14.0,1
1,00018f77f2f0320c557190d7a144bdd3,239.900000,19.930000,30000.0,50.0,30.0,40.0,1
2,000229ec398224ef6ca0657da4fc703e,199.000000,18.606667,3050.0,33.0,13.0,33.0,1
3,00024acbcdf0a6daa1e931b038114c75,12.990000,12.790000,200.0,16.0,10.0,15.0,1
4,00042b26cf59d7ce69dfabb4e55b4fd9,202.400000,27.052500,3750.0,35.0,40.0,30.0,1
...,...,...,...,...,...,...,...,...
98661,fffc94f6ce00a00581880bf54a75a037,291.185319,41.179787,10150.0,89.0,15.0,40.0,1
98662,fffcd46ef2263f404302a634eb57f7eb,355.000000,27.465000,8950.0,45.0,26.0,38.0,1
98663,fffce4705a9662cd70adb13d4a31832d,93.157143,17.602857,967.0,21.0,24.0,19.0,1
98664,fffe18544ffabc95dfada21779c9644f,55.899091,12.528182,100.0,20.0,20.0,20.0,1


In [6]:
# Preview the RFM table enriched with per-order basket features. The left join keeps
# every row of `df`; the result is only displayed, not stored.
pd.merge(df, df_basket_info, on="order_id", how="left")

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_status,order_purchase_timestamp_class,Recency,Frequecy,Frequecy_class,order_id,order_purchase_timestamp,payment_sequential,payment_installments,payment_value,payment_value_class,price,freight_value,product_weight_g,product_length_cm,product_height_cm,product_width_cm,number_of_products
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,delivered,Low,2348,1,Low,00e7ee1b050b8499577073aeb2a297a1,2017-05-16 15:05:35,1.0,2.0,146.87,Low,119.362881,31.543220,8683.0,54.0,64.0,31.0,1.0
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,delivered,Medium,2107,1,Low,29150127e6685892b6eab3eec79f59c7,2018-01-12 20:48:24,1.0,8.0,335.48,Low,291.185319,41.179787,10150.0,89.0,15.0,40.0,1.0
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,delivered,High,1980,1,Low,b2059ed67ce144a36e2aa97d2c9e9ad2,2018-05-19 16:07:45,1.0,7.0,157.73,Low,143.958000,42.509000,8267.0,52.0,52.0,17.0,1.0
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,delivered,High,2047,1,Low,951670f92359f4fe4a63112aa7306eba,2018-03-13 16:06:38,1.0,1.0,173.30,Low,164.758095,37.736667,12160.0,56.0,51.0,28.0,1.0
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,delivered,High,1909,1,Low,6b7d50bd145f6fc7f33cebabd7e49d0f,2018-07-29 09:51:30,1.0,8.0,252.25,Low,230.000000,35.017500,5200.0,45.0,15.0,35.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99436,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,3937,sao paulo,SP,delivered,High,2022,1,Low,6760e20addcf0121e9d58f2f1ff14298,2018-04-07 15:48:17,1.0,6.0,88.78,Low,78.900000,17.278000,611.0,22.0,22.0,23.0,1.0
99437,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,6764,taboao da serra,SP,delivered,High,2025,1,Low,9ec0c8947d973db4f4e8dcf1fbfa8f1b,2018-04-04 08:20:22,1.0,3.0,129.06,Low,106.914286,24.420000,1211.0,25.0,24.0,22.0,1.0
99438,5e28dfe12db7fb50a4b2f691faecea5e,e9f50caf99f032f0bf3c55141f019d99,60115,fortaleza,CE,delivered,High,2021,1,Low,fed4434add09a6f332ea398efd656a5c,2018-04-08 20:11:50,1.0,5.0,56.04,Low,37.000000,18.050000,870.0,25.0,20.0,18.0,1.0
99439,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120,canoas,RS,delivered,Medium,2177,1,Low,e31ec91cea1ecf97797787471f98a8c2,2017-11-03 21:08:33,1.0,2.0,711.07,Low,644.988444,31.671333,710.0,19.0,13.0,14.0,1.0
