In [1]:
import pandas as pd
from datetime import datetime
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import train_test_split
import numpy as np

# Load the data
file_path = 'customer_details_v3.csv'  # Update with your actual file path
customer_data = pd.read_csv(file_path)

# Define the current date for RFM analysis
current_date = datetime.strptime('2018-10-18', '%Y-%m-%d')

# Calculate recency (number of days since the last order)
customer_data['last_order'] = pd.to_datetime(customer_data['last_order'])
customer_data['recency'] = (current_date - customer_data['last_order']).dt.days

# Calculate frequency using the 'connection_days' column
customer_data['frequency'] = customer_data['connection_days']

# Calculate monetary value using the 'total_spent' column
customer_data['monetary'] = customer_data['total_spent']

# Ensure the columns are numeric
customer_data['recency'] = pd.to_numeric(customer_data['recency'], errors='coerce')
customer_data['frequency'] = pd.to_numeric(customer_data['frequency'], errors='coerce')
customer_data['monetary'] = pd.to_numeric(customer_data['monetary'], errors='coerce')
customer_data['avg_review_score'] = pd.to_numeric(customer_data['avg_review_score'], errors='coerce')

# Handle missing values by replacing them with the mean of the column
customer_data['recency'].fillna(customer_data['recency'].mean(), inplace=True)
customer_data['frequency'].fillna(customer_data['frequency'].mean(), inplace=True)
customer_data['monetary'].fillna(customer_data['monetary'].mean(), inplace=True)
customer_data['avg_review_score'].fillna(customer_data['avg_review_score'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_data['recency'].fillna(customer_data['recency'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_data['frequency'].fillna(customer_data['frequency'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never

In [2]:
def stability_test(data, k, random_state=42):
    # Split the data into 3 equal parts
    part1, part2, part3 = np.array_split(data, 3)

    # Standard scaling on part1
    scaler1 = StandardScaler()
    part1_scaled = scaler1.fit_transform(part1)

    # KMeans on part1
    kmeans1 = KMeans(n_clusters=k, random_state=random_state).fit(part1_scaled)
    labels1 = kmeans1.labels_

    # Standard scaling on part2
    scaler2 = StandardScaler()
    part2_scaled = scaler2.fit_transform(part2)

    # KMeans on part2
    kmeans2 = KMeans(n_clusters=k, random_state=random_state).fit(part2_scaled)
    labels2 = kmeans2.labels_

    # Apply scaler1 to part3 and predict labels with kmeans1
    part3_scaled_with_scaler1 = scaler1.transform(part3)
    labels3_1 = kmeans1.predict(part3_scaled_with_scaler1)

    # Apply scaler2 to part3 and predict labels with kmeans2
    part3_scaled_with_scaler2 = scaler2.transform(part3)
    labels3_2 = kmeans2.predict(part3_scaled_with_scaler2)

    # Compare the labels
    ari_3_1_vs_3_2 = adjusted_rand_score(labels3_1, labels3_2)
    
    return ari_3_1_vs_3_2

In [5]:
# Select the necessary columns for clustering
features = ['recency', 'frequency', 'monetary', 'avg_review_score']
# features = ['recency', 'frequency', 'monetary']
clustering_features = customer_data[features]

# Test the function
k = 3
ari_score = stability_test(clustering_features, k)
print(f'Stability ARI Score: {ari_score}')

Stability ARI Score: 0.5020559472110425


  return bound(*args, **kwds)


In [6]:
# Select the necessary columns for clustering
features = ['recency', 'frequency', 'monetary', 'avg_review_score']
# features = ['recency', 'frequency', 'monetary']
clustering_features = customer_data[features]

# Test the function
k = 4
ari_score = stability_test(clustering_features, k)
print(f'Stability ARI Score: {ari_score}')

Stability ARI Score: 0.9781080959431614


  return bound(*args, **kwds)
