# Cluster Analysis and Profiling

In this notebook, we will analyze the characteristics of each cluster and try to profile them.

In [1]:
import pandas as pd
from data_utils import load_and_scale_data
from sklearn.cluster import KMeans

# Load original and scaled data
df, X_scaled_df = load_and_scale_data()

# Apply KMeans clustering
kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(X_scaled_df)
df['Cluster'] = y_kmeans

# Select only numeric columns for aggregation
numeric_cols = df.select_dtypes(include='number').columns

# Compute mean values per cluster (numeric columns only)
cluster_profiles = df.groupby('Cluster')[numeric_cols].mean()

print(cluster_profiles)

print('\nPossible Customer Types:')
income_mean = df['Annual Income (k$)'].mean()
spending_mean = df['Spending Score (1-100)'].mean()

for cluster in cluster_profiles.index:
    print(f'Cluster {cluster}:')
    income = cluster_profiles.loc[cluster, 'Annual Income (k$)']
    spending = cluster_profiles.loc[cluster, 'Spending Score (1-100)']
    if income > income_mean and spending > spending_mean:
        print('  High Income, High Spending')
    elif income > income_mean and spending <= spending_mean:
        print('  High Income, Low Spending')
    elif income <= income_mean and spending > spending_mean:
        print('  Low Income, High Spending')
    else:
        print('  Low Income, Low Spending')


         CustomerID        Age  Annual Income (k$)  Spending Score (1-100)  \
Cluster                                                                      
0         83.872340  55.638298           54.382979               48.851064   
1        159.743590  39.871795           86.102564               19.358974   
2        161.025000  32.875000           86.100000               81.525000   
3         24.100000  46.250000           26.750000               18.350000   
4         55.648148  25.185185           41.092593               62.240741   

         Cluster  
Cluster           
0            0.0  
1            1.0  
2            2.0  
3            3.0  
4            4.0  

Possible Customer Types:
Cluster 0:
  Low Income, Low Spending
Cluster 1:
  High Income, Low Spending
Cluster 2:
  High Income, High Spending
Cluster 3:
  Low Income, Low Spending
Cluster 4:
  Low Income, High Spending


In [2]:
import os
import pandas as pd # Make sure pandas is imported if not already

# --- Start of new code to add ---

# Define the path for the output directory
output_dir = 'outputs/'

# Get and print the current working directory for debugging
print("Current Working Directory (CWD):", os.getcwd())

# Get and print the absolute path where 'outputs' is expected to be
absolute_output_dir = os.path.abspath(output_dir)
print("Absolute path for 'outputs' directory:", absolute_output_dir)

# Create the directory if it doesn't exist
# exist_ok=True means it won't raise an error if the directory already exists
try:
    os.makedirs(output_dir, exist_ok=True)
    print(f"Directory '{output_dir}' (resolved to '{absolute_output_dir}') ensured to exist.")
except OSError as e:
    print(f"Error creating directory '{output_dir}': {e}")
    print("This often indicates a permissions issue or an invalid path character.")
    print("Please check if you have write permissions to the parent directory.")
    # Re-raise the exception to make it clear if directory creation failed
    raise

# --- End of new code to add ---

# Your existing line to save the DataFrame
df.to_csv(os.path.join(output_dir, 'clustered_customers.csv'), index=False)

print(f"DataFrame successfully saved to {os.path.join(output_dir, 'clustered_customers.csv')}")

OSError: Cannot save file into a non-existent directory: 'outputs'