In [56]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler


# Load the raw customer profiles.
df = pd.read_csv('customer_profiles.csv')

# One-hot encode the categorical columns; each category becomes its own
# indicator column (e.g. gender_F, gender_M, ever_married_No, ...).
df = pd.get_dummies(df, columns=['gender', 'ever_married'])

# 'home_state' is reduced to a single binary flag instead of being one-hot or
# frequency encoded.  The comparison is vectorised; missing values compare
# unequal and therefore map to 0, exactly like the row-wise lambda it replaces.
df['is_Florida'] = df['home_state'].eq('Florida').astype('int64')

# Standardize numerical variables (zero mean, unit variance) in place.
scaler = StandardScaler()
numeric_cols = ['age', 'income', 'kids']
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Combine relevant columns into the feature matrix used for clustering.
# NOTE(review): 'age' is standardized above but is not part of the feature
# matrix, and any additional gender dummy produced by get_dummies (e.g.
# 'gender_O') is not included either -- confirm both omissions are intended.
feature_cols = [
    'income',
    'kids',
    'gender_F',
    'gender_M',
    'ever_married_No',
    'ever_married_Yes',
    'is_Florida',
]
X = df[feature_cols]

# Apply DBSCAN; fit_predict returns one label per row (-1 marks noise points).
dbscan = DBSCAN(eps=0.5, min_samples=2)
df['cluster'] = dbscan.fit_predict(X)

# Display the clusters
print(df[['income', 'kids', 'cluster']])

         income      kids  cluster
0      2.157419  0.887310        0
1      1.601801 -1.145928        1
2      0.212756  0.887310        2
3     -0.574370  2.242802        3
4     -0.666973  2.242802        3
...         ...       ...      ...
14820 -0.528068 -1.145928       28
14821  0.305359  1.565056       19
14822  0.351660  0.209564        7
14823 -0.713274 -1.145928       31
14824  0.768374  1.565056       21

[14825 rows x 3 columns]


In [57]:
# Distinct cluster labels, in order of first appearance in the frame
# (DBSCAN uses the label -1 for noise points).
pd.unique(df["cluster"])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, -1, 75, 76, 77, 78, 79, 80],
      dtype=int64)

In [58]:
# Tally how many rows carry each of the labels 0-7 and the noise label -1.
# A single value_counts() pass replaces the row-by-row Python loop, and it does
# not rely on the frame having a default 0..n-1 index the way df["cluster"][i]
# (a label-based lookup) did.
label_counts = df["cluster"].value_counts()

# Labels absent from the data count as 0.  The individual counter names are
# kept so that anything referring to them afterwards keeps working.
c0, c1, c2, c3, c4, c5, c6, c7, c_1 = (
    int(label_counts.get(label, 0)) for label in (0, 1, 2, 3, 4, 5, 6, 7, -1)
)
print(c0, c1, c2, c3, c4, c5, c6, c7, c_1)

313 264 124 180 312 480 1088 888 18


In [63]:
# Re-run DBSCAN with a larger neighbourhood radius (eps) and a higher density
# threshold (min_samples); this overwrites the labels in df['cluster'].
dbscan = DBSCAN(eps=0.9, min_samples=10)
df['cluster'] = dbscan.fit_predict(X)

# Count the number of points carrying each label (the noise label -1 included)
cluster_counts = df['cluster'].value_counts()

# Keep the 5 largest clusters.  DBSCAN's -1 label marks noise rather than a
# cluster, so it is left out of the ranking (cluster_counts itself still
# contains it).
top_clusters = cluster_counts[cluster_counts.index != -1].nlargest(5)

print(top_clusters)


3    3501
4    3196
5    2258
0    2116
2     953
Name: cluster, dtype: int64


In [64]:
# Group the rows by their DBSCAN label so each cluster can be profiled.
cluster_groups = df.groupby('cluster')

# Iterate over each cluster and display the mean of each numeric feature
for name, group in cluster_groups:
    print(f"Cluster {name}:")
    # numeric_only=True restricts the mean to numeric/boolean columns.  The
    # frame still holds the text column 'home_state'; without this flag pandas
    # 1.5 emits a FutureWarning and pandas 2.x raises a TypeError.
    print(group.mean(numeric_only=True))
    print("\n")

Cluster -1:
age                                  1.195018e+00
became_member_on (year month day)    2.017066e+07
income                               2.487681e-01
kids                                 1.342590e-01
gender_F                             0.000000e+00
gender_M                             0.000000e+00
gender_O                             1.000000e+00
ever_married_No                      5.555556e-01
ever_married_Yes                     4.444444e-01
is_Florida                           1.000000e+00
cluster                             -1.000000e+00
dtype: float64


Cluster 0:
age                                 -2.497561e-01
became_member_on (year month day)    2.016711e+07
income                               1.884016e-01
kids                                 3.244049e-02
gender_F                             1.000000e+00
gender_M                             0.000000e+00
gender_O                             0.000000e+00
ever_married_No                      1.000000e+00
ever_marri

  print(group.mean())
