In [29]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler


# Load the customer table from disk.
df = pd.read_csv('customer_profiles.csv')

# Column groups used for the feature matrix. The dummy names follow
# get_dummies' `<column>_<value>` convention, so they assume `gender` holds
# F/M and `ever_married` holds No/Yes.
numeric_cols = ['age', 'income', 'kids']
dummy_cols = ['gender_F', 'gender_M', 'ever_married_No', 'ever_married_Yes']

# One-hot encode the categorical columns (the originals are replaced by
# their indicator columns).
df = pd.get_dummies(df, columns=['gender', 'ever_married'])

# Standardize the numeric columns so no single one dominates the Euclidean
# distances DBSCAN works with; the scaled values overwrite the raw ones.
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Feature matrix: scaled numerics followed by the indicator columns.
X = df[numeric_cols + dummy_cols]

# Density-based clustering; the resulting label of every row is stored in
# df['cluster'].
dbscan = DBSCAN(eps=0.5, min_samples=2)
df['cluster'] = dbscan.fit_predict(X)

# Show the scaled numeric features next to the assigned cluster label.
print(df[numeric_cols + ['cluster']])

            age    income      kids  cluster
0      0.034889  2.157419  0.887310        0
1      1.185430  1.601801 -1.145928        1
2      0.782741  0.212756  0.887310        2
3      0.610160 -0.574370  2.242802        3
4      0.207470 -0.666973  2.242802        3
...         ...       ...       ...      ...
14820 -0.540382 -0.528068 -1.145928       19
14821  0.380051  0.305359  1.565056       16
14822 -0.310274  0.351660  0.209564        7
14823  1.645647 -0.713274 -1.145928       19
14824  0.437578  0.768374  1.565056       17

[14825 rows x 4 columns]


In [30]:
# Distinct cluster labels in order of first appearance; DBSCAN uses -1 for
# points it treats as noise.
pd.unique(df["cluster"])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, -1, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
       67, 68, 69, 70, 71, 72, 73, 74, 75, 76], dtype=int64)

In [31]:
# Sizes of the inspected clusters (labels 0-7 plus DBSCAN's noise label -1),
# computed in one vectorised pass. This replaces a row-by-row Python loop
# that looked values up with df["cluster"][i]; that lookup is label-based,
# so it raises KeyError as soon as the index is not 0..n-1 (for example
# after filtering rows), and it is slow on large frames.
inspected_labels = [0, 1, 2, 3, 4, 5, 6, 7, -1]

# reindex() puts the counts in the order above and reports 0 for any label
# that does not occur, matching what the individual counters produced.
cluster_sizes = df["cluster"].value_counts().reindex(inspected_labels, fill_value=0)

# Keep the per-label names (as plain ints) so any later cell that reads
# c0..c7 / c_1 still works.
c0, c1, c2, c3, c4, c5, c6, c7, c_1 = (int(size) for size in cluster_sizes)
print(c0, c1, c2, c3, c4, c5, c6, c7, c_1)

442 970 646 230 420 611 1385 1120 71


In [61]:
# Re-run DBSCAN with a wider neighbourhood radius (eps=0.9). This overwrites
# the labels previously stored in df['cluster'].
#
# min_samples=1 replaces the former min_samples=0: scikit-learn validates
# min_samples >= 1 and rejects 0 on current releases. The labels are
# unchanged, because a point's neighbourhood always contains the point
# itself, so with either value every point is a core point and nothing is
# labelled as noise.
dbscan = DBSCAN(eps=0.9, min_samples=1)
df['cluster'] = dbscan.fit_predict(X)

# Count the number of points in each cluster
cluster_counts = df['cluster'].value_counts()

# Keep the five largest clusters. The variable name still says "3" so that
# any later cell referring to it keeps working; it holds five entries.
top_3_clusters = cluster_counts.nlargest(5)

print(top_3_clusters)


2    4454
3    4030
1    3201
0    2928
4     110
Name: cluster, dtype: int64
