In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [53]:
# Load the mall-customer dataset; the relative path means the CSV must sit in
# the notebook's working directory.
data = pd.read_csv("Mall_Customers.csv")

In [54]:
# Turn off pandas' row/column truncation (this stays in effect for every later
# cell in the session), then display the loaded frame.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
data

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
5,6,Female,22,17,76
6,7,Female,35,18,6
7,8,Female,23,18,94
8,9,Male,64,19,3
9,10,Female,30,19,72


In [55]:
# Quick per-column overview: numeric columns report their min/max range,
# object (text) columns report their distinct values.
for column in data.columns:
    series = data[column]

    if series.dtype != 'object':
        min_value, max_value = series.min(), series.max()
        print(f"Column {column} has values in range {min_value} to {max_value}")
        print("\n")
        continue

    uniques = series.unique()
    print(f"Column {column} has {len(uniques)} number of unique values")
    print(uniques)
    print("\n")

Column CustomerID has values in range 1 to 200


Column Genre has 2 number of unique values
['Male' 'Female']


Column Age has values in range 18 to 70


Column Annual Income (k$) has values in range 15 to 137


Column Spending Score (1-100) has values in range 1 to 99




In [56]:
# Remove the text column "Genre" so that only numeric columns remain, then
# display the reduced frame.
data = data.drop(columns=["Genre"])
data

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100)
0,1,19,15,39
1,2,21,15,81
2,3,20,16,6
3,4,23,16,77
4,5,31,17,40
5,6,22,17,76
6,7,35,18,6
7,8,23,18,94
8,9,64,19,3
9,10,30,19,72


In [57]:
# Give the columns short, code-friendly names.
# The key for the age column must be 'Age' (capitalised, as in the CSV header);
# rename() silently ignores keys that match no column, so a lower-case 'age'
# key leaves that column unrenamed.
data = data.rename(
    columns={
        'CustomerID': 'custid',
        'Age': 'age',
        'Annual Income (k$)': 'income',
        'Spending Score (1-100)': 'spendrate',
    }
)

In [58]:
# Preview the first rows to check the column names after the rename.
data.head()

Unnamed: 0,custid,Age,income,spendrate
0,1,19,15,39
1,2,21,15,81
2,3,20,16,6
3,4,23,16,77
4,5,31,17,40


In [59]:
# Number of rows in the frame.
len(data.index)

200

In [60]:
# Number of columns in the frame.
len(data.columns)

4

In [61]:
# Pre-allocate a square matrix with one row and one column per record.
distances = np.zeros((len(data), len(data)))

In [62]:
# Confirm the matrix is square with one row/column per record.
distances.shape

(200, 200)

In [63]:
from scipy.spatial import distance

# Pairwise Euclidean distance between every pair of rows. The first column
# (the customer id) is an identifier, not a feature, so it is excluded.
# cdist builds the whole matrix in one vectorised call instead of running
# n * n Python-level iloc lookups and scipy calls.
feature_values = data.iloc[:, 1:].to_numpy(dtype=float)
distances = distance.cdist(feature_values, feature_values, metric="euclidean")

In [82]:
# Print NumPy floats in fixed-point notation with two decimals, then report
# the mean of the full distance matrix (the zero diagonal is included).
np.set_printoptions(precision=2, suppress=True)
np.mean(distances)

50.29809993137302

In [91]:
import numpy as np

def calculate_distance_matrix(data):
    """Return the pairwise Euclidean distance matrix between the rows of ``data``.

    The first column is skipped (in this notebook it holds the customer id);
    every remaining column is used as a numeric feature.

    Parameters
    ----------
    data : pandas.DataFrame
        One record per row. Columns from position 1 onward must be
        convertible to float.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_rows, n_rows)`` where entry ``[i, j]`` is the
        Euclidean distance between row ``i`` and row ``j``.
    """
    # Imported locally so the function does not depend on another notebook
    # cell having been executed first.
    from scipy.spatial.distance import cdist

    num_rows = data.shape[0]
    features = data.iloc[:, 1:].to_numpy(dtype=float)

    if features.size == 0:
        # No rows or no feature columns: there is nothing to measure.
        return np.zeros((num_rows, num_rows))

    # One vectorised call replaces the n * n Python loop over iloc lookups.
    return cdist(features, features, metric="euclidean")

def dbscan(distance_matrix, epsilon, min_samples):
    """Cluster points with DBSCAN using a precomputed distance matrix.

    Parameters
    ----------
    distance_matrix : numpy.ndarray
        Square array where entry ``[i, j]`` is the distance between points
        ``i`` and ``j``.
    epsilon : float
        Neighbourhood radius; ``j`` is a neighbour of ``i`` when
        ``distance_matrix[i, j] <= epsilon``.
    min_samples : int
        Minimum neighbourhood size for a point to start or extend a cluster.

    Returns
    -------
    numpy.ndarray
        One label per point: cluster ids start at 1, and ``-1`` marks noise.
    """
    num_samples = distance_matrix.shape[0]
    # -1 doubles as "not yet assigned" and "noise".
    labels = np.full(num_samples, -1)

    cluster_id = 0
    for point_idx in range(num_samples):
        # Already absorbed into a cluster by an earlier expansion.
        if labels[point_idx] != -1:
            continue

        within_eps = distance_matrix[point_idx, :num_samples] <= epsilon
        neighbors = np.flatnonzero(within_eps).tolist()

        # Too few neighbours: the point keeps its -1 label unless a later
        # cluster expansion reaches it.
        if len(neighbors) >= min_samples:
            cluster_id += 1
            expand_cluster(
                distance_matrix, labels, point_idx, neighbors,
                cluster_id, epsilon, min_samples,
            )

    return labels

def expand_cluster(distance_matrix, labels, point_idx, neighbors, cluster_id, epsilon, min_samples):
    """Grow cluster ``cluster_id`` outward from ``point_idx``; updates ``labels`` in place.

    Parameters
    ----------
    distance_matrix : numpy.ndarray
        Square array of pairwise distances.
    labels : numpy.ndarray
        Per-point labels, modified in place. Only points currently labelled
        ``-1`` are claimed; points already in another cluster are left alone.
    point_idx : int
        Index of the point the cluster starts from.
    neighbors : list of int
        Indices within ``epsilon`` of ``point_idx``. This list is only read,
        never modified.
    cluster_id : int
        Label written to every point that joins the cluster.
    epsilon : float
        Neighbourhood radius (inclusive).
    min_samples : int
        Minimum neighbourhood size for a claimed point to pull in its own
        neighbours.

    Returns
    -------
    None
    """
    dist = np.asarray(distance_matrix)
    labels[point_idx] = cluster_id

    # Private, de-duplicated work list. Appending every neighbourhood to the
    # caller's list would let it fill with repeats and grow towards n * n.
    queued = np.zeros(dist.shape[0], dtype=bool)
    frontier = []
    for idx in neighbors:
        if not queued[idx]:
            queued[idx] = True
            frontier.append(idx)

    pos = 0
    while pos < len(frontier):
        current = frontier[pos]
        pos += 1

        # Skip the starting point and anything already assigned to a cluster.
        if labels[current] != -1:
            continue
        labels[current] = cluster_id

        reachable = np.flatnonzero(dist[current] <= epsilon)
        # Only a point with a dense enough neighbourhood spreads the cluster;
        # otherwise it joins but does not pull in further points.
        if reachable.size >= min_samples:
            fresh = reachable[~queued[reachable]]
            queued[fresh] = True
            frontier.extend(fresh.tolist())

# Run DBSCAN on the customer data.
epsilon = 10     # Neighbourhood radius, in the units of the distance matrix.
min_samples = 4  # Minimum neighbourhood size (the point itself counts) to seed or extend a cluster.
distance_matrix = calculate_distance_matrix(data)  # Pairwise distances between all rows.

cluster_labels = dbscan(distance_matrix, epsilon=epsilon, min_samples=min_samples)


In [92]:
# One label per row of `data`: positive numbers are cluster ids, -1 marks noise.
cluster_labels

array([-1,  1, -1,  1,  2,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1,  1,  2,
        1, -1, -1,  2,  1, -1,  1, -1,  1, -1, -1,  2,  1, -1,  1, -1,  1,
       -1,  1, -1,  1, -1,  1, -1, -1,  3,  3, -1,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  4, -1,  4,  3,  4, -1,  4,  6,  4, -1,  4,  5,  4,
       -1,  4,  5,  4, -1,  4, -1,  4,  5,  4, -1,  4, -1,  4,  6,  4,  6,
        4,  6,  4, -1,  4, -1,  4, -1,  4,  5,  4, -1,  4,  6,  4,  6, -1,
        6,  4, -1,  4, -1,  4, -1,  4, -1,  4, -1,  4, -1,  4, -1,  4, -1,
       -1, -1,  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])