In [451]:
import numpy as np
import pandas as pd

In [452]:
# Load data
data = pd.read_csv('D:\crime_data.csv', index_col=0)

In [453]:
data.head()

Unnamed: 0_level_0,Murder,Assault,UrbanPopulation,Rape
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,13.2,236,58,21.2
Alaska,10.0,263,48,44.5
Arizona,8.1,294,80,31.0
Arkansas,8.8,190,50,19.5
California,9.0,276,91,40.6


In [454]:
data.describe()

Unnamed: 0,Murder,Assault,UrbanPopulation,Rape
count,50.0,50.0,50.0,50.0
mean,7.788,170.76,65.54,21.232
std,4.35551,83.337661,14.474763,9.366385
min,0.8,45.0,32.0,7.3
25%,4.075,109.0,54.5,15.075
50%,7.25,159.0,66.0,20.1
75%,11.25,249.0,77.75,26.175
max,17.4,337.0,91.0,46.0


In [455]:
# Normalize the data
data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)

In [456]:
#Manhattan Distance
def distance(x1, x2):
    return np.sum(np.abs(x1 - x2))

In [457]:
def k_means_clustering(data, k):
    # Choose initial centroids randomly
    initial_centroids = data.sample(n=k, random_state=1)
    centroids = initial_centroids.copy()

    clusters = {i: [] for i in range(k)}

    # Assign each data point to its nearest centroid
    for index, row in data.iterrows():
        distances = [distance(row[1:], centroids.iloc[i][1:]) for i in range(k)]
        closest_centroid_index = np.argmin(distances)
        clusters[closest_centroid_index].append(index)

    # Update centroids
    for i in range(k):
        centroids.iloc[i] = data.loc[clusters[i]].mean()

    # Continue iterating until maximum number of iterations
    max_iterations = 100
    iteration = 0
    while True:
        
        new_clusters = {i: [] for i in range(k)}

        # Assign each data point to its nearest centroid
        for index, row in data.iterrows():
            distances = [distance(row[1:], centroids.iloc[i][1:]) for i in range(k)]
            closest_centroid_index = np.argmin(distances)
            new_clusters[closest_centroid_index].append(index)

        # converge if no points have changed clusters
        if new_clusters == clusters:
            break

        # Otherwise, update clusters and centroids
        clusters = new_clusters
        for i in range(k):
            centroids.iloc[i] = data.loc[clusters[i]].mean()

        # Check if maximum number of iterations reached
        iteration += 1
        if iteration >= max_iterations:
            break

    return clusters, centroids 

In [458]:
# Get user input for number of clusters
k = int(input('Enter the number of clusters (k): '))

In [459]:
# Cluster the data
clusters, centroids = k_means_clustering(data, k)

In [460]:
# Print clusters
for i in range(k):
    cluster_indices = clusters[i]
    cluster_data = data.loc[cluster_indices]
    cluster_states = cluster_data.index.tolist()
    print(f'Cluster {i+1}:\n {cluster_states}')

    
# Detect and remove outliers using IQR
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
outliers = ((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).any(axis=1)
if outliers.any():
    print("\nOutlier records:")
    print(data[outliers])
    data = data[~outliers]  

Cluster 1:
 ['Alabama', 'Alaska', 'Arizona', 'California', 'Colorado', 'Delaware', 'Florida', 'Georgia', 'Illinois', 'Louisiana', 'Maryland', 'Michigan', 'Missouri', 'Nevada', 'New Mexico', 'New York', 'Oregon', 'South Carolina', 'Tennessee', 'Texas', 'Washington']
Cluster 2:
 ['Arkansas', 'Connecticut', 'Hawaii', 'Idaho', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Maine', 'Massachusetts', 'Minnesota', 'Mississippi', 'Montana', 'Nebraska', 'New Hampshire', 'New Jersey', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Pennsylvania', 'Rhode Island', 'South Dakota', 'Utah', 'Vermont', 'Virginia', 'West Virginia', 'Wisconsin', 'Wyoming']

Outlier records:
          Murder   Assault  UrbanPopulation      Rape
State                                                
Alaska  0.513019  1.118060        -1.224067  2.509424
Nevada  1.023254  0.984726         1.078909  2.671197
