# Clustering With K-Means
### Unsupervised Model

**import libraries**
<a id='import libraries'></a>

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

**1. Load the USArrest.csv dataset**
<a id='Load the dataset'></a>

In [None]:
# Fetch the US arrests dataset straight from the course's GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/UrielBender/BigData/master/DataSets/USArrest.csv"
arrest_df = pd.read_csv(DATA_URL)

**2. Perform EDA - Explore the data set**
<a id='knowing your data'></a>

In [None]:
# Count the missing values in each column.
arrest_df.isnull().sum()

state       0
Murder      0
Assault     0
UrbanPop    0
Rape        0
dtype: int64

In [None]:
# Preview the first five rows.
arrest_df.head()

Unnamed: 0,state,Murder,Assault,UrbanPop,Rape
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
4,California,9.0,276,91,40.6


In [None]:
# (number of rows, number of columns).
arrest_df.shape

(50, 5)

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
arrest_df.describe()

Unnamed: 0,Murder,Assault,UrbanPop,Rape
count,50.0,50.0,50.0,50.0
mean,7.788,170.76,65.54,21.232
std,4.35551,83.337661,14.474763,9.366385
min,0.8,45.0,32.0,7.3
25%,4.075,109.0,54.5,15.075
50%,7.25,159.0,66.0,20.1
75%,11.25,249.0,77.75,26.175
max,17.4,337.0,91.0,46.0


**3. For visualization and simplicity, reduce the dataframe to 2 dimensions only**

The 2 most correlated features in this data set are - **Murder & Assault**

In [None]:
# Compute the correlation matrix once, over the numeric columns only: the
# 'state' column holds strings, and recent pandas versions raise an error
# instead of silently skipping non-numeric columns in DataFrame.corr().
corr_matrix = arrest_df.select_dtypes(include="number").corr()

fig = go.Figure()
fig.add_traces(go.Heatmap(
    z=corr_matrix,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    # Pin the color scale to the full correlation range [-1, 1].
    zmax=1,
    zmin=-1
))
fig.update_layout({
    'title':"Features Correlation Heatmap"
})

In [None]:
# Keep only the two most correlated features. Selecting the columns (instead
# of dropping the others in place) lets this cell be re-run without raising a
# KeyError for columns that are already gone.
arrest_df = arrest_df[['Murder', 'Assault']].copy()
arrest_df.head()

Unnamed: 0,Murder,Assault
0,13.2,236
1,10.0,263
2,8.1,294
3,8.8,190
4,9.0,276


#### 4. Scatter plot the data:

In [None]:
# Raw data scatter: Murder on the x-axis, Assault on the y-axis.
fig = go.Figure(data=go.Scatter(x=arrest_df.Murder, y=arrest_df.Assault, mode='markers'))

# Axis titles must follow the data bound to each axis (x=Murder, y=Assault);
# the title follows the "Y by X" convention.
fig.update_layout(
    title="Assault by Murder",
    xaxis_title="Murder",
    yaxis_title="Assault",
    font=dict(
        family="Courier New, monospace",
        size=14,
        color="RebeccaPurple"
    ),
    width=800
)
fig.show()

# How Does K-Means Work?

Initialization: choose $K$ random centroids $\mu_1, \dots, \mu_K$

Repeat until convergence {

for $i = 1$ to $m$

$c^{(i)}$ := index (from 1 to $K$) of the cluster centroid closest to $x^{(i)}$

$$c^{(i)} := \arg\min_{k} \lVert x^{(i)} - \mu_k \rVert^2$$

for $k = 1$ to $K$

$\mu_k$ := average of the samples assigned to cluster $k$

}

**Perform K Means Clustering**
<a id='K Means Clustering'></a>

**5. scale the features using Standard Scaler**
<a id='step1:scaling the features'></a>

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize both features to zero mean and unit variance so that neither
# dominates the Euclidean distances K-Means relies on.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(arrest_df)

# Wrap the scaled array back into a dataframe with the original column names.
scaled_df = pd.DataFrame(scaled_values, columns=arrest_df.columns)
scaled_df.head()

Unnamed: 0,Murder,Assault
0,1.255179,0.790787
1,0.513019,1.11806
2,0.072361,1.493817
3,0.234708,0.233212
4,0.281093,1.275635


In [None]:
# Sanity check: each scaled column should have a mean of (approximately) zero.
scaled_df.mean(axis=0).round(2)

Murder    -0.0
Assault    0.0
dtype: float64

In [None]:
# Sanity check: each scaled column should have a standard deviation of ~1.
# NOTE: pandas' std() is the sample std (ddof=1) while StandardScaler divides
# by the population std, so the value is only ~1; rounding to 1 decimal hides it.
scaled_df.std(axis=0).round(1)

Murder     1.0
Assault    1.0
dtype: float64

**import k-means from sklearn**
<a id='step2:import k-means from sklearn'></a> 

In [None]:
from sklearn.cluster import KMeans

**6. create a K-Means model with 3 clusters**
<a id='step3:create a K-Means model with 3 clusters'></a> 

In [None]:
# K-Means with 3 clusters; random_state is fixed so the run is reproducible.
kmeans = KMeans(n_clusters=3, random_state=1234)

**7. Fit and run the model to predict the clustering.**
<a id='step4:fit the model'></a> 

In [None]:
# Fit the model on the scaled features and return the cluster index of each row.
kmeans.fit_predict(scaled_df)

array([2, 2, 2, 0, 2, 0, 1, 0, 2, 2, 1, 1, 2, 0, 1, 0, 0, 2, 1, 2, 0, 2,
       1, 2, 0, 0, 1, 2, 1, 0, 2, 2, 2, 1, 0, 0, 0, 0, 0, 2, 1, 2, 2, 1,
       1, 0, 0, 1, 1, 0], dtype=int32)

**8. Add the cluster labels to the USA dataframe**

In [None]:
# labels_ holds the cluster assignment of every row from the last fit.
labels = kmeans.labels_
labels

array([2, 2, 2, 0, 2, 0, 1, 0, 2, 2, 1, 1, 2, 0, 1, 0, 0, 2, 1, 2, 0, 2,
       1, 2, 0, 0, 1, 2, 1, 0, 2, 2, 2, 1, 0, 0, 0, 0, 0, 2, 1, 2, 2, 1,
       1, 0, 0, 1, 1, 0], dtype=int32)

In [None]:
# Attach the cluster label of each row to the (unscaled) dataframe;
# scaled_df was built from arrest_df, so the rows are in the same order.
arrest_df['cluster'] = labels
arrest_df.head(10)

Unnamed: 0,Murder,Assault,cluster
0,13.2,236,2
1,10.0,263,2
2,8.1,294,2
3,8.8,190,0
4,9.0,276,2
5,7.9,204,0
6,3.3,110,1
7,5.9,238,0
8,15.4,335,2
9,17.4,211,2


In [None]:
# Number of rows (states) assigned to each cluster.
arrest_df['cluster'].value_counts()

2    18
0    18
1    14
Name: cluster, dtype: int64

### 9. Reverse the scaling transform

In [None]:
# Per-cluster mean of each feature, in the original (unscaled) units.
cluster_means = arrest_df.groupby('cluster').mean() 
cluster_means

Unnamed: 0_level_0,Murder,Assault
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6.672222,152.0
1,3.064286,77.071429
2,12.577778,262.388889


In [None]:
# Map the fitted centroids from standardized space back to the original units.
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
cluster_centers

array([[  6.67222222, 152.        ],
       [  3.06428571,  77.07142857],
       [ 12.57777778, 262.38888889]])

**10. plot the dataframe colored by cluster - include the cluster centers**

In [None]:
fig = go.Figure()
# Data points, colored by their assigned cluster.
fig.add_trace(go.Scatter(x=arrest_df.Murder, y=arrest_df.Assault, mode='markers',  marker_color=arrest_df.cluster))

# Cluster centers (in original units): column 0 is Murder, column 1 is Assault.
fig.add_trace(go.Scatter(x=cluster_centers[:,0], y=cluster_centers[:,1], mode='markers',  marker_color='magenta', marker_size=10))

# Axis titles must follow the data bound to each axis (x=Murder, y=Assault);
# the title follows the "Y by X" convention.
fig.update_layout(
    title="Assault by Murder",
    xaxis_title="Murder",
    yaxis_title="Assault",

    font=dict(
        family="Courier New, monospace",
        size=14,
        color="RebeccaPurple"
    ),
    width=800
)

fig.show()


#### 11. Find the best K using the Elbow Method

In [None]:
# Candidate numbers of clusters: K = 1..19.
k_rng = list(range(1, 20))

# Sum of squared errors (inertia) of a K-Means fit for each candidate K.
# random_state is fixed (same seed as the 3-cluster model) so the elbow curve
# is reproducible from run to run.
sse = [
    KMeans(n_clusters=k, random_state=1234).fit(scaled_df).inertia_
    for k in k_rng
]

In [None]:
# Elbow plot: inertia as a function of K, used to pick the best K.
fig = go.Figure(data=go.Scatter(x=k_rng, y=sse, mode='markers+lines'))

text_font = {
    "family": "Courier New, monospace",
    "size": 14,
    "color": "RebeccaPurple",
}
fig.update_layout(
    title="Inertia by K",
    xaxis_title="K",
    yaxis_title="Inertia",
    font=text_font,
    width=800,
)

fig.show()


# End.