# Pre-Processing

In [1]:
# import the library
import pandas as pd

In [2]:
# load the training dataset
df_train  = pd.read_csv('Framingham_Training')

# display few rows
df_train

Unnamed: 0,Index,Sex,Age,Educ,Death
0,1,1,39,4,0
1,2,1,52,4,0
2,3,2,46,2,0
3,5,2,58,2,0
4,7,1,54,1,0
...,...,...,...,...,...
7948,11327,1,40,3,0
7949,11329,1,52,3,0
7950,11330,2,39,3,0
7951,11331,2,46,3,0


In [3]:
# Standardization of Age column using z-score method
df_train['Age'] = (df_train['Age'] - df_train['Age'].mean()) / df_train['Age'].std() 
df_train

Unnamed: 0,Index,Sex,Age,Educ,Death
0,1,1,-1.654082,4,0
1,2,1,-0.283734,4,0
2,3,2,-0.916202,2,0
3,5,2,0.348733,2,0
4,7,1,-0.072912,1,0
...,...,...,...,...,...
7948,11327,1,-1.548670,3,0
7949,11329,1,-0.283734,3,0
7950,11330,2,-1.654082,3,0
7951,11331,2,-0.916202,3,0


In [4]:
# Get Sex and Age column for clustering
X_train = df_train[['Sex','Age']].values
X_train

array([[ 1.        , -1.65408155],
       [ 1.        , -0.28373444],
       [ 2.        , -0.91620234],
       ...,
       [ 2.        , -1.65408155],
       [ 2.        , -0.91620234],
       [ 2.        , -0.49455708]])

# Q.1

In [5]:
# build K-Means model
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 2, random_state = 42)
y_kmeans_train = kmeans.fit_predict(X_train) # fit and predict for X_train
y_kmeans_train

array([0, 0, 0, ..., 0, 0, 0])

# Q.2

In [6]:
cluster1 = X_train[y_kmeans_train == 0] # get data for cluster 1
cluster2 = X_train[y_kmeans_train == 1] # get data for cluster 2

In [7]:
cluster1 # display cluster1 data for training data

array([[ 1.        , -1.65408155],
       [ 1.        , -0.28373444],
       [ 2.        , -0.91620234],
       ...,
       [ 2.        , -1.65408155],
       [ 2.        , -0.91620234],
       [ 2.        , -0.49455708]])

In [8]:
cluster2 # display cluster2 data for training data

array([[2.        , 0.34873345],
       [2.        , 0.6649674 ],
       [2.        , 1.29743529],
       ...,
       [1.        , 0.87579003],
       [2.        , 0.55955608],
       [2.        , 0.13791082]])

In [9]:
# summary statistics for cluster1
cluster1_train_summary = pd.DataFrame(cluster1 ,columns=['Sex','Age'])
cluster1_train_summary.describe()

Unnamed: 0,Sex,Age
count,4336.0,4336.0
mean,1.558118,-0.756019
std,0.496668,0.552146
min,1.0,-2.391961
25%,1.0,-1.127025
50%,2.0,-0.70538
75%,2.0,-0.283734
max,2.0,0.0325


In [10]:
# summary statistics for cluster2
cluster2_train_summary = pd.DataFrame(cluster2 ,columns=['Sex','Age'])
cluster2_train_summary.describe()

Unnamed: 0,Sex,Age
count,3617.0,3617.0
mean,1.579486,0.906303
std,0.49371,0.571531
min,1.0,0.137911
25%,1.0,0.454145
50%,2.0,0.770379
75%,2.0,1.297435
max,2.0,2.773194


-  Cluster1 consists of 4336 combinations of Sex and Age ,Although cluster2 consists of 3617 combinations of Sex and Age

# Q.3

In [11]:
# load test data
df_test  = pd.read_csv('Framingham_Test')
df_test

Unnamed: 0,Index,Sex,Age,Educ,Death
0,4,2,52,2,0
1,6,1,48,1,0
2,10,2,46,3,0
3,14,2,49,2,0
4,16,2,63,1,0
...,...,...,...,...,...
2252,11307,2,56,1,0
2253,11313,1,68,1,1
2254,11323,2,50,1,1
2255,11326,2,58,2,0


In [12]:
# Standardization of Age column using z-score method
df_test['Age'] = (df_test['Age'] - df_test['Age'].mean()) / df_test['Age'].std() 
df_test

Unnamed: 0,Index,Sex,Age,Educ,Death
0,4,2,-0.273782,2,0
1,6,1,-0.686008,1,0
2,10,2,-0.892121,3,0
3,14,2,-0.582951,2,0
4,16,2,0.859838,1,0
...,...,...,...,...,...
2252,11307,2,0.138444,1,0
2253,11313,1,1.375121,1,1
2254,11323,2,-0.479895,1,1
2255,11326,2,0.344556,2,0


In [13]:
# Get Sex and Age column for clustering
X_test = df_test[['Sex','Age']].values
X_test

array([[ 2.        , -0.27378213],
       [ 1.        , -0.68600779],
       [ 2.        , -0.89212062],
       ...,
       [ 2.        , -0.47989496],
       [ 2.        ,  0.34455636],
       [ 1.        , -0.89212062]])

In [14]:
y_kmeans_test = kmeans.predict(X_test) # predict using X_test
y_kmeans_test

array([0, 0, 0, ..., 0, 1, 0])

# Q.4

In [15]:
# get data for cluster 1
cluster1 = X_test[y_kmeans_test == 0]

# get data for cluster 2
cluster2 = X_test[y_kmeans_test == 1]

In [16]:
cluster1 # display cluster1 for test data

array([[ 2.        , -0.27378213],
       [ 1.        , -0.68600779],
       [ 2.        , -0.89212062],
       ...,
       [ 1.        , -0.0676693 ],
       [ 2.        , -0.47989496],
       [ 1.        , -0.89212062]])

In [17]:
cluster2 # display cluster2 for test data

array([[2.        , 0.85983844],
       [2.        , 0.13844353],
       [2.        , 0.96289485],
       ...,
       [2.        , 0.13844353],
       [1.        , 1.37512051],
       [2.        , 0.34455636]])

In [18]:
# summary statistics for cluster1
cluster1_test_summary = pd.DataFrame(cluster1 ,columns=['Sex','Age'])
cluster1_test_summary.describe()

Unnamed: 0,Sex,Age
count,1245.0,1245.0
mean,1.559036,-0.751484
std,0.496702,0.537188
min,1.0,-2.231854
25%,1.0,-1.098233
50%,2.0,-0.686008
75%,2.0,-0.273782
max,2.0,0.035387


In [19]:
# summary statistics for cluster2
cluster2_test_summary = pd.DataFrame(cluster2 ,columns=['Sex','Age'])
cluster2_test_summary.describe()

Unnamed: 0,Sex,Age
count,1012.0,1012.0
mean,1.591897,0.924503
std,0.491725,0.57043
min,1.0,0.138444
25%,1.0,0.447613
50%,2.0,0.859838
75%,2.0,1.272064
max,2.0,2.714854


# Are your clusters validated?

- Yes, because we have predicted the clustering of test data using the same K-Means Model that was earlier trained using Training set.