# clustering-tds

Use the "Run" button to execute the code.

In [None]:
!pip install jovian --upgrade --quiet

In [None]:
import jovian

In [None]:
# Execute this to save new versions of the notebook
jovian.commit(project="clustering-tds")

**Demo on k-Means clustering and cluster analysis**

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans

In [2]:
stockData = pd.read_csv('TDS_Week_5_Dataset_2 - Sheet1.csv')    

In [3]:
# All % columns were on a scale where 1 meant 100%, except for Stock Return %. We multiply all % columns (except Returns) by 100
stockData['Free Float Market Cap %'] = stockData['Free Float Market Cap %']*100
stockData['RoE %'] = stockData['RoE %']*100
stockData['RoCE %'] = stockData['RoCE %']*100
stockData['EBIT Margin %'] = stockData['EBIT Margin %']*100
stockData['PAT %'] = stockData['PAT %']*100
stockData.describe()

Unnamed: 0,record number,Price,Market Cap,Free Float Market Cap %,6m ADV,RoE %,RoCE %,EBIT Margin %,EPS,PAT %,Stock Return %
count,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0
mean,612.0,0.010163,0.015214,38.659175,0.027942,52.906076,56.74158,53.847632,0.022321,51.524376,0.145038
std,353.771348,0.038309,0.060153,16.673399,0.090669,12.632005,7.923857,8.536269,0.051913,8.710919,0.075296
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,306.0,0.001167,0.000601,24.450608,0.00055,50.696507,52.707946,51.687277,0.003966,50.327121,0.10257
50%,612.0,0.003291,0.001707,36.398549,0.002338,54.490617,55.832344,53.847632,0.009215,51.967712,0.132243
75%,918.0,0.00868,0.006294,47.973117,0.010388,58.547111,59.838861,56.672358,0.020459,54.324236,0.171613
max,1224.0,1.0,1.0,100.0,1.0,100.0,100.0,100.0,1.0,100.0,1.0


In [5]:
features = stockData.columns[2:]
features

Index(['Market Cap', 'Free Float Market Cap %', '6m ADV', 'RoE %', 'RoCE %',
       'EBIT Margin %', 'EPS', 'PAT %', 'Stock Return %'],
      dtype='object')

In [6]:
# Good idea to normalize the features before k-Means
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
stockDataFeatures_scaled = scaler.fit_transform(stockData[features])
stockDataFeatures_scaled = pd.DataFrame(stockDataFeatures_scaled, columns=features)
stockDataFeatures_scaled.describe()

Unnamed: 0,Market Cap,Free Float Market Cap %,6m ADV,RoE %,RoCE %,EBIT Margin %,EPS,PAT %,Stock Return %
count,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0
mean,0.015214,0.386592,0.027942,0.529061,0.567416,0.538476,0.022321,0.515244,0.145038
std,0.060153,0.166734,0.090669,0.12632,0.079239,0.085363,0.051913,0.087109,0.075296
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.000601,0.244506,0.00055,0.506965,0.527079,0.516873,0.003966,0.503271,0.10257
50%,0.001707,0.363985,0.002338,0.544906,0.558323,0.538476,0.009215,0.519677,0.132243
75%,0.006294,0.479731,0.010388,0.585471,0.598389,0.566724,0.020459,0.543242,0.171613
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
kmeans = KMeans(7)

TypeError: __init__() got an unexpected keyword argument 'seed'

In [9]:
clus = kmeans.fit_predict(stockDataFeatures_scaled)

In [10]:
stockData['cluster'] = clus
stockData['cluster'].value_counts()

5    389
2    360
4    212
1    155
3     62
0     28
6     19
Name: cluster, dtype: int64

In [11]:
clusterDesc = pd.DataFrame(stockData.iloc[:,2:].groupby('cluster').mean().round(3))
clusterDesc.insert(0,'size',stockData['cluster'].value_counts())

In [13]:
clusterDesc

Unnamed: 0_level_0,size,Market Cap,Free Float Market Cap %,6m ADV,RoE %,RoCE %,EBIT Margin %,EPS,PAT %,Stock Return %
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,28,0.303,48.722,0.503,60.882,63.02,57.662,0.011,55.212,0.166
1,155,0.008,67.009,0.026,53.874,56.317,54.539,0.036,52.429,0.129
2,360,0.006,24.0,0.008,52.679,54.865,53.035,0.02,51.288,0.149
3,62,0.003,40.55,0.012,13.253,46.03,45.871,0.05,38.27,0.121
4,212,0.019,29.475,0.023,63.223,65.256,60.229,0.006,57.685,0.142
5,389,0.006,44.397,0.019,54.1,56.077,53.804,0.025,51.951,0.153
6,19,0.001,49.133,0.003,27.401,40.063,13.712,0.015,8.962,0.115
