# Clustering using the Mean Shift algorithm

In [3]:
import pandas as pd

# Load the passenger table and discard the identifier / free-text columns
# (PassengerId, Name, Ticket, Cabin) before clustering.
# The columns are dropped via the `columns=` keyword: passing the axis as a
# second positional argument to DataFrame.drop is rejected by pandas >= 2.0.
# Chaining (instead of inplace=True) keeps this cell safe to re-run.
titanic_data = (
    pd.read_csv('data/titanic.csv', quotechar='"')
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
)
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


Convert the gender values to numbers:

In [5]:
from sklearn import preprocessing

# Replace the textual Sex column with integer codes produced by a LabelEncoder.
# Values are cast to str first so every entry is encoded as text.
le = preprocessing.LabelEncoder()
sex_as_text = titanic_data['Sex'].astype(str)
titanic_data['Sex'] = le.fit_transform(sex_as_text)
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.925,S
3,1,1,0,35.0,1,0,53.1,S
4,0,3,1,35.0,0,0,8.05,S


In [6]:
# One-hot encode the port of embarkation into Embarked_* indicator columns,
# then discard every row that still contains a missing value.
titanic_data = pd.get_dummies(titanic_data, columns=['Embarked']).dropna()

## Use Mean Shift in order to find clusters in the data

* The bandwidth parameter sets the "radius" of the window (kernel) that Mean Shift moves toward denser regions of the data when looking for cluster centers
* Higher bandwidths will produce fewer clusters

In [8]:
from sklearn.cluster import MeanShift, estimate_bandwidth

# Cluster every remaining column of the prepared frame with a fixed bandwidth.
analyzer = MeanShift(bandwidth=30).fit(titanic_data)

# For comparison: the bandwidth MeanShift would have estimated by default
# if none had been passed explicitly.
estimate_bandwidth(titanic_data)

30.44675914497196

How many clusters do we have?

In [10]:
import numpy as np

# One cluster label per fitted row; the distinct values are the clusters found.
labels = analyzer.labels_
np.unique(labels)

array([0, 1, 2, 3, 4], dtype=int64)

In [12]:
import numpy as np

# Attach each row's cluster label in a single vectorized assignment instead of
# writing one cell at a time through .iloc in a Python loop.
# A plain NumPy array is assigned positionally, which matters because the
# frame's index is no longer contiguous after dropna(). The float dtype matches
# what the original NaN-initialised column produced (0.0, 1.0, ...).
titanic_data['cluster_group'] = np.asarray(labels, dtype=float)
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
0,0,3,1,22.0,1,0,7.25,0,0,1,0.0
1,1,1,0,38.0,1,0,71.2833,1,0,0,1.0
2,1,3,0,26.0,0,0,7.925,0,0,1,0.0
3,1,1,0,35.0,1,0,53.1,0,0,1,1.0
4,0,3,1,35.0,0,0,8.05,0,0,1,0.0


Examine average data for each cluster

In [14]:
# Summarise each cluster: the mean of every column plus the number of
# passengers that fell into the cluster.
by_cluster = titanic_data.groupby(['cluster_group'])
titanic_cluster_data = by_cluster.mean()
titanic_cluster_data['Counts'] = by_cluster.size()
titanic_cluster_data

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Counts
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,0.336918,2.52509,0.679211,28.25672,0.439068,0.370968,15.434139,0.121864,0.046595,0.831541,558
1.0,0.611111,1.296296,0.527778,36.148148,0.814815,0.5,65.622688,0.333333,0.018519,0.62963,108
2.0,0.733333,1.0,0.366667,32.430667,0.6,0.866667,131.183883,0.5,0.0,0.5,30
3.0,0.733333,1.0,0.266667,30.333333,1.0,1.333333,239.99194,0.533333,0.0,0.466667,15
4.0,1.0,1.0,0.666667,35.333333,0.0,0.333333,512.3292,1.0,0.0,0.0,3
