In [1]:
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, model_selection as cross_validation
import pandas as pd
import matplotlib.pyplot as plt


'''
Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
survival Survival (0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare (British pound)
cabin Cabin
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination
'''



'\nPclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)\nsurvival Survival (0 = No; 1 = Yes)\nname Name\nsex Sex\nage Age\nsibsp Number of Siblings/Spouses Aboard\nparch Number of Parents/Children Aboard\nticket Ticket Number\nfare Passenger Fare (British pound)\ncabin Cabin\nembarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)\nboat Lifeboat\nbody Body Identification Number\nhome.dest Home/Destination\n'

In [2]:

# https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
df = pd.read_excel('titanic.xls')

pd.options.mode.chained_assignment = None  # default='warn'

original_df = pd.DataFrame.copy(df)
df.drop(['body','name'], 1, inplace=True)
df.fillna(0,inplace=True)

def handle_non_numerical_data(df):

    # handling non-numerical data: must convert.
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        #print(column,df[column].dtype)
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:

            column_contents = df[column].values.tolist()
            #finding just the uniques
            unique_elements = set(column_contents)
            # great, found them.
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    # creating dict that contains new
                    # id per unique string
                    text_digit_vals[unique] = x
                    x+=1
            # now we map the new "id" vlaue
            # to replace the string.
            df[column] = list(map(convert_to_int,df[column]))

    return df

df = handle_non_numerical_data(df)
df.drop(['ticket','home.dest'], 1, inplace=True)
print(df.head(5))


   pclass  survived  sex      age  sibsp  parch      fare  cabin  embarked  \
0       1         1    1  29.0000      0      0  211.3375     90         3   
1       1         1    0   0.9167      1      2  151.5500    154         3   
2       1         0    1   2.0000      1      2  151.5500    154         3   
3       1         0    0  30.0000      1      2  151.5500    154         3   
4       1         0    1  25.0000      1      2  151.5500    154         3   

   boat  
0     1  
1    11  
2     0  
3     0  
4     0  


In [3]:

X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

labels = clf.labels_
cluster_centers = clf.cluster_centers_
print(cluster_centers)




[[ 0.39732513 -0.21497097 -0.13892744 -0.22534232 -0.26332445 -0.36669896
  -0.40870685  0.16111848 -0.394854  ]
 [-1.54609786  0.30074929  1.954745    0.48128777  3.60016958  4.43513362
   1.75741229 -0.59661427  0.07179498]
 [-1.54609786  0.64883136  1.08748559 -0.47908676  0.32550889  9.26124543
  -0.1086563  -1.81687688  2.26273126]
 [ 0.84191642  0.30074929 -1.35790158  0.48128777  9.95686385  0.70136971
  -0.45382506  0.62364835 -0.62007963]
 [-1.54609786 -0.74349692  0.63252984 -0.47908676 -0.4449995   9.26124543
   3.01943555 -1.81687688  2.26273126]]


In [4]:


original_df['cluster_group']=np.nan

for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]

n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[ (original_df['cluster_group']==float(i)) ]
    #print(temp_df.head())

    survival_cluster = temp_df[  (temp_df['survived'] == 1) ]

    survival_rate = len(survival_cluster) / len(temp_df)
    #print(i,survival_rate)
    survival_rates[i] = survival_rate

print(survival_rates)
#print(original_df[ (original_df['cluster_group']==1) ])
#print(original_df[ (original_df['cluster_group']==0) ].describe())


{0: 0.3686746987951807, 1: 0.7142857142857143, 2: 1.0, 3: 0.1, 4: 1.0}
