In [1]:
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

'''
pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
survived Survival (0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare (British pound)
cabin Cabin
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination
'''

# Load the raw passenger table from the Excel workbook.
df = pd.read_excel('titanic.xls')

# Keep an untouched copy of the raw table; `df` itself is cleaned and encoded
# below, so the readable columns would otherwise be lost.
original_df = df.copy()

# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and rejects from 2.0 onwards.
# Missing values are then replaced with 0 in every remaining column.
df = df.drop(columns=['body', 'name']).fillna(0)

def handle_non_numerical_data(df):
    """Replace the values of every non-numeric column with integer ids.

    A column is left untouched when its dtype is ``int64`` or ``float64``.
    Any other column is rewritten so that each distinct value maps to an
    integer id; ids are handed out in order of first appearance in the
    column (0 for the first distinct value, 1 for the next, ...), so the
    encoding is identical on every run. (Iterating a ``set`` of strings, as
    the previous implementation did, yields an order that changes between
    interpreter sessions because of string hash randomisation.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with non-numeric columns converted.
    """
    for column in df.columns.values:
        dtype = df[column].dtype
        if dtype == np.int64 or dtype == np.float64:
            # Already numeric: nothing to convert.
            continue

        column_contents = df[column].values.tolist()

        # dict.fromkeys keeps only the first occurrence of each value and
        # preserves insertion order, giving a stable value -> id mapping.
        text_digit_vals = {
            value: new_id
            for new_id, value in enumerate(dict.fromkeys(column_contents))
        }

        # Map each original value to its new id.
        df[column] = [text_digit_vals[value] for value in column_contents]

    return df

# Encode every non-numeric column as integer ids.
df = handle_non_numerical_data(df)
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and rejects from 2.0 onwards.
df = df.drop(columns=['ticket', 'home.dest'])

# Feature matrix: every column except the target, cast to float and scaled.
X = np.array(df.drop(columns=['survived']).astype(float))
X = preprocessing.scale(X)
# Survival flags; kept for reference only -- the clustering below never sees them.
y = np.array(df['survived'])

# Cluster the passengers with MeanShift using its default settings.
clf = MeanShift()
clf.fit(X)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, max_iter=300,
          min_bin_freq=1, n_jobs=None, seeds=None)

In [2]:
# Pull the fitted results off the estimator: the cluster centres and the
# cluster id assigned to each row of X.
cluster_centers = clf.cluster_centers_
labels = clf.labels_

In [3]:
# Attach each passenger's cluster id to the untouched copy of the data.
# Assigning the whole column at once replaces the per-row chained assignment
# (`original_df['cluster_group'].iloc[i] = ...`), which raises
# SettingWithCopyWarning and, with pandas Copy-on-Write enabled, only updates
# a temporary object. The float cast keeps the dtype the column had when it
# was first seeded with NaN.
original_df['cluster_group'] = np.asarray(labels, dtype=float)

n_clusters_ = len(np.unique(labels))

# For every cluster id, the fraction of its passengers with survived == 1.
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[original_df['cluster_group'] == float(i)]
    survival_cluster = temp_df[temp_df['survived'] == 1]
    survival_rates[i] = len(survival_cluster) / len(temp_df)

print(survival_rates)

{0: 0.3680327868852459, 1: 0.9583333333333334, 2: 0.5365853658536586, 3: 1.0, 4: 0.1, 5: 0.5, 6: 0.0}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [7]:
# Summary statistics for the passengers assigned to cluster 3.
print(
    original_df.loc[original_df['cluster_group'] == 3].describe()
)

       pclass  survived        age  sibsp    parch      fare  body  \
count     4.0       4.0   4.000000    4.0  4.00000    4.0000   0.0   
mean      1.0       1.0  41.000000    0.0  0.50000  512.3292   NaN   
std       0.0       0.0  11.343133    0.0  0.57735    0.0000   NaN   
min       1.0       1.0  35.000000    0.0  0.00000  512.3292   NaN   
25%       1.0       1.0  35.000000    0.0  0.00000  512.3292   NaN   
50%       1.0       1.0  35.500000    0.0  0.50000  512.3292   NaN   
75%       1.0       1.0  41.500000    0.0  1.00000  512.3292   NaN   
max       1.0       1.0  58.000000    0.0  1.00000  512.3292   NaN   

       cluster_group  
count            4.0  
mean             3.0  
std              0.0  
min              3.0  
25%              3.0  
50%              3.0  
75%              3.0  
max              3.0  
