In [4]:
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, model_selection
import pandas as pd
import matplotlib.pyplot as plt


'''
Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
survival Survival (0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare (British pound)
cabin Cabin
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination
'''


# https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
df = pd.read_csv('datasets/titanic.csv')

original_df = pd.DataFrame.copy(df)
df.drop(['body','name'], 1, inplace=True)
df.fillna(0,inplace=True)

def handle_non_numerical_data(df):
    
    # handling non-numerical data: must convert.
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        #print(column,df[column].dtype)
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            
            column_contents = df[column].values.tolist()
            #finding just the uniques
            unique_elements = set(column_contents)
            # great, found them. 
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    # creating dict that contains new
                    # id per unique string
                    text_digit_vals[unique] = x
                    x+=1
            # now we map the new "id" vlaue
            # to replace the string. 
            df[column] = list(map(convert_to_int,df[column]))

    return df

df = handle_non_numerical_data(df)
df.drop(['ticket','home.dest'], 1, inplace=True)

X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, max_iter=300,
          min_bin_freq=1, n_jobs=None, seeds=None)

In [6]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_
original_df['cluster_group']=np.nan
for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]
n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[ (original_df['cluster_group']==float(i)) ]
    #print(temp_df.head())

    survival_cluster = temp_df[  (temp_df['survived'] == 1) ]

    survival_rate = len(survival_cluster) / len(temp_df)
    #print(i,survival_rate)
    survival_rates[i] = survival_rate
    
print(survival_rates)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


{0: 0.3645320197044335, 1: 0.6296296296296297, 2: 1.0, 3: 0.6666666666666666, 4: 0.1, 5: 0.0}


In [8]:
print(original_df[(original_df['cluster_group']==1)])

     pclass  survived                                               name  \
3       1.0       0.0               Allison, Mr. Hudson Joshua Creighton   
16      1.0       0.0                           Baxter, Mr. Quigg Edmond   
17      1.0       1.0    Baxter, Mrs. James (Helene DeLaudeniere Chaput)   
23      1.0       1.0                              Bidois, Miss. Rosalie   
57      1.0       1.0          Carter, Mrs. William Ernest (Lucile Polk)   
78      1.0       1.0  Compton, Mrs. Alexander Taylor (Mary Eliza Ing...   
97      1.0       1.0  Douglas, Mrs. Frederick Charles (Mary Helene B...   
103     1.0       1.0                      Endres, Miss. Caroline Louise   
115     1.0       0.0                                  Fortune, Mr. Mark   
116     1.0       1.0                Fortune, Mrs. Mark (Mary McDougald)   
138     1.0       0.0                          Graham, Mr. George Edward   
139     1.0       1.0      Graham, Mrs. William Thompson (Edith Junkins)   
215     1.0 

In [25]:
print(original_df[(original_df['cluster_group']==4)])

      pclass  survived                                               name  \
629      3.0       0.0                        Andersson, Mr. Anders Johan   
632      3.0       0.0  Andersson, Mrs. Anders Johan (Alfrida Konstant...   
644      3.0       0.0         Asplund, Mr. Carl Oscar Vilhelm Gustafsson   
646      3.0       1.0  Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...   
831      3.0       0.0                     Goodwin, Mr. Charles Frederick   
832      3.0       0.0            Goodwin, Mrs. Frederick (Augusta Tyler)   
1106     3.0       0.0             Panula, Mrs. Juha (Maria Emilia Ojala)   
1146     3.0       0.0               Rice, Mrs. William (Margaret Norton)   
1179     3.0       0.0                              Sage, Mr. John George   
1180     3.0       0.0                     Sage, Mrs. John (Annie Bullen)   

         sex   age  sibsp  parch    ticket     fare cabin embarked boat  \
629     male  39.0    1.0    5.0    347082  31.2750   NaN        S  NaN   
63