In [1]:
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
from sklearn.cluster import MeanShift
from sklearn import preprocessing, cross_validation
import pandas as pd
style.use('ggplot')



In [2]:
df = pd.read_excel('http://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls')
# we use the pandas inbuilt method to copy the dataframe because
# original_df = df stores a 'pointer' to df in original_df. ie, modifying df modifies original_df too
original_df = pd.DataFrame.copy(df)
df.drop(['body', 'name'], axis=1, inplace=True)
df.convert_objects(convert_numeric=True)
df.head()

  


Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,female,29.0,0,0,24160,211.3375,B5,S,2.0,"St Louis, MO"
1,1,1,male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,"Montreal, PQ / Chesterville, ON"
2,1,0,female,2.0,1,2,113781,151.55,C22 C26,S,,"Montreal, PQ / Chesterville, ON"
3,1,0,male,30.0,1,2,113781,151.55,C22 C26,S,,"Montreal, PQ / Chesterville, ON"
4,1,0,female,25.0,1,2,113781,151.55,C22 C26,S,,"Montreal, PQ / Chesterville, ON"


In [3]:
original_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [7]:
df.fillna(0, inplace=True)

def handle_non_numerical_data(df):
    columns = df.columns.values
    
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    # creating dict that contains new id per unique string
                    text_digit_vals[unique] = x
                    x += 1
            
            # now we map to the new 'id' value to replace the string
            df[column] = list(map(convert_to_int, df[column]))
            
    return df

In [8]:
df = handle_non_numerical_data(df)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,0,29.0,0,0,744,211.3375,124,0,1,145
1,1,1,1,0.9167,1,2,509,151.55,24,0,6,297
2,1,0,0,2.0,1,2,509,151.55,24,0,0,297
3,1,0,1,30.0,1,2,509,151.55,24,0,0,297
4,1,0,0,25.0,1,2,509,151.55,24,0,0,297


In [9]:
# Add or remove features just to see the impact they have
df.drop(['ticket', 'home.dest'], axis=1, inplace=True)

In [11]:
X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [14]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_
original_df['cluster_group'] = np.nan
for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]
    
n_clusters_ = len(np.unique(labels))

survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[(original_df['cluster_group'] == float(i))]
    survival_cluster = temp_df[(temp_df['survived'] == 1)]
    survival_rate = len(survival_cluster)/len(temp_df)
    survival_rates[i] = survival_rate
    
print(survival_rates)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


{0: 0.36901865369018655, 1: 1.0, 2: 0.7291666666666666, 3: 0.1, 4: 0.0, 5: 1.0, 6: 1.0}


In [16]:
original_df[(original_df['cluster_group'] == 0)]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,cluster_group
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON",0.0
5,1,1,"Anderson, Mr. Harry",male,48.0000,0,0,19952,26.5500,E12,S,3,,"New York, NY",0.0
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1,0,13502,77.9583,D7,S,10,,"Hudson, NY",0.0
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0000,0,0,112050,0.0000,A36,S,,,"Belfast, NI",0.0
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY",0.0
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0000,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay",0.0
10,1,0,"Astor, Col. John Jacob",male,47.0000,1,0,PC 17757,227.5250,C62 C64,C,,124.0,"New York, NY",0.0
12,1,1,"Aubart, Mme. Leontine Pauline",female,24.0000,0,0,PC 17477,69.3000,B35,C,9,,"Paris, France",0.0
13,1,1,"Barber, Miss. Ellen ""Nellie""",female,26.0000,0,0,19877,78.8500,,S,6,,,0.0
14,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0000,0,0,27042,30.0000,A23,S,B,,"Hessle, Yorks",0.0


In [18]:
original_df[(original_df['cluster_group'] == 0)].describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,1233.0,1233.0,981.0,1233.0,1233.0,1232.0,115.0,1233.0
mean,2.343877,0.369019,29.438498,0.429846,0.288727,24.586397,161.469565,0.0
std,0.810942,0.482735,14.258364,0.835583,0.640372,27.77803,98.326642,0.0
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0,0.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,71.0,0.0
50%,3.0,0.0,28.0,0.0,0.0,13.0,165.0,0.0
75%,3.0,1.0,38.0,1.0,0.0,27.7208,257.0,0.0
max,3.0,1.0,80.0,5.0,4.0,227.525,328.0,0.0


In [19]:
original_df[(original_df['cluster_group'] == 2)]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,cluster_group
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO",2.0
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",2.0
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",2.0
11,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18.0,1,0,PC 17757,227.525,C62 C64,C,4.0,,"New York, NY",2.0
17,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50.0,0,1,PC 17558,247.5208,B58 B60,C,6.0,,"Montreal, PQ",2.0
23,1,1,"Bidois, Miss. Rosalie",female,42.0,0,0,PC 17757,227.525,,C,4.0,,,2.0
24,1,1,"Bird, Miss. Ellen",female,29.0,0,0,PC 17483,221.7792,C97,S,8.0,,,2.0
35,1,1,"Bowen, Miss. Grace Scott",female,45.0,0,0,PC 17608,262.375,,C,4.0,,"Cooperstown, NY",2.0
54,1,1,"Carter, Master. William Thornton II",male,11.0,1,2,113760,120.0,B96 B98,S,4.0,,"Bryn Mawr, PA",2.0
55,1,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0,B96 B98,S,4.0,,"Bryn Mawr, PA",2.0


In [20]:
original_df[(original_df['cluster_group'] == 2)].describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,48.0,48.0,39.0,48.0,48.0,48.0,1.0,48.0
mean,1.375,0.729167,30.5,2.229167,1.208333,168.819358,67.0,2.0
std,0.788886,0.449093,13.810275,2.926326,0.874176,72.062133,,0.0
min,1.0,0.0,2.0,0.0,0.0,52.0,67.0,2.0
25%,1.0,0.0,20.0,0.0,0.0,117.720825,67.0,2.0
50%,1.0,1.0,29.0,1.0,1.5,159.1646,67.0,2.0
75%,1.0,1.0,40.0,3.0,2.0,227.525,67.0,2.0
max,3.0,1.0,63.0,8.0,2.0,263.0,67.0,2.0


In [21]:
cluster_0 = original_df[(original_df['cluster_group'] == 0)]
cluster_0_fc = cluster_0[(cluster_0['pclass'] == 1)]
cluster_0_fc.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,266.0,266.0,228.0,266.0,266.0,266.0,32.0,266.0
mean,1.0,0.586466,39.817617,0.368421,0.180451,60.369721,167.0625,0.0
std,0.0,0.493395,14.113709,0.520866,0.448639,38.61086,85.153727,0.0
min,1.0,0.0,0.9167,0.0,0.0,0.0,16.0,0.0
25%,1.0,0.0,29.75,0.0,0.0,29.7,109.75,0.0
50%,1.0,1.0,39.0,0.0,0.0,52.8271,170.5,0.0
75%,1.0,1.0,50.0,1.0,0.0,79.2,236.75,0.0
max,1.0,1.0,80.0,2.0,2.0,227.525,307.0,0.0


In [24]:
cluster_2 = original_df[(original_df['cluster_group'] == 2)]
cluster_2_fc = cluster_2[(cluster_2['pclass'] == 1)]
cluster_2_fc.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,39.0,39.0,38.0,39.0,39.0,39.0,0.0,39.0
mean,1.0,0.897436,30.921053,0.897436,1.025641,191.727672,,2.0
std,0.0,0.307355,13.739628,0.940181,0.873203,59.584571,,0.0
min,1.0,0.0,2.0,0.0,0.0,52.0,,2.0
25%,1.0,1.0,21.5,0.0,0.0,135.6396,,2.0
50%,1.0,1.0,30.0,1.0,1.0,211.3375,,2.0
75%,1.0,1.0,40.0,1.0,2.0,247.5208,,2.0
max,1.0,1.0,63.0,3.0,2.0,263.0,,2.0


In [25]:
cluster_2 = original_df[(original_df['cluster_group'] == 2)]
cluster_2_fc = cluster_2[(cluster_2['pclass'] == 3)]
cluster_2_fc.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,9.0,9.0,1.0,9.0,9.0,9.0,1.0,9.0
mean,3.0,0.0,14.5,8.0,2.0,69.55,67.0,2.0
std,0.0,0.0,,0.0,0.0,0.0,,0.0
min,3.0,0.0,14.5,8.0,2.0,69.55,67.0,2.0
25%,3.0,0.0,14.5,8.0,2.0,69.55,67.0,2.0
50%,3.0,0.0,14.5,8.0,2.0,69.55,67.0,2.0
75%,3.0,0.0,14.5,8.0,2.0,69.55,67.0,2.0
max,3.0,0.0,14.5,8.0,2.0,69.55,67.0,2.0
