In [1]:
import string
import numpy as np
import pandas as pd
%config IPCompleter.greedy=True

In [2]:
def substrings_in_string(big_string, substrings):
	for substring in substrings:
		if string.find(big_string, substring) != -1:
			return substring

def replace_titles(x):
	title = x['Title']
	if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
		return 'Mr'
	elif title in ['Countess', 'Mme']:
		return 'Mrs'
	elif title in ['Mlle', 'Ms']:
		return 'Miss'
	elif title =='Dr':
		if x['Sex']=='Male':
			return 'Mr'
		else:
			return 'Mrs'
	else:
		return title


In [3]:
train_set = pd.read_csv("datasets/train.csv")


In [4]:
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

train_set['Title'] = train_set["Name"].map(lambda x: substrings_in_string(x, title_list))
train_set['Title'] = train_set.apply(replace_titles, axis =  1)

# Turn cabin into deck

train_set.Cabin = train_set.Cabin.fillna('Unknown')    
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
train_set['Deck']=train_set['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

train_set[['Sex','Embarked','Title','Deck']].info()
#Embarked has 2 null

train_set.loc[pd.isnull(train_set["Embarked"])]
train_set.drop(['Name', 'Ticket','Cabin'], axis =1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
Sex         891 non-null object
Embarked    891 non-null object
Title       891 non-null object
Deck        891 non-null object
dtypes: object(4)
memory usage: 27.9+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Deck
0,1,0,3,male,22.0,1,0,7.2500,S,Mr,Unknown
1,2,1,1,female,38.0,1,0,71.2833,C,Mrs,C
2,3,1,3,female,26.0,0,0,7.9250,S,Miss,Unknown
3,4,1,1,female,35.0,1,0,53.1000,S,Mrs,C
4,5,0,3,male,35.0,0,0,8.0500,S,Mr,Unknown
5,6,0,3,male,,0,0,8.4583,Q,Mr,Unknown
6,7,0,1,male,54.0,0,0,51.8625,S,Mr,E
7,8,0,3,male,2.0,3,1,21.0750,S,Master,Unknown
8,9,1,3,female,27.0,0,2,11.1333,S,Mrs,Unknown
9,10,1,2,female,14.0,1,0,30.0708,C,Mrs,Unknown


In [5]:
from sklearn.preprocessing import LabelEncoder

le_sex = LabelEncoder()
sex_numerical = le_sex.fit_transform(train_set['Sex'])
sex_numerical_classes = le_sex.classes_

le_title = LabelEncoder()
title_numerical = le_title.fit_transform(train_set['Title'])
title_numerical_classes = le_title.classes_

le_deck = LabelEncoder()
deck_numerical = le_deck.fit_transform(train_set['Deck'])
deck_numerical_classes = le_deck.classes_

print('Classes of Sex feature:\n{}\n{}'.format(
        np.arange(len(sex_numerical_classes)), sex_numerical_classes))
print('')
print('Classes of Title feature:\n{}\n{}'.format(
        np.arange(len(title_numerical_classes)), title_numerical_classes))
print('')
print('Classes of Cabin_cat feature:\n{}\n{}'.format(
        np.arange(len(deck_numerical_classes)), deck_numerical_classes))

Classes of Sex feature:
[0 1]
['female' 'male']

Classes of Title feature:
[0 1 2 3]
['Master' 'Miss' 'Mr' 'Mrs']

Classes of Cabin_cat feature:
[0 1 2 3 4 5 6 7 8]
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'T' 'Unknown']


In [6]:
from sklearn.preprocessing import OneHotEncoder

# Sex feature
enc_sex = OneHotEncoder(sparse=False)
sex_onehot = enc_sex.fit_transform(sex_numerical.reshape(-1,1))

# Title feature
enc_title = OneHotEncoder(sparse=False)
title_onehot = enc_title.fit_transform(title_numerical.reshape(-1,1))

# Cabin_cat feature
enc_deck = OneHotEncoder(sparse=False)
deck_onehot = enc_deck.fit_transform(deck_numerical.reshape(-1,1))


In [7]:
def pdAssignWithOHLabel(df, column, onehot_labeled, class_labels):
    to_assign = {}
    for c_idx, label in enumerate(class_labels):
        to_assign[column+'_'+label] = onehot_labeled[:,c_idx]
    df = df.assign(**to_assign)
    return df

In [8]:
train_set = pdAssignWithOHLabel(train_set, 'Sex', 
                                 sex_onehot, sex_numerical_classes)
train_set = train_set.drop("Sex",axis=1)

train_set = pdAssignWithOHLabel(train_set, 'Title', 
                                 title_onehot, title_numerical_classes)
train_set = train_set.drop("Title",axis=1)
train_set = pdAssignWithOHLabel(train_set, 'Deck', 
                                 deck_onehot, deck_numerical_classes)
train_set = train_set.drop("Deck",axis=1)

train_set = train_set.drop(["Name","Ticket","Cabin"],axis=1)
train_set.head(5)


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male,...,Title_Mrs,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown
0,1,0,3,22.0,1,0,7.25,S,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,1,1,38.0,1,0,71.2833,C,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,3,26.0,0,0,7.925,S,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,1,1,35.0,1,0,53.1,S,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,3,35.0,0,0,8.05,S,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
mu = train_set['Fare'].mean()
sd = train_set['Fare'].std()

row_mask = train_set['Fare']>mu+5*sd
train_set.set_value(row_mask, 'Fare', mu+5*sd);

  """


In [10]:
from sklearn.preprocessing import StandardScaler
sc_tmp = StandardScaler()
tmp_scaled = train_set.copy().drop(['Embarked','Age','Survived'], axis=1) # create a copy of the data
tmp_scaled = pd.DataFrame(sc_tmp.fit_transform(tmp_scaled),columns=tmp_scaled.columns, index=tmp_scaled.index)

# Add the non-scaled features to this temporary DataFrame
tmp_scaled = tmp_scaled.assign(Survived=train_set['Survived'])
tmp_scaled = tmp_scaled.assign(Embarked=train_set['Embarked'])
tmp_scaled = tmp_scaled.assign(Age=train_set['Age'])

In [11]:
from sklearn.neighbors import KDTree
tmp = tmp_scaled.copy().drop(['Survived','Age','Embarked'], axis=1).values
row_idx = pd.isnull(train_set['Embarked'])
tree = KDTree(tmp)
dist, ind = tree.query(tmp[[62, 830]], k=6) 
# The k nearest neighbors include the passenger itself, 
# so we specify k=6 to get the 5 nearest neighbors
for i in ind:
    print('5 closest neigbors to passenger {} and their values for Embarked:\n{}\n'\
          .format(i[0], train_set['Embarked'].loc[i[1:]]))

5 closest neigbors to passenger 62 and their values for Embarked:
224    S
137    S
336    S
110    S
55     S
Name: Embarked, dtype: object

5 closest neigbors to passenger 830 and their values for Embarked:
617    S
797    S
578    C
559    S
874    C
Name: Embarked, dtype: object



In [12]:
#Label Encoder
le_embarked = LabelEncoder()
embarked_numerical = le_embarked.fit_transform(train_set['Embarked'])
embarked_numerical_classes = le_embarked.classes_
print('Classes of Embarked feature:\n{}\n{}'.format(
        np.arange(len(embarked_numerical_classes)), 
        embarked_numerical_classes))
#One hot encoder
enc_embarked = OneHotEncoder(sparse=False)
embarked_onehot = enc_embarked.fit_transform(embarked_numerical.reshape(-1,1))


train_set = pdAssignWithOHLabel(train_set, 'Embarked', 
                                 embarked_onehot, embarked_numerical_classes)

tmp_scaled = pdAssignWithOHLabel(tmp_scaled, 'Embarked', embarked_onehot, 
                                 embarked_numerical_classes)
train_set = train_set.drop("Embarked",axis=1)


Classes of Embarked feature:
[0 1 2]
['C' 'Q' 'S']


In [13]:
sc_tmp = StandardScaler()
tmp = tmp_scaled[['Embarked_C', 'Embarked_Q', 'Embarked_S']].copy()
tmp = pd.DataFrame(sc_tmp.fit_transform(tmp),columns=tmp.columns, index=tmp.index)

# Drop the unscaled features from train_data 
tmp_scaled = tmp_scaled.drop(['Embarked_C', 'Embarked_Q', 'Embarked_S'], 
                             axis=1)

# Assign the scaled features to train_data
tmp_scaled = tmp_scaled.assign(Embarked_C=tmp['Embarked_C'])
tmp_scaled = tmp_scaled.assign(Embarked_Q=tmp['Embarked_Q'])
tmp_scaled = tmp_scaled.assign(Embarked_S=tmp['Embarked_S'])

In [14]:
def knnImpute(ori_arr, tmp_imp_arr, feature, k=6): # improved one
    from sklearn.neighbors import KDTree
    row_idx = ori_arr[pd.isnull(ori_arr[feature])].index.tolist()
    tree = KDTree(tmp_imp_arr) # tmp_arr is the array without 
                           # the null-containing feature
    #row_idx = np.add(row_idx, -1)
    for nan_v in row_idx:
        # Uncomment print statements below to get some more insight
        print('Passenger: {}'.format(nan_v))
        dist, ind = tree.query(tmp_imp_arr[nan_v,:].reshape(1,-1), k)
        print(tmp_imp_arr[nan_v,:])
        print(ind[0])
        nn_vals = ori_arr[feature].loc[ind[0][1:]]
        imp_val = np.floor(np.nanmean(nn_vals))+0.5 
        # Per the documentation on this Kaggle data set, estimated
        # 'Age' values are of the form x.5
        
        print('{} closest neigbors to passenger {} and their values for ' \
        'Age:\n{}\n'.format(k-1, nan_v, ori_arr[feature].loc[ind[0]]))
        print('Imputed value would be {}\n'.format(imp_val))
        ori_arr.set_value(nan_v, feature, imp_val)
    return ori_arr

In [15]:
tmp_imp = tmp_scaled.copy().drop(['Age', 'Embarked'], axis = 1).values
train_set = knnImpute(train_set, tmp_imp, 'Age', 8)
print('New number of null values in "Age" column: {}'.format(
    pd.isnull(train_set['Age']).sum()))

Passenger: 5
[-1.71066854  0.82737724 -0.4745452  -0.47367361 -0.52718504 -0.73769513
  0.73769513 -0.21680296 -0.50665528  0.82338697 -0.42809687 -0.13085598
 -0.23598136 -0.26629582 -0.19611614 -0.19611614 -0.11684125 -0.06715343
 -0.03352008  0.54492498  0.         -0.48204268  3.25137334 -1.62380254]
[  5 116 126 143 196  46 260 280]
7 closest neigbors to passenger 5 and their values for Age:
5       NaN
116    70.5
126     NaN
143    19.0
196     NaN
46      NaN
260     NaN
280    65.0
Name: Age, dtype: float64

Imputed value would be 51.5

Passenger: 17
[-1.66401395 -0.36936484 -0.4745452  -0.47367361 -0.42292983 -0.73769513
  0.73769513 -0.21680296 -0.50665528  0.82338697 -0.42809687 -0.13085598
 -0.23598136 -0.26629582 -0.19611614 -0.19611614 -0.11684125 -0.06715343
 -0.03352008  0.54492498  1.         -0.48204268 -0.30756234  0.61583843]
[ 17 226  33  70  20 288 134 144]
7 closest neigbors to passenger 17 and their values for Age:
17      NaN
226    19.0
33     66.0
70     32.



[-0.55985516  0.82737724  1.34013193 -0.47367361 -0.18763998 -0.73769513
  0.73769513 -0.21680296 -0.50665528  0.82338697 -0.42809687 -0.13085598
 -0.23598136 -0.26629582 -0.19611614 -0.19611614 -0.11684125 -0.06715343
 -0.03352008  0.54492498  1.         -0.48204268  3.25137334 -1.62380254]
[301 364 214  46 188 510 280 260]
7 closest neigbors to passenger 301 and their values for Age:
301     NaN
364     NaN
214    46.5
46     47.5
188    40.0
510    29.0
280    65.0
260    51.5
Name: Age, dtype: float64

Imputed value would be 46.5

Passenger: 303
[-0.55207939 -0.36936484 -0.4745452  -0.47367361 -0.43785065  1.35557354
 -1.35557354 -0.21680296  1.97372855 -1.21449578 -0.42809687 -0.13085598
 -0.23598136 -0.26629582 -0.19611614  5.09901951 -0.11684125 -0.06715343
 -0.03352008 -1.835115    1.         -0.48204268  3.25137334 -1.62380254]
[303 123 717 356 309 128 337 585]
7 closest neigbors to passenger 303 and their values for Age:
303     NaN
123    32.5
717    27.0
356    22.0
309    

7 closest neigbors to passenger 578 and their values for Age:
578     NaN
830    15.0
367    26.5
362    45.0
874    28.0
19     29.5
9      14.0
513    54.0
Name: Age, dtype: float64

Imputed value would be 30.5

Passenger: 584
[ 0.54041574  0.82737724 -0.4745452  -0.47367361 -0.52134985 -0.73769513
  0.73769513 -0.21680296 -0.50665528  0.82338697 -0.42809687 -0.13085598
 -0.23598136 -0.26629582 -0.19611614 -0.19611614 -0.11684125 -0.06715343
 -0.03352008  0.54492498  0.          2.0745051  -0.30756234 -1.62380254]
[584 598 568 531 524 522 661 495]
7 closest neigbors to passenger 584 and their values for Age:
584     NaN
598     NaN
568    27.5
531    24.5
524    24.5
522    24.5
661    40.0
495    24.5
Name: Age, dtype: float64

Imputed value would be 27.5

Passenger: 589
[ 0.55985516  0.82737724 -0.4745452  -0.47367361 -0.53655761 -0.73769513
  0.73769513 -0.21680296 -0.50665528  0.82338697 -0.42809687 -0.13085598
 -0.23598136 -0.26629582 -0.19611614 -0.19611614 -0.11684125 -0.06715

7 closest neigbors to passenger 815 and their values for Age:
815     NaN
872    33.0
536    45.0
671    31.0
690    31.0
263    40.0
745    70.0
170    61.0
Name: Age, dtype: float64

Imputed value would be 44.5

Passenger: 825
[ 1.47739556  0.82737724 -0.4745452  -0.47367361 -0.56180822 -0.73769513
  0.73769513 -0.21680296 -0.50665528  0.82338697 -0.42809687 -0.13085598
 -0.23598136 -0.26629582 -0.19611614 -0.19611614 -0.11684125 -0.06715343
 -0.03352008  0.54492498  0.         -0.48204268  3.25137334 -1.62380254]
[825 790 778 890 749 718 703 629]
7 closest neigbors to passenger 825 and their values for Age:
825     NaN
790    30.5
778    30.5
890    32.0
749    31.0
718    31.5
703    25.0
629    34.5
Name: Age, dtype: float64

Imputed value would be 30.5

Passenger: 826
[ 1.48128344  0.82737724 -0.4745452  -0.47367361  0.57552091 -0.73769513
  0.73769513 -0.21680296 -0.50665528  0.82338697 -0.42809687 -0.13085598
 -0.23598136 -0.26629582 -0.19611614 -0.19611614 -0.11684125 -0.06715

In [16]:
from sklearn.decomposition import PCA

sc_training = StandardScaler()
tmp = train_set.copy().drop(['Survived'], axis = 1).values

sc_training = sc_training.fit(tmp)

train_no_surv = train_set.copy().drop('Survived', axis=1)
train_no_surv = sc_training.transform(train_no_surv)

pca = PCA()
pca.fit(train_no_surv)

t = pca.transform(train_no_surv)

died = t[np.array(train_set['Survived']==0),:]
survived = t[np.array(train_set['Survived']==1),:]
components = [0,1]

from matplotlib import pyplot as pp
pp.scatter(died[:,components[0]].reshape(1,-1), died[:,components[1]].reshape(1,-1), color='red', alpha=.5, label='Died')
pp.scatter(survived[:,components[0]].reshape(1,-1), survived[:,components[1]].reshape(1,-1), color='green', alpha=.5, label='Survived')
pp.legend(loc='best', shadow=False, scatterpoints=1)
pp.title('PCA of train_no_surv')
pp.xlabel('Principal component {}'.format(components[0]+1))
pp.ylabel('Principal component {}'.format(components[1]+1))
pp.show()

<matplotlib.figure.Figure at 0xad647f0>

In [17]:
columns = ['PassengerId',  'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Deck_Unknown', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

In [18]:
train_set_df = train_set
train_labels = train_set['Survived'].values.ravel()
train_set = train_set[columns].values

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

overall_res = pd.DataFrame(columns=['Classifier', 'Best_clf_retrained', 'Best_test_score'])

In [19]:
# NAIVE BAYES

from sklearn.naive_bayes import GaussianNB
clf_name = 'GaussianNB'

sc = StandardScaler()
clf = GaussianNB()

pipeline = Pipeline([('scaler',sc),('clf',clf)])
fit_params = {}

gs = GridSearchCV(pipeline, fit_params, cv = 7, n_jobs =-1)
gs.fit(train_set,train_labels)

cv = pd.DataFrame(gs.cv_results_)

overall_res = overall_res.append({'Classifier': clf_name,'Best_clf_retrained': gs.best_estimator_.fit(train_set, train_labels),'Best_test_score':gs.best_score_},ignore_index=True);
print('Best score in CV fitting: {}'.format(gs.best_score_))

Best score in CV fitting: 0.727272727273




In [20]:
# Desicion Tree clf

from sklearn.tree import DecisionTreeClassifier
clf_name = 'DecisionTree'

sc = StandardScaler()
clf = DecisionTreeClassifier()
pipeline = Pipeline([('scaler',sc),('clf',clf)])

fit_params = {'clf__criterion':['gini', 'entropy'],'clf__min_samples_split':[2,3,4,5,6,7,8,9,10,12,14,16,18,20,22,24,26,28,30,32],
    'clf__min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]}

gs = GridSearchCV(pipeline, fit_params, cv=7, n_jobs=-1)
# Fit GridSearchCV
gs.fit(train_set, train_labels)

cv = pd.DataFrame(gs.cv_results_) # To check all the classifier performances

overall_res = overall_res.append({'Classifier': clf_name, 'Best_clf_retrained': 
                            gs.best_estimator_.fit(train_set, train_labels), 
                            'Best_test_score': gs.best_score_}, ignore_index=True);
print('Best performing classifier parameters (score {}):\n{}'.format(gs.best_score_,gs.best_params_))

Best performing classifier parameters (score 0.828282828283):
{'clf__criterion': 'gini', 'clf__min_samples_leaf': 9, 'clf__min_samples_split': 26}


In [21]:
from sklearn.svm import SVC
clf_name = 'SVM'

sc = StandardScaler()
clf = SVC()
# Make pipeline
pipeline = Pipeline([('scaler',sc),('clf',clf)])
fit_params = {'clf__kernel':['rbf','linear', 'poly'],'clf__degree':[2, 3, 4, 5]}
# Initiate GridSearchCV
gs = GridSearchCV(pipeline, fit_params, cv=7, n_jobs=-1)

gs.fit(train_set, train_labels)

cv = pd.DataFrame(gs.cv_results_) # To check all the classifier performances

overall_res = overall_res.append({'Classifier': clf_name, 'Best_clf_retrained': 
                            gs.best_estimator_.fit(train_set, train_labels), 
                            'Best_test_score': gs.best_score_}, ignore_index=True);
print('Best performing classifier parameters (score {}):\n{}'.format(gs.best_score_,gs.best_params_))

Best performing classifier parameters (score 0.828282828283):
{'clf__degree': 2, 'clf__kernel': 'rbf'}


In [26]:
from sklearn.neighbors import KNeighborsClassifier as KNC

clf_name = 'KNN'

sc = StandardScaler()
clf = KNC()
# Make pipeline
pipeline = Pipeline([('sc', sc),('clf', clf)])
fit_params = {'clf__n_neighbors':[2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,20,25,30,40],
                 'clf__weights':['uniform','distance'],
                 'clf__p':[1, 2]}
# Initiate GridSearchCV
gs = GridSearchCV(pipeline, fit_params, cv=7, n_jobs=-1)
# Fit GridSearchCV
gs.fit(train_set, train_labels)

cv = pd.DataFrame(gs.cv_results_) # To check all the classifier performances

overall_res = overall_res.append({'Classifier': clf_name, 'Best_clf_retrained': 
                            gs.best_estimator_.fit(train_set, train_labels), 
                            'Best_test_score': gs.best_score_}, ignore_index=True);
print('Best performing classifier parameters (score {}):\n{}'.format(gs.best_score_,gs.best_params_))

Best performing classifier parameters (score 0.833894500561):
{'clf__weights': 'uniform', 'clf__p': 1, 'clf__n_neighbors': 10}


In [None]:
# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier as RFC
clf_name = 'RandomForest'

sc = StandardScaler()
clf = RFC()
# Make pipeline
pipeline = Pipeline([('sc', sc),('clf', clf)])
fit_params = {'clf__n_estimators':[100],
              'clf__min_samples_split':[2,3,4,5,6,7,8,9,10,11,12,13,16],
              'clf__min_samples_leaf':[1,2,3,4,5,6,7],
              'clf__max_features':[None,'auto']}
# Initiate GridSearchCV
gs = GridSearchCV(pipeline, fit_params, cv=7, n_jobs=-1)
# Fit GridSearchCV
gs.fit(train_set, train_labels)

cv = pd.DataFrame(gs.cv_results_) # To check all the classifier performances

overall_res = overall_res.append({'Classifier': clf_name, 'Best_clf_retrained': 
                            gs.best_estimator_.fit(train_set, train_labels), 
                            'Best_test_score': gs.best_score_}, ignore_index=True);
print('Best performing classifier parameters (score {}):\n{}'.format(gs.best_score_,gs.best_params_))

In [None]:
# AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier as ABC
clf_name = 'AdaBoost'

sc = StandardScaler()
clf = ABC()
# Make pipeline
pipeline = Pipeline([('sc', sc),('clf', clf)])