In [1]:
import string
import numpy as np
import pandas as pd
%config IPCompleter.greedy=True

In [2]:
def substrings_in_string(big_string, substrings):
	"""Return the first candidate from ``substrings`` found inside ``big_string``.

	Candidates are tried in the order given, so when one candidate is
	contained in another (e.g. 'Mr' inside 'Mrs') the longer one has to be
	listed first to be detected.

	Parameters
	----------
	big_string : str
		Text to search.
	substrings : iterable of str
		Candidate substrings, in priority order.

	Returns
	-------
	str or None
		The first candidate that occurs in ``big_string``, or None when
		none of them does.
	"""
	for substring in substrings:
		# The ``in`` operator replaces ``string.find``: that module-level
		# function only exists in Python 2 and fails on Python 3.
		if substring in big_string:
			return substring
	return None

def replace_titles(x):
	"""Collapse a passenger's raw title into a smaller set of titles.

	Parameters
	----------
	x : mapping (for example a DataFrame row)
		Must provide ``x['Title']``; ``x['Sex']`` is read only when the
		title is 'Dr'.

	Returns
	-------
	'Mr', 'Mrs' or 'Miss' for the titles handled below; any other title
	(for example 'Master') is returned unchanged.
	"""
	title = x['Title']
	if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
		return 'Mr'
	elif title in ['Countess', 'Mme']:
		return 'Mrs'
	elif title in ['Mlle', 'Ms']:
		return 'Miss'
	elif title == 'Dr':
		# Compare case-insensitively: the Sex values shown in this notebook
		# are lower-case ('male'/'female'), so the former check against
		# 'Male' never matched and every doctor was mapped to 'Mrs'.
		if str(x['Sex']).lower() == 'male':
			return 'Mr'
		else:
			return 'Mrs'
	else:
		return title


In [3]:
# Read the CSV into a DataFrame; the relative path is resolved against the
# kernel's current working directory.
train_set = pd.read_csv(filepath_or_buffer="datasets/train.csv")


In [4]:
# Titles searched for in the passenger names. The first substring hit wins,
# so 'Mrs' is listed ahead of 'Mr'.
title_list = [
    'Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
    'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess',
    'Don', 'Jonkheer',
]

# Extract the raw title from each name, then collapse it with replace_titles.
train_set['Title'] = train_set["Name"].map(
    lambda name: substrings_in_string(name, title_list))
train_set['Title'] = train_set.apply(replace_titles, axis=1)

# Turn cabin into deck: missing cabins become the literal 'Unknown', then the
# first matching entry of cabin_list is stored as the deck.
train_set['Cabin'] = train_set['Cabin'].fillna('Unknown')
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
train_set['Deck'] = train_set['Cabin'].map(
    lambda cabin: substrings_in_string(cabin, cabin_list))

# Summary of the categorical columns (printed by .info()).
train_set[['Sex', 'Embarked', 'Title', 'Deck']].info()
# Original note: Embarked has 2 null

# Selects the rows with a missing Embarked value; the result is neither
# stored nor displayed because this is not the last expression of the cell.
train_set.loc[train_set["Embarked"].isnull()]
# Displayed only: the result is not assigned, so train_set keeps these columns.
train_set.drop(['Name', 'Ticket', 'Cabin'], axis=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
Sex         891 non-null object
Embarked    891 non-null object
Title       891 non-null object
Deck        891 non-null object
dtypes: object(4)
memory usage: 27.9+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Deck
0,1,0,3,male,22.0,1,0,7.2500,S,Mr,Unknown
1,2,1,1,female,38.0,1,0,71.2833,C,Mrs,C
2,3,1,3,female,26.0,0,0,7.9250,S,Miss,Unknown
3,4,1,1,female,35.0,1,0,53.1000,S,Mrs,C
4,5,0,3,male,35.0,0,0,8.0500,S,Mr,Unknown
5,6,0,3,male,,0,0,8.4583,Q,Mr,Unknown
6,7,0,1,male,54.0,0,0,51.8625,S,Mr,E
7,8,0,3,male,2.0,3,1,21.0750,S,Master,Unknown
8,9,1,3,female,27.0,0,2,11.1333,S,Mrs,Unknown
9,10,1,2,female,14.0,1,0,30.0708,C,Mrs,Unknown


In [5]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the three categorical columns. Each fitted encoder, its
# encoded values and its class labels are kept under their own names.
le_sex = LabelEncoder()
sex_numerical = le_sex.fit_transform(train_set['Sex'])
sex_numerical_classes = le_sex.classes_

le_title = LabelEncoder()
title_numerical = le_title.fit_transform(train_set['Title'])
title_numerical_classes = le_title.classes_

le_deck = LabelEncoder()
deck_numerical = le_deck.fit_transform(train_set['Deck'])
deck_numerical_classes = le_deck.classes_

# One report per feature: the integer codes on one line, the class labels on
# the next. Reports are separated by a blank line.
class_reports = [
    'Classes of Sex feature:\n{}\n{}'.format(
        np.arange(len(sex_numerical_classes)), sex_numerical_classes),
    'Classes of Title feature:\n{}\n{}'.format(
        np.arange(len(title_numerical_classes)), title_numerical_classes),
    'Classes of Cabin_cat feature:\n{}\n{}'.format(
        np.arange(len(deck_numerical_classes)), deck_numerical_classes),
]
print('\n\n'.join(class_reports))

Classes of Sex feature:
[0 1]
['female' 'male']

Classes of Title feature:
[0 1 2 3]
['Master' 'Miss' 'Mr' 'Mrs']

Classes of Cabin_cat feature:
[0 1 2 3 4 5 6 7 8]
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'T' 'Unknown']


In [6]:
from sklearn.preprocessing import OneHotEncoder


def _dense_one_hot_encoder():
    """Build a OneHotEncoder that returns dense arrays.

    Recent scikit-learn releases spell the flag ``sparse_output``; the older
    ``sparse`` keyword has been removed there. Older releases do not know
    ``sparse_output`` and raise TypeError for it, in which case the legacy
    spelling is used instead.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        return OneHotEncoder(sparse=False)


# Each encoder is fitted on the integer codes reshaped to a single column
# (one row per passenger).

# Sex feature
enc_sex = _dense_one_hot_encoder()
sex_onehot = enc_sex.fit_transform(sex_numerical.reshape(-1, 1))

# Title feature
enc_title = _dense_one_hot_encoder()
title_onehot = enc_title.fit_transform(title_numerical.reshape(-1, 1))

# Cabin_cat feature
enc_deck = _dense_one_hot_encoder()
deck_onehot = enc_deck.fit_transform(deck_numerical.reshape(-1, 1))


In [7]:
def pdAssignWithOHLabel(df, column, onehot_labeled, class_labels):
    """Return ``df`` extended with one column per one-hot encoded class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. ``DataFrame.assign`` returns a new frame, so the
        argument itself is left unchanged.
    column : str
        Prefix of the new column names.
    onehot_labeled : 2-D array
        Indexed as ``onehot_labeled[:, k]``; slice ``k`` becomes the column
        for ``class_labels[k]``.
    class_labels : iterable
        One label per one-hot column. Labels need not be strings (integer
        classes work too); they are formatted into the column name.

    Returns
    -------
    pandas.DataFrame
        The frame with the added ``<column>_<label>`` columns.
    """
    # format() instead of '+' so that non-string labels do not raise TypeError.
    to_assign = {
        '{}_{}'.format(column, label): onehot_labeled[:, c_idx]
        for c_idx, label in enumerate(class_labels)
    }
    return df.assign(**to_assign)

In [8]:
# For each encoded feature, append its one-hot columns and then remove the
# original categorical column, in the order Sex, Title, Deck.
for feature, feature_onehot, feature_classes in (
        ("Sex", sex_onehot, sex_numerical_classes),
        ("Title", title_onehot, title_numerical_classes),
        ("Deck", deck_onehot, deck_numerical_classes)):
    train_set = pdAssignWithOHLabel(
        train_set, feature, feature_onehot, feature_classes)
    train_set = train_set.drop(feature, axis=1)

# Remove the remaining text columns.
train_set = train_set.drop(["Name", "Ticket", "Cabin"], axis=1)

# Show the first rows of the result.
train_set.head(5)


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male,...,Title_Mrs,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown
0,1,0,3,22.0,1,0,7.25,S,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,1,1,38.0,1,0,71.2833,C,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,3,26.0,0,0,7.925,S,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,1,1,35.0,1,0,53.1,S,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,3,35.0,0,0,8.05,S,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
# Cap extreme fares: any fare more than five standard deviations above the
# mean is replaced by that threshold.
mu = train_set['Fare'].mean()
sd = train_set['Fare'].std()

row_mask = train_set['Fare'] > mu + 5*sd
# Boolean-mask assignment through .loc replaces DataFrame.set_value, which
# was deprecated and then removed from pandas and was meant for single cells.
train_set.loc[row_mask, 'Fare'] = mu + 5*sd

  """


In [10]:
from sklearn.preprocessing import StandardScaler

sc_tmp = StandardScaler()

# Work on a copy of the data without the columns that are not scaled here.
tmp_scaled = train_set.copy().drop(['Embarked', 'Age', 'Survived'], axis=1)

# fit_transform returns a bare array, so the column names and the index are
# reattached when rebuilding the frame.
tmp_scaled = pd.DataFrame(
    sc_tmp.fit_transform(tmp_scaled),
    columns=tmp_scaled.columns,
    index=tmp_scaled.index,
)

# Add the non-scaled features to this temporary DataFrame, one assign call
# each so they are appended in this order.
tmp_scaled = (
    tmp_scaled
    .assign(Survived=train_set['Survived'])
    .assign(Embarked=train_set['Embarked'])
    .assign(Age=train_set['Age'])
)

In [11]:
from sklearn.neighbors import KDTree

# Feature matrix for the neighbour search: the scaled frame without the
# columns that were added back unscaled.
tmp = tmp_scaled.copy().drop(['Survived', 'Age', 'Embarked'], axis=1).values

# Mask of rows whose Embarked value is missing (not used further in this cell).
row_idx = pd.isnull(train_set['Embarked'])

tree = KDTree(tmp)

# The k nearest neighbors include the passenger itself,
# so we specify k=6 to get the 5 nearest neighbors
dist, ind = tree.query(tmp[[62, 830]], k=6)

neighbor_report = '5 closest neigbors to passenger {} and their values for Embarked:\n{}\n'
for neighbor_rows in ind:
    # First entry is reported as the passenger, the rest as its neighbours.
    print(neighbor_report.format(
        neighbor_rows[0], train_set['Embarked'].loc[neighbor_rows[1:]]))

5 closest neigbors to passenger 62 and their values for Embarked:
224    S
137    S
336    S
110    S
55     S
Name: Embarked, dtype: object

5 closest neigbors to passenger 830 and their values for Embarked:
617    S
797    S
578    C
559    S
874    C
Name: Embarked, dtype: object



In [12]:
#Label Encoder
le_embarked = LabelEncoder()
embarked_numerical = le_embarked.fit_transform(train_set['Embarked'])
embarked_numerical_classes = le_embarked.classes_
print('Classes of Embarked feature:\n{}\n{}'.format(
        np.arange(len(embarked_numerical_classes)), 
        embarked_numerical_classes))
#One hot encoder
enc_embarked = OneHotEncoder(sparse=False)
embarked_onehot = enc_embarked.fit_transform(embarked_numerical.reshape(-1,1))


train_set = pdAssignWithOHLabel(train_set, 'Embarked', 
                                 embarked_onehot, embarked_numerical_classes)

tmp_scaled = pdAssignWithOHLabel(tmp_scaled, 'Embarked', embarked_onehot, 
                                 embarked_numerical_classes)
train_set = train_set.drop("Embarked",axis=1)


Classes of Embarked feature:
[0 1 2]
['C' 'Q' 'S']


In [13]:
embarked_dummy_columns = ['Embarked_C', 'Embarked_Q', 'Embarked_S']

# Fit a fresh scaler on the three Embarked indicator columns only.
sc_tmp = StandardScaler()
tmp = tmp_scaled[embarked_dummy_columns].copy()
tmp = pd.DataFrame(
    sc_tmp.fit_transform(tmp),
    columns=tmp.columns,
    index=tmp.index,
)

# Drop the unscaled indicator columns from tmp_scaled
tmp_scaled = tmp_scaled.drop(embarked_dummy_columns, axis=1)

# Assign the scaled indicator columns to tmp_scaled, one assign call each so
# they are appended in this order.
tmp_scaled = (
    tmp_scaled
    .assign(Embarked_C=tmp['Embarked_C'])
    .assign(Embarked_Q=tmp['Embarked_Q'])
    .assign(Embarked_S=tmp['Embarked_S'])
)