# 1. Preparing Data for Consumption

In [109]:
import sys

import pandas as pd
import matplotlib
import numpy as np
import scipy as sp

import IPython
from IPython import display

import sklearn

import random
import time

import warnings
# Suppresses every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas deprecation / chained-assignment
# warnings raised by later cells; consider narrowing the filter to specific
# categories or modules once the notebook is stable.
warnings.filterwarnings('ignore')

In [110]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

In [111]:
# Read the two CSV files shipped with the project (paths are relative to the
# notebook's working directory).
data_raw = pd.read_csv('../data/train.csv')
data_val = pd.read_csv('../data/test.csv')

# Keep data_raw pristine: all cleaning happens on an independent copy
# (DataFrame.copy() is a deep copy by default).
data1 = data_raw.copy()

# Both frames are held by reference so later loops can clean them together.
data_cleaner = [data1, data_val]

# Peek at ten randomly chosen rows of the untouched training data.
data_raw.sample(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
310,311,1,1,"Hays, Miss. Margaret Bechstein",female,24.0,0,0,11767,83.1583,C54,C
539,540,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22.0,0,2,13568,49.5,B39,C
553,554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22.0,0,0,2620,7.225,,C
527,528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S
261,262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3.0,4,2,347077,31.3875,,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S
39,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C
487,488,0,1,"Kent, Mr. Edward Austin",male,58.0,0,0,11771,29.7,B37,C
785,786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25.0,0,0,374887,7.25,,S
741,742,0,1,"Cavendish, Mr. Tyrell William",male,36.0,1,0,19877,78.85,C46,S


In [112]:
# Per-column missing-value counts for both frames, separated by a dashed rule.
print(data1.isna().sum(), '-' * 20, data_val.isna().sum(), sep='\n')

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
--------------------
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [113]:
### COMPLETING: Imputing

# Fill the missing values in both frames, each from its own statistics:
#   Age      -> median age of that frame
#   Embarked -> most frequent value of that frame
#   Fare     -> median fare of that frame
for dataset in data_cleaner:
    # Assign the filled column back to the frame. The previous
    # `dataset[col].fillna(..., inplace=True)` form operates on a column
    # selection, which pandas deprecates and which has no effect on the
    # frame when Copy-on-Write is enabled.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

# Columns removed from the training copy only; data_val keeps them.
drop_columns = ['PassengerId', 'Cabin', 'Ticket']
# The drop must stay in place: data_cleaner holds a reference to this exact
# object, so rebinding `data1` to a new frame would detach it from the list.
# errors='ignore' lets the cell be re-run after the columns are already gone.
data1.drop(drop_columns, axis=1, inplace=True, errors='ignore')

print(data1.isnull().sum())
print('-' * 20)
print(data_val.isnull().sum())

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
--------------------
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


In [114]:
### CREATING: Feature Engineering

# Derive the same engineered columns on both frames.
for dataset in data_cleaner:

    # Size of the travelling party: SibSp + Parch, plus one
    # (presumably the passenger themself).
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # IsAlone starts at 1 and is cleared for every row with FamilySize > 1.
    # A single .loc[row_mask, column] assignment on the frame is used instead
    # of the chained `dataset['IsAlone'].loc[...] = 0`, which writes through an
    # intermediate Series and may never reach the frame.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    # Names look like "Hays, Miss. Margaret Bechstein": the title is the text
    # between the first ", " and the following ".".
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

    # Fare is cut into 4 quantile bins; Age (truncated to int) into 5
    # equal-width bins. Bin edges are computed separately for each frame.
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)

# Titles occurring fewer than stat_min times in the training frame are merged
# into a single 'Misc' bucket.
stat_min = 10

# Boolean Series indexed by title: True where the title is rare.
title_names = (data1['Title'].value_counts() < stat_min)

# NOTE(review): only data1 is regrouped here; data_val keeps its raw titles,
# so the two frames can end up with different Title categories - confirm this
# is intended before relying on Title in data_val.
data1['Title'] = data1['Title'].apply(lambda x: 'Misc' if title_names.loc[x] else x)

data1['Title'].value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64

In [115]:
# Spot-check ten random rows of the cleaned / engineered training frame.
data1.sample(n=10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin
878,0,3,"Laleff, Mr. Kristo",male,28.0,0,0,7.8958,S,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
769,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32.0,0,0,8.3625,S,1,1,Mr,"(7.91, 14.454]","(16.0, 32.0]"
890,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,7.75,Q,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
170,0,1,"Van der hoef, Mr. Wyckoff",male,61.0,0,0,33.5,S,1,1,Mr,"(31.0, 512.329]","(48.0, 64.0]"
484,1,1,"Bishop, Mr. Dickinson H",male,25.0,1,0,91.0792,C,2,0,Mr,"(31.0, 512.329]","(16.0, 32.0]"
814,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,8.05,S,1,1,Mr,"(7.91, 14.454]","(16.0, 32.0]"
644,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,19.2583,C,4,0,Miss,"(14.454, 31.0]","(-0.08, 16.0]"
540,1,1,"Crosby, Miss. Harriet R",female,36.0,0,2,71.0,S,3,0,Miss,"(31.0, 512.329]","(32.0, 48.0]"
154,0,3,"Olsen, Mr. Ole Martin",male,28.0,0,0,7.3125,S,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
832,0,3,"Saad, Mr. Amin",male,28.0,0,0,7.2292,C,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"


In [121]:
### CONVERTING: Label Encoder

label = LabelEncoder()

# Source column -> name of the integer-coded column derived from it.
# Insertion order fixes the order in which the new columns are appended.
code_columns = {
    'Sex': 'Sex_Code',
    'Embarked': 'Embarked_Code',
    'Title': 'Title_Code',
    'AgeBin': 'AgeBin_Code',
    'FareBin': 'FareBin_Code',
}

# The single encoder is re-fit for every column of every frame, so the integer
# codes are only meaningful within one frame/column pair.
# NOTE(review): codes are therefore not guaranteed to line up between data1
# and data_val - verify before using *_Code columns across both frames.
for dataset in data_cleaner:
    for source_col, coded_col in code_columns.items():
        dataset[coded_col] = label.fit_transform(dataset[source_col])

# Target = ['Survived']

# Feature columns fed to the one-hot encoding below.
data1_x = [
    'Sex', 'Pclass', 'Embarked', 'Title', 'SibSp',
    'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone',
]
# data1_x_calc = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code','SibSp', 'Parch', 'Age', 'Fare'] 
# data1_xy =  Target + data1_x
# print('Original X Y: ', data1_xy, '\n')

# data1_x_bin = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'AgeBin_Code', 'FareBin_Code']
# data1_xy_bin = Target + data1_x_bin
# print('Bin X Y: ', data1_xy_bin, '\n')

# One-hot encode the string-valued feature columns; numeric columns pass through.
selected_features = data1[data1_x]
data1_dummy = pd.get_dummies(selected_features)
# data1_x_dummy = data1_dummy.columns.tolist()
# data1_xy_dummy = Target + data1_x_dummy
# print('Dummy X Y: ', data1_xy_dummy, '\n')

data1_dummy.head(5)

Unnamed: 0,Pclass,SibSp,Parch,Age,Fare,FamilySize,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,3,1,0,22.0,7.25,2,0,0,1,0,0,1,0,0,0,1,0
1,1,1,0,38.0,71.2833,2,0,1,0,1,0,0,0,0,0,0,1
2,3,0,0,26.0,7.925,1,1,1,0,0,0,1,0,0,1,0,0
3,1,1,0,35.0,53.1,2,0,1,0,0,0,1,0,0,0,0,1
4,3,0,0,35.0,8.05,1,1,0,1,0,0,1,0,0,0,1,0
