In [1]:
import numpy as np
import pandas as pd
import string 
from patsy import dmatrices
from operator import itemgetter
#model 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, GradientBoostingRegressor 
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
import xgboost as xgb
#evaluation metrics 
from sklearn.metrics import classification_report
from sklearn.externals import joblib
#visulisation 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 



## 1. Read Files

In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [3]:
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
test_df['Survived'] = 0
all_df = train_df.append(test_df)

In [110]:
test_df.shape

(418, 12)

## 2. Feature Engineering - category features

Transform the category features 
1. Fare
2. Pclass
3. Family Size
4. Age group
5. Name length

In [6]:
def drop_col_not_req(df, cols):
    df.drop(cols, axis = 1, inplace = True)

def fare_category(fare):
    if (fare <= 4):
        return 'Very_Low_Fare'
    elif (fare <= 10):
        return 'Low_Fare'
    elif (fare <= 30):
        return 'Med_Fare'
    elif (fare <= 45):
        return 'High_Fare'
    else:
        return 'Very_High_Fare'

def pclass_fare_category(df, Pclass_1_mean_fare, Pclass_2_mean_fare, Pclass_3_mean_fare):
    if (df['Pclass'] == 1):
        if (df['Fare'] <= Pclass_1_mean_fare):
            return 'Pclass_1_Low_Fare'
        else:
            return 'Pclass_1_High_Fare'
    elif (df['Pclass'] == 2):
        if (df['Fare'] <= Pclass_2_mean_fare):
            return 'Pclass_2_Low_Fare'
        else:
            return 'Pclass_2_High_Fare'
    elif (df['Pclass'] == 3):
        if (df['Fare'] <= Pclass_3_mean_fare):
            return 'Pclass_3_Low_Fare'
        else:
            return 'Pclass_3_High_Fare'

def family_size_category(family_size):
    if (family_size <= 1):
        return 'Single'
    elif (family_size <= 3):
        return 'Small_Family'
    else:
        return 'Large_Family'

def age_group_cat(age):
    if (age <= 1):
        return 'Baby'
    if (age <= 4):
        return 'Toddler'
    elif(age <= 12):
        return 'Child'
    elif (age <= 19):
        return 'Teenager'
    elif (age <= 30):
        return 'Adult'
    elif (age <= 50):
        return 'Middle_Aged'
    elif(age < 60):
        return 'Senior_Citizen'
    else:
        return 'Old'

def name_len_category(name_len):
    if (name_len <= 19):
        return 'Very_Short_Name'
    elif (name_len <= 28):
        return 'Short_Name'
    elif (name_len <= 45):
        return 'Medium_Name'
    else:
        return 'Long_Name'

## 3. Handling missing values

Use **GradientBoostingRegressor** and **LinearRegression** to fill the missing value 

In [7]:
def fill_missing_age(missing_age_train, missing_age_test):
    missing_age_X_train = missing_age_train.drop(['Age'], axis = 1)
    missing_age_y_train = missing_age_train['Age']
    missing_age_X_test = missing_age_test.drop(['Age'], axis = 1)
    
    #gridsearch for best parameters fit for GradientBoostingRegressor
    gbm_reg = GradientBoostingRegressor(random_state = 42)
    gbm_reg_param_grid = {'n_estimators': [2000], 'max_depth': [3], 'learning_rate': [0.01], 'max_features': [3]}
    gbm_reg_grid = GridSearchCV(gbm_reg, gbm_reg_param_grid, cv = 10, n_jobs = 25, verbose = 1, scoring = 'neg_mean_squared_error')
    gbm_reg_grid.fit(missing_age_X_train, missing_age_y_train)
    
    print("Age feature Best GB Params: " + str(gbm_reg_grid.best_params_))
    print("Age feature Best GB Score: " + str(gbm_reg_grid.best_score_))
    print("GB Train Error for 'Age' Feature Regressor: " + str(gbm_reg_grid.score(missing_age_X_train, missing_age_y_train)))
    
    missing_age_test['Age_GB'] = gbm_reg_grid.predict(missing_age_X_test)
    print(missing_age_test['Age_GB'][:4])
    
    #gridsearch for best parameters fit for LinearRegression
    lrf_reg = LinearRegression()
    lrf_reg_param_grid = {'fit_intercept': [True], 'normalize': [True]}
    lrf_reg_grid = GridSearchCV(lrf_reg, lrf_reg_param_grid, cv = 10, n_jobs = 25, verbose = 1, scoring = 'neg_mean_squared_error')
    lrf_reg_grid.fit(missing_age_X_train, missing_age_y_train)
    
    print("Age feature Best LR Params: " + str(lrf_reg_grid.best_params_))
    print("Age feature Best LR Score: " + str(lrf_reg_grid.best_score_))
    print("LR Train Error for 'Age' Feature Regressor: " + str(lrf_reg_grid.score(missing_age_X_train, missing_age_y_train)))
    
    missing_age_test['Age_LRF'] = lrf_reg_grid.predict(missing_age_X_test)
    print(missing_age_test['Age_LRF'][:4])
    
    missing_age_test['Age'] = missing_age_test[['Age_GB', 'Age_LRF']].mean(axis = 1)

    print(missing_age_test['Age'][:4])
    drop_col_not_req(missing_age_test, ['Age_GB', 'Age_LRF'])

    return missing_age_test

## 4. Pick top 'N' features 

Pick up top 'N' features in different ensemble models (**RandomForestClassifier**, **AdaBoostClassifier**, and **ExtraTreesClassifier**)

In [57]:
def get_top_n_features(titanic_train_data_X, titanic_train_data_y, top_n_features):
    #random forest 
    rf_est = RandomForestClassifier(random_state = 42)
    rf_param_grid = {'n_estimators' : [500], 'min_samples_split':[2, 3], 'max_depth':[20]}
    rf_grid =GridSearchCV(rf_est, rf_param_grid, n_jobs = 25, cv = 10, verbose = 1)
    rf_grid.fit(titanic_train_data_X, titanic_train_data_y)
    
    print("Top N Features Best RF Params: " + str(rf_grid.best_params_))
    print("Top N Features Best RF Score: " + str(rf_grid.best_score_))
    print("Top N Features RF Train Error: " + str(rf_grid.score(titanic_train_data_X, titanic_train_data_y)))

    feature_imp_sorted_rf = pd.DataFrame({'feature': list(titanic_train_data_X), 
                                          'importance': rf_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending = False)
    features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
    print("Sample 25 Features from RF Classifier:")
    print(str(features_top_n_rf[:25]))
    
    #ada boost 
    ada_est = AdaBoostClassifier(random_state = 42)
    ada_param_grid = {'n_estimators' : [500], 'learning_rate': [0.5, 0.6]}
    ada_grid = GridSearchCV(ada_est, ada_param_grid, n_jobs = 25, cv = 10, verbose = 1)
    ada_grid.fit(titanic_train_data_X, titanic_train_data_y)
    
    print("Top N Features Best Ada Params: " + str(ada_grid.best_params_))
    print("Top N Features Best Ada Score: " + str(ada_grid.best_score_))
    print("Top N Features Ada Train Error: " + str(ada_grid.score(titanic_train_data_X, titanic_train_data_y)))
    
    feature_imp_sorted_ada = pd.DataFrame({'feature': list(titanic_train_data_X), 'importance': ada_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending = False)
    features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)['feature']
    print("Sample 25 Features from Ada Classifier:")
    print(str(features_top_n_ada[:25]))
    
    #extra tree 
    et_est = ExtraTreesClassifier(random_state = 42)
    et_param_grid = {'n_estimators' : [500], 'min_samples_split':[3, 4], 'max_depth':[15]}
    et_grid = GridSearchCV(et_est, et_param_grid, n_jobs = 25, cv = 10, verbose = 1)
    et_grid.fit(titanic_train_data_X, titanic_train_data_y)
    
    print("Top N Features Best ET Params: " + str(et_grid.best_params_))
    print("Top N Features Best ET Score: " + str(et_grid.best_score_))
    print("Top N Features ET Train Error: " + str(et_grid.score(titanic_train_data_X, titanic_train_data_y)))
    
    feature_imp_sorted_et = pd.DataFrame({'feature': list(titanic_train_data_X), 'importance': et_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending = False)
    features_top_n_et = feature_imp_sorted_et.head(top_n_features)['feature']
    print("Sample 25 Features from ET Classifier:")
    print(str(features_top_n_et[:25]))
    
    #### Merge top_n_features from all three models
    features_top_n = pd.concat([features_top_n_rf, features_top_n_ada, features_top_n_et], ignore_index = True).drop_duplicates()
    
    return features_top_n

## 5. Creating Features

### 5.1 Embarked

In [9]:
print(all_df.groupby(['Survived', 'Embarked'])['Survived'].count())
print(all_df['PassengerId'].groupby(by = all_df['Embarked']).count().sort_values(ascending = False))
print(all_df['Fare'].groupby(by = all_df['Embarked']).mean().sort_values(ascending = False))


Survived  Embarked
0         C           177
          Q            93
          S           697
1         C            93
          Q            30
          S           217
Name: Survived, dtype: int64
Embarked
S    914
C    270
Q    123
Name: PassengerId, dtype: int64
Embarked
C    62.336267
S    27.418824
Q    12.409012
Name: Fare, dtype: float64


In [10]:
if (all_df['Embarked'].isnull().sum() != 0):
    all_df['Embarked'].fillna(all_df['Embarked'].mode().iloc[0], inplace=True)

emb_dummies_df = pd.get_dummies(all_df['Embarked'],
                                prefix = all_df[['Embarked']].columns[0])
all_df = pd.concat([all_df, emb_dummies_df], axis = 1)
all_df.head(5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Embarked_C,Embarked_Q,Embarked_S
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0,A/5 21171,0,0,1
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1,PC 17599,1,0,0
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1,STON/O2. 3101282,0,0,1
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1,113803,0,0,1
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0,373450,0,0,1


### 5.2 Sex 

In [11]:
sex_dummies_df = pd.get_dummies(all_df['Sex'],
                                prefix = all_df[['Sex']].columns[0])
all_df = pd.concat([all_df, sex_dummies_df], axis = 1)
all_df.head(3)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0,A/5 21171,0,0,1,0,1
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1,PC 17599,1,0,0,1,0
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1,STON/O2. 3101282,0,0,1,1,0


### 5.3 Name Title

In [12]:
all_df['Title'] = all_df['Name'].str.extract('.+,(.+)').str.extract('^(.+?)\.').str.strip()
print(all_df['Title'].unique())
print(all_df['Title'].groupby(by = all_df['Title']).count().sort_values(ascending = False))

['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer' 'Dona']
Title
Mr              757
Miss            260
Mrs             197
Master           61
Dr                8
Rev               8
Col               4
Ms                2
Mlle              2
Major             2
Don               1
Dona              1
the Countess      1
Jonkheer          1
Lady              1
Sir               1
Mme               1
Capt              1
Name: Title, dtype: int64


  if __name__ == '__main__':


In [13]:
title_Dict = {}
title_Dict.update(dict.fromkeys(["Capt", "Col", "Major", "Dr", "Rev"], "Officer"))
title_Dict.update(dict.fromkeys(["Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"], "Royalty"))
title_Dict.update(dict.fromkeys(["Mme", "Ms", "Mrs"], "Mrs"))
title_Dict.update(dict.fromkeys(["Mlle", "Miss"], "Miss"))
title_Dict.update(dict.fromkeys(["Mr", "Ms"], "Mr"))
title_Dict.update(dict.fromkeys(["Master"], "Master"))

In [16]:
all_df['Title'] = all_df['Title'].map(title_Dict)
print(all_df['Title'].groupby(by = all_df['Title']).count().sort_values(ascending = False))

title_dummies_df = pd.get_dummies(all_df['Title'],
                                prefix = all_df[['Title']].columns[0])
all_df = pd.concat([all_df, title_dummies_df], axis = 1)

Title
Mr         759
Miss       262
Mrs        198
Master      61
Officer     23
Royalty      6
Name: Title, dtype: int64


### 5.4 Name length

In [17]:
all_df['Name_Length'] = all_df['Name'].str.len()
print(all_df['Name_Length'].groupby(by = all_df['Name_Length']).count().sort_values(ascending = False)[:5])

Name_Length
25    83
19    82
18    75
26    73
27    70
Name: Name_Length, dtype: int64


In [18]:
all_df['Name_Length_Category'] = all_df['Name_Length'].map(name_len_category)
print(all_df['Name_Length_Category'].groupby(by = all_df['Name_Length_Category']).count().sort_values(ascending = False))

le_fare = preprocessing.LabelEncoder()
le_fare.fit(np.array(['Very_Short_Name', 'Short_Name', 'Medium_Name', 'Long_Name', 'Very_High_Fare']))
all_df['Name_Length_Category'] = le_fare.transform(all_df['Name_Length_Category'])

print(all_df[['Name_Length_Category', 'Survived']].corr())

first_name_dummies_df = pd.get_dummies(all_df['Name_Length_Category'],
                                prefix = all_df[['Name_Length_Category']].columns[0])
all_df = pd.concat([all_df, first_name_dummies_df], axis = 1)
all_df.head(3)

Name_Length_Category
Short_Name         592
Medium_Name        337
Very_Short_Name    292
Long_Name           88
Name: Name_Length_Category, dtype: int64
                      Name_Length_Category  Survived
Name_Length_Category              1.000000 -0.209793
Survived                         -0.209793  1.000000


Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,...,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Name_Length,Name_Length_Category,Name_Length_Category_0,Name_Length_Category_1,Name_Length_Category_2,Name_Length_Category_4
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,...,1,0,0,0,23,2,0,0,1,0
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,...,0,1,0,0,51,0,1,0,0,0
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,...,0,0,0,0,22,2,0,0,1,0


### 5.5 Fare

In [23]:
#filling the missing value using mean in Fare
if (all_df['Fare'].isnull().sum() != 0):
    all_df['Fare'] = all_df[['Fare']].fillna(all_df.groupby('Pclass').mean(axis=1))

In [25]:
#average fare for the tickets 
all_df['Group_Ticket'] = all_df['Fare'].groupby(by = all_df['Ticket']).transform('count')
all_df['Fare'] = all_df['Fare']/all_df['Group_Ticket']
all_df.drop(['Group_Ticket'], axis = 1, inplace = True)
all_df.head(5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,...,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Name_Length,Name_Length_Category,Name_Length_Category_0,Name_Length_Category_1,Name_Length_Category_2,Name_Length_Category_4
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,...,1,0,0,0,23,2,0,0,1,0
1,38.0,C85,C,35.64165,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,...,0,1,0,0,51,0,1,0,0,0
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,...,0,0,0,0,22,2,0,0,1,0
3,35.0,C123,S,26.55,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,...,0,1,0,0,44,1,0,1,0,0
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,...,1,0,0,0,24,2,0,0,1,0


In [26]:
#check if there are wired value, which is fare =0 
if (sum(n == 0 for n in all_df.Fare.values.flatten()) > 0):
    all_df.loc[all_df.Fare == 0, 'Fare'] = np.nan
    all_df['Fare'] = all_df[['Fare']].fillna(all_df.groupby('Pclass').transform('mean'))

all_df['Fare'].describe()

count    1309.000000
mean       15.017884
std        13.529548
min         3.170800
25%         7.666667
50%         8.300000
75%        15.050000
max       128.082300
Name: Fare, dtype: float64

In [27]:
#transform fare to fare_category 
all_df['Fare_Category'] = all_df['Fare'].map(fare_category)
le_fare = preprocessing.LabelEncoder()
le_fare.fit(np.array(['Very_Low_Fare', 'Low_Fare', 'Med_Fare', 'High_Fare', 'Very_High_Fare']))
all_df['Fare_Category'] = le_fare.transform(all_df['Fare_Category'])

fare_cat_dummies_df = pd.get_dummies(all_df['Fare_Category'],
                                prefix = all_df[['Fare_Category']].columns[0])
all_df = pd.concat([all_df, fare_cat_dummies_df], axis = 1)

print(all_df['Fare_Category'].groupby(by = all_df['Fare_Category']).count().sort_values(ascending = False))

Fare_Category
1    745
2    408
0    112
3     40
4      4
Name: Fare_Category, dtype: int64


### 5.6 Pclass

In [28]:
print(all_df['Fare'].groupby(by = all_df['Pclass']).mean())
Pclass_1_mean_fare = all_df['Fare'].groupby(by = all_df['Pclass']).mean().get([1]).values[0]
Pclass_2_mean_fare = all_df['Fare'].groupby(by = all_df['Pclass']).mean().get([2]).values[0]
Pclass_3_mean_fare = all_df['Fare'].groupby(by = all_df['Pclass']).mean().get([3]).values[0]

Pclass
1    34.661682
2    11.663652
3     7.379203
Name: Fare, dtype: float64


In [29]:
#category variable from Pclass and Fare 
all_df['Pclass_Fare_Category'] = all_df.apply(pclass_fare_category, args=(Pclass_1_mean_fare, Pclass_2_mean_fare, Pclass_3_mean_fare), axis = 1)
print(all_df['Pclass_Fare_Category'].groupby(by = all_df['Pclass_Fare_Category']).count().sort_values(ascending = False))

le_fare = preprocessing.LabelEncoder()
le_fare.fit(np.array(['Pclass_1_Low_Fare', 'Pclass_1_High_Fare', 'Pclass_2_Low_Fare', 'Pclass_2_High_Fare', 'Pclass_3_Low_Fare', 'Pclass_3_High_Fare']))
all_df['Pclass_Fare_Category'] = le_fare.transform(all_df['Pclass_Fare_Category'])

Pclass_Fare_Category
Pclass_3_High_Fare    432
Pclass_3_Low_Fare     277
Pclass_1_Low_Fare     209
Pclass_2_High_Fare    155
Pclass_2_Low_Fare     122
Pclass_1_High_Fare    114
Name: Pclass_Fare_Category, dtype: int64


In [30]:
print(all_df['Fare'].groupby(by = all_df['Pclass']).mean().sort_values(ascending = True))
all_df['Pclass'].replace([1, 2, 3],[Pclass_1_mean_fare, Pclass_2_mean_fare, Pclass_3_mean_fare], inplace = True)


Pclass
3     7.379203
2    11.663652
1    34.661682
Name: Fare, dtype: float64


### 5.7 Parch and SibSp

In [31]:
all_df['Family_Size'] = all_df['Parch'] + all_df['SibSp'] + 1
print(all_df['Family_Size'].groupby(by = all_df['Family_Size']).count().sort_values(ascending = False))

all_df['Family_Size_Category'] = all_df['Family_Size'].map(family_size_category)

print(all_df['Family_Size_Category'].groupby(by = all_df['Family_Size_Category']).count().sort_values(ascending = False))
print(all_df.groupby(['Survived', 'Family_Size_Category'])['Survived'].count())

le_family = preprocessing.LabelEncoder()
le_family.fit(np.array(['Single', 'Small_Family', 'Large_Family']))
all_df['Family_Size_Category'] = le_family.transform(all_df['Family_Size_Category'])

fam_size_cat_dummies_df = pd.get_dummies(all_df['Family_Size_Category'],
                                prefix = all_df[['Family_Size_Category']].columns[0])
all_df = pd.concat([all_df, fam_size_cat_dummies_df], axis = 1)

Family_Size
1     790
2     235
3     159
4      43
6      25
5      22
7      16
11     11
8       8
Name: Family_Size, dtype: int64
Family_Size_Category
Single          790
Small_Family    394
Large_Family    125
Name: Family_Size_Category, dtype: int64
Survived  Family_Size_Category
0         Large_Family             94
          Single                  627
          Small_Family            246
1         Large_Family             31
          Single                  163
          Small_Family            148
Name: Survived, dtype: int64


### 5.8 Age

In [32]:
print(all_df['Age'].groupby(by = all_df['Title']).mean().sort_values(ascending = True))

Title
Master      5.482642
Miss       21.795236
Mr         32.244845
Mrs        36.918129
Royalty    41.166667
Officer    46.272727
Name: Age, dtype: float64


In [33]:
#flag the age null to 1 and not null in 0 
all_df['Age_Null'] = all_df['Age'].apply(lambda x: 1 if(pd.notnull(x)) else 0)

#prepare the dataframe for training 
missing_age_df = pd.DataFrame(all_df[['Age', 'Parch', 'Sex', 'SibSp', 'Family_Size', 'Family_Size_Category', 'Title', 'Fare', 'Fare_Category', 'Pclass', 'Embarked']])
missing_age_df = pd.get_dummies(missing_age_df, columns = ['Title', 'Family_Size_Category', 'Fare_Category', 'Sex', 'Pclass', 'Embarked'])
missing_age_df.shape
missing_age_df.info()

missing_age_train = missing_age_df[missing_age_df['Age'].notnull()]
missing_age_test  = missing_age_df[missing_age_df['Age'].isnull()]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 27 columns):
Age                       1046 non-null float64
Parch                     1309 non-null int64
SibSp                     1309 non-null int64
Family_Size               1309 non-null int64
Fare                      1309 non-null float64
Title_Master              1309 non-null uint8
Title_Miss                1309 non-null uint8
Title_Mr                  1309 non-null uint8
Title_Mrs                 1309 non-null uint8
Title_Officer             1309 non-null uint8
Title_Royalty             1309 non-null uint8
Family_Size_Category_0    1309 non-null uint8
Family_Size_Category_1    1309 non-null uint8
Family_Size_Category_2    1309 non-null uint8
Fare_Category_0           1309 non-null uint8
Fare_Category_1           1309 non-null uint8
Fare_Category_2           1309 non-null uint8
Fare_Category_3           1309 non-null uint8
Fare_Category_4           1309 non-null uint8
Sex_female      

In [34]:
#use the before function to fill the missing Age 
all_df.loc[(all_df.Age.isnull()), 'Age']= fill_missing_age(missing_age_train, missing_age_test)


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=25)]: Done   5 out of  10 | elapsed:    1.3s remaining:    1.3s
[Parallel(n_jobs=25)]: Done  10 out of  10 | elapsed:    1.4s finished


Age feature Best GB Params: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 3, 'n_estimators': 2000}
Age feature Best GB Score: -112.449390637
GB Train Error for 'Age' Feature Regressor: -91.3369631585
5     33.546420
17    33.246757
19    33.463058
26    26.526942
Name: Age_GB, dtype: float64
Fitting 10 folds for each of 1 candidates, totalling 10 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Age feature Best LR Params: {'fit_intercept': True, 'normalize': True}
Age feature Best LR Score: -120.164070814
LR Train Error for 'Age' Feature Regressor: -114.436752269
5     34.50000
17    33.59375
19    31.12500
26    26.84375
Name: Age_LRF, dtype: float64
5     34.023210
17    33.420254
19    32.294029
26    26.685346
Name: Age, dtype: float64


[Parallel(n_jobs=25)]: Done   5 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=25)]: Done  10 out of  10 | elapsed:    0.1s finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [35]:
print(all_df['Age'].groupby(by = all_df['Title']).mean().sort_values(ascending = True))

Title
Master      5.438024
Miss       21.694561
Mr         32.148151
Mrs        36.837474
Royalty    41.166667
Officer    46.465499
Name: Age, dtype: float64


In [36]:
all_df['Age_Category'] = all_df['Age'].map(age_group_cat)
le_age = preprocessing.LabelEncoder()
le_age.fit(np.array(['Baby', 'Toddler', 'Child', 'Teenager', 'Adult', 'Middle_Aged', 'Senior_Citizen', 'Old']))
all_df['Age_Category'] = le_age.transform(all_df['Age_Category'])

age_cat_dummies_df = pd.get_dummies(all_df['Age_Category'],
                                prefix = all_df[['Age_Category']].columns[0])
all_df = pd.concat([all_df, age_cat_dummies_df], axis = 1)

### 5.9 Ticket & Cabin

In [43]:
all_df['Ticket_Letter'] = all_df['Ticket'].str.split().str[0]
all_df['Ticket_Letter'] = all_df['Ticket_Letter'].apply(lambda x: np.NaN if x.isnumeric() else x)
all_df['Ticket_Number'] = all_df['Ticket'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
all_df['Ticket_Number'].fillna(0, inplace = True)
all_df = pd.get_dummies(all_df, columns = ['Ticket', 'Ticket_Letter'])
all_df.shape

(1309, 1031)

In [44]:
all_df['Cabin_Letter'] = all_df['Cabin'].apply(lambda x: str(x)[0]  if(pd.notnull(x)) else x)
all_df = pd.get_dummies(all_df, columns = ['Cabin', 'Cabin_Letter'])
all_df.shape

(1309, 1224)

### 5.10 Normalize Age and Fare

In [47]:
all_df[['Age', 'Fare']][:5]

Unnamed: 0,Age,Fare
0,22.0,7.25
1,38.0,35.64165
2,26.0,7.925
3,35.0,26.55
4,35.0,8.05


In [48]:
scale_age_fare = preprocessing.StandardScaler().fit(all_df[['Age', 'Fare']])
all_df[['Age', 'Fare']] = scale_age_fare.transform(all_df[['Age', 'Fare']])

In [52]:
all_df[['Age', 'Fare']].mean()

Age    -2.705586e-16
Fare   -7.667239e-17
dtype: float64

### 5.11 Drop columns 

In [53]:
all_df.drop(['Name', 'PassengerId', 'Embarked', 'Sex', 'Title', 'Fare_Category', 'Family_Size_Category',
               'Age_Category', 'Name_Length_Category'], 
              axis = 1, inplace = True)

In [54]:
train_data = all_df[:891]
test_data = all_df[891:]

titanic_train_data_X = train_data.drop(['Survived'], axis = 1)
titanic_train_data_y = train_data['Survived']

titanic_test_data_X = test_data.drop(['Survived'], axis = 1)

In [55]:
titanic_test_data_X.shape

(418, 1214)

## 6. Choose Top features

In [59]:
features_to_pick = 200
features_top_n = get_top_n_features(titanic_train_data_X, titanic_train_data_y, features_to_pick)

print("Total Features: " + str(all_df.shape))
print("Picked Features: " + str(features_top_n.shape))

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=25)]: Done  13 out of  20 | elapsed:   10.5s remaining:    5.6s
[Parallel(n_jobs=25)]: Done  20 out of  20 | elapsed:   10.5s finished


Top N Features Best RF Params: {'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 500}
Top N Features Best RF Score: 0.83164983165
Top N Features RF Train Error: 0.94051627385
Sample 25 Features from RF Classifier:
8                 Sex_female
9                   Sex_male
12                  Title_Mr
1                       Fare
16               Name_Length
0                        Age
40             Ticket_Number
11                Title_Miss
26      Pclass_Fare_Category
13                 Title_Mrs
3                     Pclass
27               Family_Size
22           Fare_Category_1
4                      SibSp
30    Family_Size_Category_2
2                      Parch
18    Name_Length_Category_1
7                 Embarked_S
5                 Embarked_C
17    Name_Length_Category_0
19    Name_Length_Category_2
29    Family_Size_Category_1
20    Name_Length_Category_4
23           Fare_Category_2
32            Age_Category_0
Name: feature, dtype: object
Fitting 10 folds for eac

[Parallel(n_jobs=25)]: Done  13 out of  20 | elapsed:   19.7s remaining:   10.6s
[Parallel(n_jobs=25)]: Done  20 out of  20 | elapsed:   19.7s finished


Top N Features Best Ada Params: {'learning_rate': 0.5, 'n_estimators': 500}
Top N Features Best Ada Score: 0.852974186308
Top N Features Ada Train Error: 0.998877665544
Sample 25 Features from Ada Classifier:
40             Ticket_Number
0                        Age
1                       Fare
9                   Sex_male
8                 Sex_female
16               Name_Length
10              Title_Master
145              Ticket_1601
12                  Title_Mr
1099           Cabin_C22 C26
1012    Ticket_Letter_STON/O
1016     Ticket_Letter_W./C.
335              Ticket_2699
828              Ticket_LINE
27               Family_Size
495            Ticket_347077
1209          Cabin_Letter_D
976       Ticket_Letter_A/5.
7                 Embarked_S
1208          Cabin_Letter_C
989       Ticket_Letter_LINE
772        Ticket_A/5. 10482
110             Ticket_11668
23           Fare_Category_2
26      Pclass_Fare_Category
Name: feature, dtype: object
Fitting 10 folds for each of 2 candid

[Parallel(n_jobs=25)]: Done  13 out of  20 | elapsed:    9.4s remaining:    5.0s
[Parallel(n_jobs=25)]: Done  20 out of  20 | elapsed:    9.4s finished


Top N Features Best ET Params: {'max_depth': 15, 'min_samples_split': 4, 'n_estimators': 500}
Top N Features Best ET Score: 0.836139169473
Top N Features ET Train Error: 0.920314253648
Sample 25 Features from ET Classifier:
12                    Title_Mr
8                   Sex_female
9                     Sex_male
11                  Title_Miss
3                       Pclass
13                   Title_Mrs
22             Fare_Category_1
26        Pclass_Fare_Category
30      Family_Size_Category_2
16                 Name_Length
17      Name_Length_Category_0
1                         Fare
23             Fare_Category_2
18      Name_Length_Category_1
29      Family_Size_Category_1
27                 Family_Size
7                   Embarked_S
20      Name_Length_Category_4
0                          Age
28      Family_Size_Category_0
4                        SibSp
1207            Cabin_Letter_B
5                   Embarked_C
10                Title_Master
1210            Cabin_Letter_E
N

In [67]:
titanic_train_data_X = titanic_train_data_X[features_top_n]
print(titanic_train_data_X.shape)
print(titanic_train_data_X.info())

(891, 286)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Columns: 286 entries, Sex_female to Ticket_S.O./P.P. 3
dtypes: float64(4), int64(6), uint8(276)
memory usage: 316.7 KB
None


In [61]:
titanic_test_data_X = titanic_test_data_X[features_top_n]
titanic_test_data_X.shape
titanic_test_data_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Columns: 286 entries, Sex_female to Ticket_S.O./P.P. 3
dtypes: float64(4), int64(6), uint8(276)
memory usage: 148.6 KB


In [103]:
titanic_train_data = pd.concat([titanic_train_data_X, titanic_train_data_y], axis = 1)

In [107]:
titanic_train_data_X.shape

(891, 286)

In [104]:
titanic_train_data.shape

(891, 287)

In [108]:
titanic_test_data_X.to_csv("data/topfeature_test.csv", index=False)
titanic_train_data.to_csv("data/topfeature_train.csv", index=False)