In [1]:
from sklearn.experimental import enable_iterative_imputer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import copy

In [2]:
d_tr = pd.read_csv('/Users/antirrabia/Documents/01-GitHub/DataMining-_-/CSV/Titanic/train.csv', index_col='PassengerId')
d_te = pd.read_csv('/Users/antirrabia/Documents/01-GitHub/DataMining-_-/CSV/Titanic/test.csv', index_col=0)

In [3]:
X = d_tr.drop(columns='Survived').copy()
y = d_tr.Survived.copy().astype('int') #To avoid getting an error that 'y' is unknown

> Cleaning and encoding __X__ precerving **nan**

In [7]:
X = preprocesor.fit_transform(X)

In [8]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, test_size=0.16)

# == -- GradientBoostingClassifier --==

In [23]:
model = GradientBoostingClassifier()

# imputer = KNNImputer()
imputer = IterativeImputer(max_iter=20) #default is 10 iteractions

param_grid = [{ #regular grid search took about 1 ahour with KNNImputer
            'model__n_estimators': [10, 100, 1000],
            'model__learning_rate': [0.001, 0.01, 0.1],
            'model__subsample': [0.5, 0.7, 1.0],
            'model__max_depth': [3, 7, 9],
    
            'impute__imputation_order': ['ascending', 'descending', 'roman', 'arabic', 'random']
#             'impute__n_neighbors': [1,2,3,5,7,11,13,17,19]
              }]

In [24]:
classifier = Pipeline([
                       ('impute', imputer),
                       ('model', model)
                      ])

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)

gs = GridSearchCV(classifier, param_grid=param_grid, cv=cv, n_jobs=-1, scoring='accuracy', verbose=1)

# rs = RandomizedSearchCV(classifier, param_distributions=param_grid, cv=cv, n_jobs=-1, 
#                         scoring='accuracy', n_iter=50, verbose=1)


In [25]:
gs.fit(X_tr, y_tr)

Fitting 30 folds for each of 405 candidates, totalling 12150 fits


GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=None),
             estimator=Pipeline(steps=[('impute',
                                        IterativeImputer(max_iter=20)),
                                       ('model',
                                        GradientBoostingClassifier())]),
             n_jobs=-1,
             param_grid=[{'impute__imputation_order': ['ascending',
                                                       'descending', 'roman',
                                                       'arabic', 'random'],
                          'model__learning_rate': [0.001, 0.01, 0.1],
                          'model__max_depth': [3, 7, 9],
                          'model__n_estimators': [10, 100, 1000],
                          'model__subsample': [0.5, 0.7, 1.0]}],
             scoring='accuracy', verbose=1)

# KNN

In [22]:
# GridSearch 21870 fit about 80 minutes
print('Training score: {:.4f}'.format(gs.best_score_))
print('Test score    : {:.4f}'.format(accuracy_score(y_te, gs.predict(X_te))))
gs.best_params_

Training score: 0.8119
Test score    : 0.7762


{'impute__n_neighbors': 1,
 'model__learning_rate': 0.1,
 'model__max_depth': 3,
 'model__n_estimators': 1000,
 'model__subsample': 1.0}

# Iteractive

In [26]:
# around 50 minutes and get the same result of RandomizedSearchCV= n_iter=50 (20 minutes)
# GridSearch 12150 fit
print('Training score: {:.4f}'.format(gs.best_score_))
print('Test score    : {:.4f}'.format(accuracy_score(y_te, gs.predict(X_te))))
gs.best_params_

Training score: 0.8093
Test score    : 0.7832


{'impute__imputation_order': 'ascending',
 'model__learning_rate': 0.01,
 'model__max_depth': 9,
 'model__n_estimators': 1000,
 'model__subsample': 0.7}

> __Custom Functions__

In [4]:
def isCabinNan(df):
    ''' Recive a data frame'''
           
    df = copy.deepcopy(df)
    
    s = pd.DataFrame(df['Cabin'].notnull().astype('int'), columns=['Cabin'])
    
    return s

def encodeFare(df):
    ''' '''
    
    s = copy.deepcopy(df)
    
    s[ (s <= 7.91) ] = 0
    s[(s > 7.91) & (s <= 14.454)] = 1
    s[(s > 14.454) & (s <= 31)]   = 2
    s[ (s > 31) ] = 3
    
    return s

def encodeAge(df):                       
    ''' Recive a pd Data Frame '''        
                                        
    
    s = copy.deepcopy(df)
    
    s.loc[(s.Age <= 13), 'Age'] = 1
    s.loc[(s.Age > 13) & (s.Age <= 18), 'Age'] = 2
    s.loc[(s.Age > 18) & (s.Age <= 30), 'Age'] = 3
    s.loc[(s.Age > 30) & (s.Age <= 50), 'Age'] = 4
    s.loc[(s.Age > 50), 'Age'] = 5
    
#     s[(s <= 13)] = 1
#     s[(s > 13) & (s <= 18)] = 2
#     s[(s > 18) & (s <= 30)] = 3
#     s[(s > 30) & (s <= 50)] = 4
#     s[(s > 50)] = 5
    
    return s

def familySize(df):
    ''' Recive a data frama'''
    
    s = np.zeros( (len(df.index), 1) )
    
    # if overall members family are equeal to 1 mean they ar alone
    s[ ( (df['SibSp'] +  df['Parch'] + 1) == 1 ) ] = 1
    
    return s

def getTitles(df):
    ''' Recive a data frame'''
    
    s = copy.deepcopy(df)
    
    s['Title'] = s['Name'].str.extract(' ([A-Za-z]+)\.')
    s['Title'] = s['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr',                                       
                                    'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
    s['Title'] = s['Title'].replace(['Mlle', 'Ms'], 'Miss')
    s['Title'] = s['Title'].replace(['Mme'], 'Mrs')
    
#     resulting titles = 'Mr', 'Miss', 'Mrs', 'Master', 'Other'
    
    s = s.drop(columns='Name').copy()
    
    return s
    
def encodeSex(df):
    ''' Recive a data frame'''
    
    s = copy.deepcopy(df)
    oh = OneHotEncoder(handle_unknown='ignore', sparse=False)
    
    return oh.fit_transform(s)


isCabinNan_func = FunctionTransformer(isCabinNan, validate=False)
getTitles_func = FunctionTransformer(getTitles, validate=False)
fare_func = FunctionTransformer(encodeFare, validate=False)
age_func = FunctionTransformer(encodeAge, validate=False)
familySize_func = FunctionTransformer(familySize, validate=False)
codeSex_func = FunctionTransformer(encodeSex, validate=False)



> Transformers

In [5]:
## Age
age_trans = ColumnTransformer([
                                ('age', age_func, ['Age']) 
                               ])

## Embarked
embar_trans = ColumnTransformer([             # this returns a np.array
                                  ('embarked', OrdinalEncoder(), ['Embarked'])
                                 ])

## Fare
fare_trans = ColumnTransformer([
                                ('encode', fare_func, ['Fare'])
                               ])

## 'SibSp', 'Parch' -> IsAlone
familySize_trans = ColumnTransformer([
                                      ('size', familySize_func, ['SibSp', 'Parch'])
                                     ])

## Sex
sex_trans = ColumnTransformer([
                               ('sex', OneHotEncoder(handle_unknown='ignore', sparse=False), ['Sex']) #creo puedo usar directo oneHot encoder
                              ])                              #OneHotEncoder(handle_unknown='ignore', sparse=False)

## names -> Titles
getTitles_trans = ColumnTransformer([
                                     ('titles', getTitles_func, ['Name'])
                                    ])

encodeTiles_trans = Pipeline([
                              ('title_trans', getTitles_trans), 
                              ('oh', OneHotEncoder(handle_unknown='ignore', sparse=False)) #if sparce = true
                             ])                                             # returns scipy.sparse.csr.csr_matrix

## Cabin 
isCabinNan_trans = ColumnTransformer([ 
                                      ('cabin', isCabinNan_func, ['Cabin'])
                                     ])




> Preprocesor

In [6]:
preprocesor = FeatureUnion([                                            
                            ('age_t', age_trans),        # add 1 colunm  
                            ('embarked_t', embar_trans), # add 1 colunm 
                            ('fare', fare_trans),        # add 1 colunm 
                            ('size', familySize_trans),  # add 1 colunm
                            ('sex', sex_trans),          # add 2 colunm
                            ('title', encodeTiles_trans),# add 5 colunm
                            ('cabin', isCabinNan_trans)  # add 1 colunm
                           ])

# to test diferent parts of the code

In [31]:
data = preprocesor.fit_transform(d_tr)
print(type(data))
data[814:834]

<class 'numpy.ndarray'>


array([[ 4.,  2.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.],
       [nan,  2.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.],
       [ 3.,  2.,  1.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 4.,  0.,  3.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 4.,  2.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 1.,  2.,  2.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 5.,  2.,  3.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.],
       [ 3.,  2.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 4.,  2.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 3.,  2.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.],
       [ 1.,  2.,  3.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.],
       [nan,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.],
       [nan,  2.,  3.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  3.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.],
       [nan,  1.,  0.,  1.,  0.,  

In [17]:
d_tr[d_tr.Embarked.isna()]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,
