In [28]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import copy

In [2]:
d_tr = pd.read_csv('/Users/antirrabia/Documents/01-GitHub/DataMining-_-/CSV/Titanic/train.csv', index_col=0)
d_te = pd.read_csv('/Users/antirrabia/Documents/01-GitHub/DataMining-_-/CSV/Titanic/test.csv', index_col=0)

### X, y

In [3]:
X = d_tr.drop(columns=['Survived', 'Ticket', 'Cabin']).copy()
y = d_tr.Survived.copy().astype('int')

### Columns to work with

In [5]:
X.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

# Transformers

In [6]:
## Age
impute_age = ColumnTransformer([
                                ('age', SimpleImputer(strategy='mean'), ['Age'])
                               ])

age_trans = Pipeline([
                      ('impute', impute_age), 
                      ('encode', age_func)
                     ])

## Embarked
impute_embar = ColumnTransformer([
                                  ('embarked', SimpleImputer(strategy='most_frequent'), ['Embarked'])
                                 ])

embar_trans = Pipeline([
                        ('impute', impute_embar), 
                        ('encode', OrdinalEncoder()) # try su use ordinalEncode directly
                       ])

## Fare
fare_trans = ColumnTransformer([
                                ('encode', fare_func, ['Fare'])
                               ])

## 'SibSp', 'Parch' -> IsAlone
familySize_trans = ColumnTransformer([
                                      ('size', familySize_func, ['SibSp', 'Parch'])
                                     ])

## Sex
sex_trans = ColumnTransformer([
                               ('sex', codeSex_func, ['Sex'])
                              ])

## names -> Titles
getTitles_trans = ColumnTransformer([
                                     ('titles', getTitles_func, ['Name'])
                                    ])

encodeTiles_trans = Pipeline([
                              ('title_trans', getTitles_trans), 
                              ('oh', OneHotEncoder(handle_unknown='ignore'))
                             ])

In [7]:
preprocesor = FeatureUnion([
                            ('age_t', age_trans),        # add 1 colunm
                            ('embarked_t', embar_trans), # add 1 colunm
                            ('fare', fare_trans),        # add 1 colunm
                            ('size', familySize_trans),  # add 1 colunm
                            ('sex', sex_trans),          # add 2 colunm
                            ('title', encodeTiles_trans) # add 5 colunm
                           ])

In [76]:
X_trans = preprocesor.fit_transform(X.copy())
X_trans.shape

(891, 5)

# Tunning

# ⚡️ GradientBoostingClassifier

In [16]:
classifier = Pipeline([
                       ('pre', preprocesor), 
                       ('model', GradientBoostingClassifier())
                      ])

In [25]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)

param_grid = [{ #regular grid search took about 1 ahour with KNNImputer
                'model__n_estimators': [10, 100, 1000],
                'model__learning_rate': [0.001, 0.01, 0.1],
                'model__subsample': [0.5, 0.7, 1.0],
                'model__max_depth': [3, 7, 9],
    
#            'strategy for numrical': ['mean', 'median', 'constant', 'most_frequent']
                'pre__age_t__impute__age__strategy': ['mean', 'median', 'constant', 'most_frequent'],#
                'pre__embarked_t__impute__embarked__strategy': ['most_frequent'],
              }]

In [28]:
gs.best_score_

0.7998543487307531

In [30]:
gs.best_params_

{'model__learning_rate': 0.01,
 'model__max_depth': 7,
 'model__n_estimators': 100,
 'model__subsample': 0.5,
 'pre__age_t__impute__age__strategy': 'constant',
 'pre__embarked_t__impute__embarked__strategy': 'most_frequent'}

# ⚡️ BaggingClassifier

In [12]:
model = BaggingClassifier(base_estimator=DecisionTreeClassifier())

param_grid = [{#  *** first change parameters in base_estimator ***
               'model__base_estimator__max_leaf_nodes': [10, 20, 30],
               'model__base_estimator__criterion': ["gini", "entropy"],
    
               'model__n_estimators': [10, 100, 1000],
               
               'pre__age_t__impute__age__strategy': ['mean', 'median', 'constant', 'most_frequent'],#
               'pre__embarked_t__impute__embarked__strategy': ['most_frequent'],
               }]

In [18]:
gs.best_score_

0.7986766541822721

In [17]:
gs.best_params_

{'model__base_estimator__criterion': 'gini',
 'model__base_estimator__max_leaf_nodes': 20,
 'model__n_estimators': 100,
 'pre__age_t__impute__age__strategy': 'constant',
 'pre__embarked_t__impute__embarked__strategy': 'most_frequent'}

# ⚡️  RandomForestClassifier

In [22]:
model = RandomForestClassifier(n_jobs=-1)

param_grid = [{ # 216 iter in a regular grid search
                'model__criterion' : ["gini", "entropy"],
                'model__n_estimators': [10, 100, 1000],
                'model__max_features': ['sqrt', 'log2'],
                'model__class_weight' : [ None, "balanced", "balanced_subsample"],
    
    
               'pre__age_t__impute__age__strategy': ['mean', 'median', 'constant', 'most_frequent'],
               'pre__embarked_t__impute__embarked__strategy': ['most_frequent'],
               }] 

In [25]:
gs.best_score_

0.7870994590095712

In [26]:
gs.best_params_

{'model__class_weight': None,
 'model__criterion': 'gini',
 'model__max_features': 'log2',
 'model__n_estimators': 1000,
 'pre__age_t__impute__age__strategy': 'median',
 'pre__embarked_t__impute__embarked__strategy': 'most_frequent'}

# ⚡️ LogisticRegression

In [29]:
model = LogisticRegression(max_iter=10000)

param_grid = [{ 
              'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
              'model__penalty': ['l2'],
              'model__C': [0.01, 0.1, 1, 10, 100],
              'model__class_weight' : ['balanced', None],
    
    
              'pre__age_t__impute__age__strategy': ['mean', 'median', 'constant', 'most_frequent'],
              'pre__embarked_t__impute__embarked__strategy': ['most_frequent'],
              
              }]

In [32]:
gs.best_score_

0.7886516853932585

In [33]:
gs.best_params_

{'model__C': 0.01,
 'model__class_weight': None,
 'model__penalty': 'l2',
 'model__solver': 'liblinear',
 'pre__age_t__impute__age__strategy': 'constant',
 'pre__embarked_t__impute__embarked__strategy': 'most_frequent'}

# =============------================

# ⚡️💡  Rating    💡⚡️

In [30]:
classifier = Pipeline([
                       ('pre', preprocesor), 
                       ('model', model)
                      ])

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
gs = GridSearchCV(classifier, param_grid=param_grid, cv=cv, n_jobs=-1, scoring='accuracy', verbose=1)

In [31]:
gs.fit(X, y)

Fitting 30 folds for each of 200 candidates, totalling 6000 fits


GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=None),
             estimator=Pipeline(steps=[('pre',
                                        FeatureUnion(transformer_list=[('age_t',
                                                                        Pipeline(steps=[('impute',
                                                                                         ColumnTransformer(transformers=[('age',
                                                                                                                          SimpleImputer(),
                                                                                                                          ['Age'])])),
                                                                                        ('encode',
                                                                                         FunctionTransformer(func=<function encodeAge at 0x7f835f664af0>))])),
                          

# Tools

In [5]:

def encodeFare(df):
    ''' '''
    
    s = copy.deepcopy(df)
    
    s[ (s <= 7.91) ] = 0
    s[(s > 7.91) & (s <= 14.454)] = 1
    s[(s > 14.454) & (s <= 31)]   = 2
    s[ (s > 31) ] = 3
    
    return s

def encodeAge(df): #if it uses 2 times it will set all values to 1.0
            #recive a copy
            #does not touch nan values
    
    s = copy.deepcopy(df)
    s[(s <= 13)] = 1
    s[(s > 13) & (s <= 18)] = 2
    s[(s > 18) & (s <= 30)] = 3
    s[(s > 30) & (s <= 50)] = 4
    s[(s > 50)] = 5
    
    return s

def familySize(df):
    
    s = np.zeros( (len(df.index), 1) )
    
    # if overall members family are equeal to 1 mean they ar alone
    s[ ( (df['SibSp'] +  df['Parch'] + 1) == 1 ) ] = 1
    
    
    #s['IsAlone'] = 0
    #s.loc[s['FamilySize'] == 1, 'IsAlone'] = 1
    
    return s#.drop(columns=['SibSp', 'Parch', 'FamilySize']).copy()


def getTitles(df):
    ''' '''
    
    s = copy.deepcopy(df)
#     print(s.sample(7))
#     assert 0
    s['Title'] = s['Name'].str.extract(' ([A-Za-z]+)\.')
    s['Title'] = s['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr',                                       
                                    'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
    s['Title'] = s['Title'].replace(['Mlle', 'Ms'], 'Miss')
    s['Title'] = s['Title'].replace(['Mme'], 'Mrs')
    
#     resulting titles = 'Mr', 'Miss', 'Mrs', 'Master', 'Other'
    
#     print(s.sample(7))
#     print(s.Title.value_counts())

    s = s.drop(columns='Name').copy()
    
#     print(s.value_counts())
#     assert 0

    return s

# def encodeTitles(df):
#     ''' '''
    
#     s = copy.deepcopy(df)
    
#     print(s.shape)
    
    
#     oh2 = OneHotEncoder(handle_unknown='ignore', sparse=False)
    
#     s = oh2.fit_transform(s)
    
# #     np_a2 = np.sum(s)
# #     print(np.isnan(np_a2))
    
# #     print(type(s))
# #     print(s.shape)
# #     assert 0
    
#     return s
    

def encodeEmbarked(df):
    '''   '''
    
    s = copy.deepcopy(df)
    
    ordinal = OrdinalEncoder()
    
    return ordinal.fit_transform(s)

def encodeSex(df):
    ''' '''
    
    s = copy.deepcopy(df)
    oh = OneHotEncoder(handle_unknown='ignore', sparse=False)
    
    return oh.fit_transform(s)


embarked_func = FunctionTransformer(encodeEmbarked, validate=False)

getTitles_func = FunctionTransformer(getTitles, validate=False)
# encodeTitles_func = FunctionTransformer(encodeTitles, validate=False)

fare_func = FunctionTransformer(encodeFare, validate=False)
age_func = FunctionTransformer(encodeAge, validate=False)
familySize_func = FunctionTransformer(familySize, validate=False)
codeSex_func = FunctionTransformer(encodeSex, validate=False)


