In [29]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import copy

In [2]:
# Directory that holds the Titanic CSV files. Kept in a single constant so the
# notebook can be pointed at another location by editing one line.
DATA_DIR = '/Users/antirrabia/Documents/01-GitHub/DataMining-_-/CSV/Titanic'

# index_col=0: the first CSV column becomes the index of each frame.
d_tr = pd.read_csv(f'{DATA_DIR}/train.csv', index_col=0)
d_te = pd.read_csv(f'{DATA_DIR}/test.csv', index_col=0)

In [3]:
# Inspect the column names available in the training frame.
d_tr.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [10]:
# Feature frame: the training data without the target ('Survived') and without
# the 'Ticket' and 'Cabin' columns, which this notebook does not use.
X = d_tr.drop(['Survived', 'Ticket', 'Cabin'], axis='columns').copy()

# Target vector, copied so later edits cannot write back into d_tr.
y = d_tr['Survived'].copy()

In [48]:
# NOTE: age_func, fare_func and familySize_func are created in a later cell of
# this notebook (the helper-function cell). On a fresh kernel that cell has to
# be executed before this one.

## Age: fill missing ages with the column mean, then bucket into ordinal bands.
impute_age = ColumnTransformer([('age', SimpleImputer(strategy='mean'), ['Age'])])

age_trans = Pipeline([('impute', impute_age),
                      ('encode', age_func)
                     ])

## Embarked: fill missing values with the most frequent one, then ordinal-encode.
impute_embar = ColumnTransformer([('embarked', SimpleImputer(strategy='most_frequent'), ['Embarked'])])

# OrdinalEncoder is used directly as the pipeline step (rather than through a
# stateless FunctionTransformer that fits a new encoder on every call), so the
# category -> code mapping is learned once in fit() and reused unchanged in
# transform() on validation / test data.
embar_trans = Pipeline([('impute', impute_embar),
                        ('encode', OrdinalEncoder())
                       ])

## Fare: bucket into ordinal bands (no imputation step on this branch).
fare_trans = ColumnTransformer([('encode', fare_func, ['Fare'])])

## 'SibSp', 'Parch' to IsAlone
familySize_trans = ColumnTransformer([('size', familySize_func, ['SibSp', 'Parch'])])

In [49]:
# Run the four per-feature branches side by side and concatenate their outputs
# column-wise, in the order listed here: age, embarked, fare, family size.
preprocesor = FeatureUnion(
    transformer_list=[
        ('age_', age_trans),
        ('embarked_', embar_trans),
        ('fare', fare_trans),
        ('size', familySize_trans),
    ]
)

In [50]:
# Smoke-test the preprocessing on a copy of X. Output columns follow the
# FeatureUnion order: age band, embarked code, fare band, is-alone flag.
preprocesor.fit_transform(X.copy())

array([[3., 2., 0., 0.],
       [4., 0., 3., 0.],
       [3., 2., 1., 1.],
       ...,
       [3., 2., 2., 0.],
       [3., 0., 2., 1.],
       [4., 1., 0., 1.]])

In [None]:
# Fixed seed so the repeated stratified splits (and therefore the grid-search
# scores) are reproducible across kernel restarts.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# NOTE(review): the step names used in the keys below ('model', 'imputer',
# 'num_imputer', 'cab_imputer') must match the pipeline handed to the grid
# search; that pipeline is not defined in the cells shown here -- confirm.
param_grid = [{  # a regular grid search took about 1 hour with KNNImputer
    'model__n_estimators': [10, 100, 1000],
    'model__learning_rate': [0.001, 0.01, 0.1],
    'model__subsample': [0.5, 0.7, 1.0],
    'model__max_depth': [3, 7, 9],

    # 'imputer__imputer__strategy': ['mean', 'median', 'constant', 'most_frequent']
    'imputer__num_imputer__strategy': ['mean'],  # other options: median, constant, most_frequent

    # This key used to be listed twice in the literal. Python keeps only the
    # last occurrence of a duplicated dict key, so ['most_frequent'] was the
    # value actually searched. Add 'constant' to this list to search both.
    'imputer__cab_imputer__strategy': ['most_frequent'],
}]

In [None]:
# pipeline = Pipeline([('clean_d', cleanData)], verbose=True)

# Full classifier pipeline, kept for reference while the stages are reworked:
# classifier = Pipeline([('imputer', impute),
#                      ('embarked', codeEmbarked_toPipeline_trans),
#                      ("titles", getTitles_trans),
#                      ("fare", fare_trans),
#                      ('age', age_trans),
#                      ('family', familySize_trans),
#                      ('oneHot', sexTitle_to_OneHot_trans),
#                      ('model', model)])

# NOTE(review): `impute` and `codeEmbarked_trans` are not defined in the cells
# shown here -- confirm they exist before running this cell.
cleaner = Pipeline([
    ('imputer', impute),
    # The comma after this step was missing, which made Python try to call the
    # tuple ("TypeError: 'tuple' object is not callable").
    ('to_df', toDF_trans),
    ('embarked', codeEmbarked_trans),  # for cleaning: codeEmbarked_trans
    ('titles', getTitles_trans),
    # Stages not wired in yet:
    # ('fare', fare_trans),
    # ('age', age_trans),
    # ('family', familySize_trans),
    # ('oneHot', sexTitle_to_OneHot_trans),
])

In [47]:

def encodeFare(df):
    '''Bucket fare values into four ordinal bands and return a modified copy.

    Bands: ``<= 7.91`` -> 0, ``(7.91, 14.454]`` -> 1, ``(14.454, 31]`` -> 2,
    ``> 31`` -> 3. Entries that match none of the comparisons (e.g. NaN) are
    left as they are. The input object itself is not modified.

    Parameters
    ----------
    df : array-like supporting element-wise comparison and boolean-mask
        assignment (the notebook passes a one-column DataFrame).

    Returns
    -------
    A deep copy of ``df`` with every matched value replaced by its band code.
    '''
    banded = copy.deepcopy(df)

    # The bands are disjoint and none of the codes (0..3) falls into a later
    # band, so every mask can be taken from the untouched values up front.
    band_masks = (
        (banded <= 7.91, 0),
        ((banded > 7.91) & (banded <= 14.454), 1),
        ((banded > 14.454) & (banded <= 31), 2),
        (banded > 31, 3),
    )
    for mask, code in band_masks:
        banded[mask] = code

    return banded

def age(df):
    '''Bucket ages into five ordinal bands and return a modified copy.

    Bands: ``<= 13`` -> 1, ``(13, 18]`` -> 2, ``(18, 30]`` -> 3,
    ``(30, 50]`` -> 4, ``> 50`` -> 5.

    Notes
    -----
    * Works on a deep copy; the input object is left untouched.
    * NaN values match none of the comparisons and therefore stay NaN.
    * Not idempotent: the band codes are themselves ``<= 13``, so applying the
      function a second time maps every non-NaN value to 1.
    '''
    banded = copy.deepcopy(df)

    # Bands are disjoint and all codes are <= 13, so they can never fall into a
    # later band; the masks are therefore computed once from the raw values.
    band_masks = (
        (banded <= 13, 1),
        ((banded > 13) & (banded <= 18), 2),
        ((banded > 18) & (banded <= 30), 3),
        ((banded > 30) & (banded <= 50), 4),
        (banded > 50, 5),
    )
    for mask, code in band_masks:
        banded[mask] = code

    return banded

def familySize(df):
    '''Return an ``(n, 1)`` float array flagging rows whose family size is 1.

    The family size is computed as ``SibSp + Parch + 1``. A row gets 1.0 when
    that sum equals 1 (the "IsAlone" case) and 0.0 in every other case.

    Parameters
    ----------
    df : DataFrame with 'SibSp' and 'Parch' columns.
    '''
    party_size = df['SibSp'] + df['Parch'] + 1

    # If the sum equals 1 put 1, otherwise put zero.
    is_alone = np.asarray(party_size == 1)

    return np.where(is_alone, 1.0, 0.0).reshape(-1, 1)


def getTitles(df):
    '''Replace the 'Name' column with a normalised 'Title' column.

    The title is the first run of letters in 'Name' that is preceded by a
    space and followed by a dot (e.g. ``'Braund, Mr. Owen'`` -> ``'Mr'``).
    Titles are then collapsed as follows:

    * Lady, Countess, Capt, Col, Don, Dr, Major, Rev, Sir, Jonkheer, Dona
      -> 'Other'
    * Mlle, Ms -> 'Miss'
    * Mme -> 'Mrs'

    Names without a match get NaN as their title.

    Parameters
    ----------
    df : DataFrame with a string 'Name' column. It is not modified.

    Returns
    -------
    A new DataFrame with 'Title' added (as the last column when it did not
    exist before) and 'Name' removed.
    '''
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

    s = df.copy()

    # Raw string: '\.' inside a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions. expand=False returns a
    # Series, which is what a single-column assignment needs.
    s['Title'] = (
        s['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        .replace(rare_titles, 'Other')
        .replace(['Mlle', 'Ms'], 'Miss')
        .replace(['Mme'], 'Mrs')
    )

    # drop() already returns a new frame, so no extra copy is needed.
    return s.drop(columns='Name')

def codeEmbarked(df):
    '''Ordinal-encode the given data and return the encoded array.

    A brand-new ``OrdinalEncoder`` is fitted on a deep copy of ``df`` on every
    call, and the result of ``fit_transform`` is returned.

    NOTE(review): nothing is remembered between calls, so the category -> code
    mapping is derived from whatever data is passed in each time. Two inputs
    with different sets of categories can therefore receive different codes
    for the same label.
    '''
    return OrdinalEncoder().fit_transform(copy.deepcopy(df))


def sexTitle_to_OneHot(df):
    '''One-hot encode 'Sex' and 'Title'; pass every other column through.

    This version works when you first split the data set into X, y and drop
    'Survived' from X.

    Parameters
    ----------
    df : DataFrame containing at least the 'Sex' and 'Title' columns.
        'Ticket' and 'Cabin' are removed when present.

    Returns
    -------
    DataFrame whose first columns are the one-hot indicator columns (named by
    the encoder, e.g. ``Sex_<value>`` / ``Title_<value>``) followed by the
    remaining input columns in their original order. The result carries a
    fresh default index, not the index of ``df``.

    NOTE(review): the encoder is fitted on every call, so the set of indicator
    columns depends on the categories present in ``df``.
    '''
    cat_features = ['Sex', 'Title']

    # Tolerate frames from which Ticket/Cabin were already removed upstream.
    s = df.drop(columns=['Ticket', 'Cabin'], errors='ignore')

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4; fall back to the old keyword on older releases.
    try:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    transformer = ColumnTransformer([('cat', encoder, cat_features)],
                                    remainder='passthrough')
    transformer.fit(s)

    tranformed_names = (
        transformer.named_transformers_['cat'].get_feature_names_out(cat_features)
    )

    # remainder='passthrough' appends the untouched columns after the encoded
    # ones, in input order. Deriving the names from the frame (instead of a
    # hard-coded list) keeps labels and data aligned whatever columns arrive.
    res_features = [col for col in s.columns if col not in cat_features]

    new_names = np.append(tranformed_names, res_features)

    return pd.DataFrame(transformer.transform(s), columns=new_names)


def get_x_y(df, y):
    '''Split a frame into features and target.

    Parameters
    ----------
    df : DataFrame (callers are expected to pass a copy).
    y : label of the target column.

    Returns
    -------
    (X_tr, y_tr) : ``df`` without column ``y``, and column ``y`` itself.
    '''
    return df.drop(columns=[y]), df[y]


# Wrap the plain helper functions as scikit-learn transformers so they can be
# used as Pipeline / ColumnTransformer steps. validate=False: the input is
# handed to the function without scikit-learn's input validation.

# Single-feature encoders.
age_func = FunctionTransformer(func=age, validate=False)
fare_func = FunctionTransformer(func=encodeFare, validate=False)
embarked_func = FunctionTransformer(func=codeEmbarked, validate=False)
familySize_func = FunctionTransformer(func=familySize, validate=False)

# Whole-frame steps.
getTitles_trans = FunctionTransformer(func=getTitles, validate=False)
sexTitle_to_OneHot_trans = FunctionTransformer(func=sexTitle_to_OneHot, validate=False)

# NOTE(review): `to_df` is not defined in the cells shown here -- confirm it
# is defined before this line runs.
toDF_trans = FunctionTransformer(func=to_df, validate=False)


