In [1]:
"""
Example of engineering features with Scikit only
"""
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

In [2]:
#
# 1. get data
#
# Location of the training CSV.
# NOTE(review): this is an absolute, machine-specific path -- adjust it (or make
# it relative to the notebook) before running on another machine.
TRAIN_CSV = '/Users/naimboraatlay/SpicedBC/convolutionalcurry-student-code/week2/train.csv'
# Name of the label column.
TARGET = 'Survived'

# Load with the first CSV column as the index and drop the two columns that are
# not used as features, in one chained expression instead of mutating the frame
# with `del` after the fact.
df = pd.read_csv(TRAIN_CSV, index_col=0).drop(columns=['Ticket', 'Cabin'])

# Select features by label rather than by position: `iloc[:, 1:]` only works
# while the target happens to be the first column.
X = df.drop(columns=[TARGET])
y = df[TARGET]



In [3]:
#
# 2. Define feature engineering steps
# 
def name_length(df):
    """Return the string length of every entry in the first column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds strings. Any further columns are ignored.

    Returns
    -------
    Array of shape (n_rows, 1) holding one length per input row.
    """
    first_col = df.columns[0]
    char_counts = df[first_col].str.len()
    # reshape(-1, 1) turns the flat vector of lengths into a single-column matrix
    return char_counts.values.reshape(-1, 1)

# you could run this function like this:
#d = pd.DataFrame([['Mr. Bean'], ['Mrs. Dr. Bean']])
#print(name_length(d))

# Two-step pipeline: fill missing values with the most frequent value, then
# one-hot encode the result as a dense array.
# `sparse_output` is the current name of the former `sparse` keyword of
# OneHotEncoder (renamed in scikit-learn 1.2, old name removed in 1.4), so this
# cell requires scikit-learn >= 1.2.
fill_embarked = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
)

In [6]:
# alternative: make_column_transformer(...) without the strings
# Column-wise feature engineering. Each entry is (name, transformer, columns).
# Columns that are not listed in any entry are dropped, because
# ColumnTransformer defaults to remainder='drop'.
# NOTE: the transformer names are kept verbatim -- the padding spaces are part
# of each name and would be needed when addressing a step by name.
trans = ColumnTransformer([
    # impute + one-hot encode
    ('fill_embarked', fill_embarked, ['Embarked']),
    # three quantile bins, one-hot encoded as a dense array
    ('bins         ', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='quantile'), ['Fare']),
    # string length of the column, computed by name_length
    ('name         ', FunctionTransformer(name_length), ['Name']),
    # `sparse_output` replaces the `sparse` keyword that was removed from
    # OneHotEncoder in scikit-learn 1.4 (requires scikit-learn >= 1.2)
    ('other cats   ', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['Sex', 'Pclass']),
    # forwarded unchanged
    ('do_nothing   ', 'passthrough', ['Parch', 'SibSp']),
])

In [7]:
#
# 3. fit and transform everything
#
# Chain the column transformer, min-max scaling and the classifier into a
# single estimator.
model = make_pipeline(trans, MinMaxScaler(), LogisticRegression())
# Left as the cell's last expression so the fitted pipeline is displayed.
model.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('fill_embarked',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['Embarked']),
                                                 ('bins         ',
                                                  KBinsDiscretizer(encode='onehot-dense',
                                                                   n_bins=3),
                                                  ['Fare']),
                                                 ('n

In [8]:
# Show the first rows of the loaded frame (Ticket and Cabin were removed when the data was loaded).
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [9]:
# Accuracy on the same X, y the model was fitted on, i.e. not a held-out estimate.
train_acc = model.score(X, y)
print('training acc:', round(train_acc, 3))

training acc: 0.797
