# Pipeline 1: Basic Crude pipeline

In [15]:
import sys
import os
import pandas as pd

module_path = '../src/'
sys.path.append(module_path)

from preprocessing.transformers import *

In [16]:
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
dfX = df.drop('Survived', axis=1)
dfy = df.Survived

# Pipeline outline:

1. Drop Name, Ticket - requires Feature Engineering
2. OneHotEncoder for Sex, Embarked
3. Drop Cabin - requires Feature Engineering/(?And Not Imputation)
4. Impute Age with mean
5. Drop NaN rows in Embarked
6. Scaling Age and Fare


## Preprocessing_pre 
These steps include the sample size altering steps.

In [29]:
from sklearn.pipeline import Pipeline
from utils import print_params

In [38]:
pre1 = Pipeline([
    ('nan_drpr', NaNDropper(['Embarked']))
])

dfX, dfy = pre1.fit_transform(dfX, dfy)

### Hyperparameters

In [32]:
print_params(pre1)

['memory', 'steps', 'verbose', 'nan_drpr', 'nan_drpr__key']


## Preprocessing
These include the sklearn compatible steps but can be divided into sequential and parallel steps

In [33]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [77]:
pre2 = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler())
], 'passthrough')



precomb2 = ColumnTransformer([
    ('clmn_drpr', 'drop', ['Name', 'Ticket', 'Cabin']),
    ('enc', OneHotEncoder(drop='first'), ['Sex', 'Embarked']),
    ('imp_scaler', pre2, ['Age', 'Fare'])
], 'passthrough')


In [84]:
precomb2.fit_transform(dfX)

array([[1., 0., 1., ..., 3., 1., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 1., ..., 3., 0., 0.],
       ...,
       [0., 0., 1., ..., 3., 1., 2.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 1., 0., ..., 3., 0., 0.]])

### Hyperparameters

Note that since I didn't use ColumnDropper I can't choose which columns to drop as a parameter, but I can use modify_transformer_cols

In [85]:
print_params(precomb2)

['n_jobs',
 'remainder',
 'sparse_threshold',
 'transformer_weights',
 'transformers',
 'verbose',
 'clmn_drpr',
 'enc',
 'imp_scaler',
 'enc__categories',
 'enc__drop',
 'enc__dtype',
 'enc__handle_unknown',
 'enc__sparse',
 'imp_scaler__memory',
 'imp_scaler__steps',
 'imp_scaler__verbose',
 'imp_scaler__imp',
 'imp_scaler__scaler',
 'imp_scaler__imp__add_indicator',
 'imp_scaler__imp__copy',
 'imp_scaler__imp__fill_value',
 'imp_scaler__imp__missing_values',
 'imp_scaler__imp__strategy',
 'imp_scaler__imp__verbose',
 'imp_scaler__scaler__copy',
 'imp_scaler__scaler__with_mean',
 'imp_scaler__scaler__with_std']


## Model fitting

In [89]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [87]:
modelpipe = Pipeline([
    ('clf', KNeighborsClassifier())
])

pipe = Pipeline([
    ('preprocess', precomb2),
    ('model', modelpipe)
])

In [88]:
cross_val_score(pipe, dfX, dfy, cv=5).mean()

0.50069193169555

In [90]:
param_grid = {'model__clf__n_neighbors': range(1, 11, 2)}

In [95]:
grid = GridSearchCV(pipe, param_grid)
grid.fit(dfX, dfy)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocess',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('clmn_drpr',
                                                                         'drop',
                                                                         ['Name',
                                                                          'Ticket',
                                                                          'Cabin']),
                                                                        ('enc',
                                              

In [96]:
print(grid.best_params_)
print(grid.best_score_)

{'model__clf__n_neighbors': 7}
0.5074335047292579


In [None]:
params_