# `Pipeline` - helper functions

In [1]:
from src.pipeline import dp1
from src.pipeline import dp2
from src.pipeline import DataPipeline

import joblib

# Registry of the ready-made pipelines, keyed by the short name used to select them.
available_pipelines = dict(dp1=dp1, dp2=dp2)

## `config_local`

In [3]:
import yaml
import os

# Resolve the user's home directory portably. `os.environ.get('HOME')` returns
# None when HOME is unset (e.g. on Windows), which would make the
# `os.path.join` below raise TypeError. On POSIX, `expanduser('~')` still
# uses $HOME when it is set, so the value is unchanged there.
HOME = os.path.expanduser('~')
APP_NAME = 'titansurv'
# Per-user dotfile (<home>/.titansurv) read and written by the config helpers.
APP_CONFIG_FILE = os.path.join(HOME, f'.{APP_NAME}')

def set_config_file_location(config_file):
    """Record `config_file` under the ``CONFIG_FILE`` key of the app config file.

    The file at ``APP_CONFIG_FILE`` is opened in write mode, so any previous
    content is replaced by this single-entry YAML mapping.
    """
    payload = {'CONFIG_FILE': config_file}
    with open(APP_CONFIG_FILE, "w") as stream:
        yaml.dump(payload, stream, default_flow_style=False)

def get_config():
    """Read the app config file and return its parsed YAML content.

    Returns whatever ``yaml.safe_load`` yields for the file at
    ``APP_CONFIG_FILE``.
    """
    with open(APP_CONFIG_FILE, "r") as stream:
        return yaml.safe_load(stream)


## `create_serialized_model`

In [4]:
def create_serialized_model(pipe_name, outfile, **kwargs):
    """Serialize one of the registered pipelines to disk with joblib.

    Parameters
    ----------
    pipe_name: str
        key into `available_pipelines`; an unknown name raises KeyError
    outfile: str
        destination handed to `joblib.dump`
    **kwargs
        forwarded unchanged to `joblib.dump`
    """
    pipeline = available_pipelines[pipe_name]
    message = f'Creating serialized model in {outfile}'
    print(message)
    joblib.dump(pipeline, outfile, **kwargs)

In [5]:
# Dump the registered 'dp1' pipeline to '../../models/dp1.pkl'.
# NOTE(review): the path is relative to the kernel's working directory — confirm it
# still resolves to the intended models folder before re-running.
create_serialized_model('dp1', '../../models/dp1.pkl')

Creating serialized model in ../../models/dp1.pkl


## `get_available_datapipe`

In [1]:
from src.config import get_config

In [5]:
# NOTE(review): `get_config` here is the one imported from `src.config` in the
# preceding cell; if both cells run in one kernel it shadows the notebook-local
# draft defined under `config_local`.
config_params = get_config()

In [6]:
config_params

{'PROJECT_DIR': '/home/abhi/main/coding/Python-code/mygit/Titanic-Survival',
 'RAW_TRAIN_DATA_FILE': '/home/abhi/main/coding/Python-code/mygit/Titanic-Survival/data/raw/train.csv',
 'RAW_TEST_DATA_FILE': '/home/abhi/main/coding/Python-code/mygit/Titanic-Survival/data/raw/test.csv',
 'MODELS_DIR': '/home/abhi/main/coding/Python-code/mygit/Titanic-Survival/pickled/models',
 'SEARCH_DIR': '/home/abhi/main/coding/Python-code/mygit/Titanic-Survival/pickled/search',
 'HYPERPARAMS_DIR': '/home/abhi/main/coding/Python-code/mygit/Titanic-Survival/pickled/hyperparams'}

## `update_hyperparam_values`

In [6]:
import joblib
def update_hyperparam_values(update_dict, file, **kwargs):
    """Merge `update_dict` into the dict stored in a joblib file and save it back.

    Parameters
    ----------
    update_dict: dict
        entries to add to, or overwrite in, the stored dict
    file: str
        path of the joblib file; it is loaded and then overwritten
    **kwargs
        forwarded to `joblib.dump` (e.g. `compress`). These were previously
        accepted but silently ignored; forwarding matches what
        `create_serialized_model` does with its own `**kwargs`.
    """
    # SECURITY: joblib.load unpickles arbitrary objects — only point this at
    # files you created yourself.
    param_dict = joblib.load(file)
    param_dict.update(update_dict)
    joblib.dump(param_dict, file, **kwargs)


In [7]:
from src.utils import load_data

In [8]:
load_data()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## `tune_mlmodel.py`

In [None]:
# I want the following features:
#
# 1. Provide a datapipe

In [33]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import pandas as pd
from src.pipeline import dp1
from src.pipeline import dp2
from src.config import RAW_TRAIN_DATA_FILE, RAW_TEST_DATA_FILE, HYPERPARAMS_MLMODEL_FILE
from src.utils import load_data
from sklearn.base import clone
import joblib

# Pipelines that `tune_mlmodel` can look up by name when `datapipe` is a str.
available_pipelines = dict(dp1=dp1, dp2=dp2)



def load_hyperparams_mlmodel():
    """Return the object stored in the joblib file at HYPERPARAMS_MLMODEL_FILE."""
    stored_params = joblib.load(HYPERPARAMS_MLMODEL_FILE)
    return stored_params

def tune_mlmodel(datapipe, hyperparams='autoload',
                 mlmodels='all', search_type='rand', data='autoload', **kwargs):
    '''
    Tune the machine learning model in a pipeline

    Parameters
    ----------
    datapipe: str or DataPipeline
        if a str is supplied it is looked up in `available_pipelines`,
        otherwise a clone of the supplied pipeline is used

    hyperparams: dict or list of dicts, default: 'autoload'
        'autoload' loads the param dict via `load_hyperparams_mlmodel()`
        ('autoreload' is accepted as an alias for backward compatibility).
        A dict is treated as a mapping {mlmodel name -> param grid};
        anything else is handed to the search unchanged

    mlmodels: list or str, default: 'all'
        names of the mlmodels to tune; only used when `hyperparams` is a dict.
        'all' tunes every entry, any other single str is treated as one name

    search_type: str, default: 'rand'
        the type of search to perform i.e.
        'rand' -> RandomizedSearchCV, 'grid' -> GridSearchCV

    data: str or pd.DataFrame, default: 'autoload'
        'autoload' calls `load_data('train')`; anything else is passed to the
        pipeline's `set_data` as is

    **kwargs
        forwarded to the search constructor (e.g. `cv`)

    Returns
    -------
    the result of calling `fit` on the search object

    Raises
    ------
    ValueError
        if `search_type` is not 'rand' or 'grid', or if none of the requested
        `mlmodels` is a key of `hyperparams`
    KeyError
        if `datapipe` is a str that is not in `available_pipelines`
    '''
    # Validate up front: previously an unknown search_type only failed at the
    # very end with an unbound `search` variable.
    search_classes = {'grid': GridSearchCV, 'rand': RandomizedSearchCV}
    if search_type not in search_classes:
        raise ValueError(
            f"search_type must be 'rand' or 'grid', got {search_type!r}")

    if isinstance(datapipe, str):
        # NOTE(review): this is the shared object from the registry, so the
        # `set_data` call below mutates it — confirm that is intended.
        dp = available_pipelines[datapipe]
    else:
        dp = clone(datapipe)

    # The default is 'autoload', but the old check only matched 'autoreload',
    # so the default never loaded anything. The isinstance guard also keeps
    # `==`/`in` away from non-str inputs.
    if isinstance(hyperparams, str) and hyperparams in ('autoload', 'autoreload'):
        hyperparams = load_hyperparams_mlmodel()

    if isinstance(hyperparams, dict):
        if isinstance(mlmodels, str) and mlmodels == 'all':
            # Previously `param_grid` was left undefined on this (default) path.
            param_grid = list(hyperparams.values())
        else:
            # Wrap a lone name so `in` tests membership, not substring.
            wanted = [mlmodels] if isinstance(mlmodels, str) else mlmodels
            param_grid = [grid for name, grid in hyperparams.items()
                          if name in wanted]
            if not param_grid:
                raise ValueError(
                    f'none of the requested mlmodels {wanted!r} found in '
                    f'hyperparams (available: {list(hyperparams)!r})')
    else:
        param_grid = hyperparams

    pipe = dp.get_pipeline()
    search = search_classes[search_type](pipe, param_grid, **kwargs)

    # Comparing a DataFrame with == yields a frame whose truth value is
    # ambiguous, so only compare when `data` really is a str.
    if isinstance(data, str) and data == 'autoload':
        data = load_data('train')
    dp.set_data(data)

    dfX_pre, dfy_pre = dp.prepare()

    return search.fit(dfX_pre, dfy_pre)




In [34]:
# from src.tune import tune_mlmodel
from sklearn.model_selection import StratifiedKFold
# Stratified k-fold splitter with default settings; passed as `cv` to the search below.
skf = StratifiedKFold()

In [35]:
from src.utils import print_params

In [36]:
print_params(dp2)

['data',
 'description',
 'mlmodel',
 'prepare_data__memory',
 'prepare_data__steps',
 'prepare_data__verbose',
 'prepare_data__nan_drpr',
 'prepare_data__nan_drpr__key',
 'prepare_data',
 'preprocess_data',
 'warn',
 'ycol',
 'preprocess',
 'ml',
 'preprocess__memory',
 'preprocess__steps',
 'preprocess__verbose',
 'preprocess__clmn_trnsfrm',
 'preprocess__scale',
 'preprocess__clmn_trnsfrm__n_jobs',
 'preprocess__clmn_trnsfrm__remainder',
 'preprocess__clmn_trnsfrm__sparse_threshold',
 'preprocess__clmn_trnsfrm__transformer_weights',
 'preprocess__clmn_trnsfrm__transformers',
 'preprocess__clmn_trnsfrm__verbose',
 'preprocess__clmn_trnsfrm__enc',
 'preprocess__clmn_trnsfrm__imp',
 'preprocess__clmn_trnsfrm__pre_Name',
 'preprocess__clmn_trnsfrm__pre_Cabin',
 'preprocess__clmn_trnsfrm__pre_Ticket',
 'preprocess__clmn_trnsfrm__Pre_SibSp',
 'preprocess__clmn_trnsfrm__Pre_Parch',
 'preprocess__clmn_trnsfrm__enc__categories',
 'preprocess__clmn_trnsfrm__enc__drop',
 'preprocess__clmn_trns

In [37]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [38]:
# Grid-search the 'dp2' pipeline with a KNN classifier, comparing no scaling
# ('passthrough') against StandardScaler in the `preprocess__scale` step,
# using `skf` as the cross-validation splitter.
search = tune_mlmodel('dp2', [{'ml':[KNeighborsClassifier()], 
                               'preprocess__scale': ['passthrough', StandardScaler()]}], 
                      search_type='grid', cv=skf)

In [1]:
from src.tune import load_hyperparams_mlmodel

In [2]:
load_hyperparams_mlmodel()

{'LogisticRegression_NoReg': {'ml': [LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False)],
  'ml__solver': ['liblinear']},
 'LogisticRegression_l1l2': {'ml': [LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False)],
  'ml__penalty': ['l1', 'l2'],
  'ml__C': [0.01, 0.1, 1, 10, 100],
  'ml__solver': ['liblinear']},
 'LogisticRegression_elasnet': {'ml': [LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,


In [31]:
search.best_score_

0.8178251761569225

In [32]:
# Tabulate the search's `cv_results_` and preview the first rows.
res = pd.DataFrame(search.cv_results_)
res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ml,param_preprocess__scale,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.025353,0.003559,0.020757,0.00051,"KNeighborsClassifier(algorithm='auto', leaf_si...",passthrough,"{'ml': KNeighborsClassifier(algorithm='auto', ...",0.623596,0.455056,0.466292,0.477528,0.683616,0.541218,0.093976,2
1,0.024362,0.000783,0.023042,0.001091,"KNeighborsClassifier(algorithm='auto', leaf_si...","StandardScaler(copy=True, with_mean=True, with...","{'ml': KNeighborsClassifier(algorithm='auto', ...",0.814607,0.814607,0.814607,0.780899,0.864407,0.817825,0.0267,1
