- verbose in pipeline steps to see time

In [2]:
import pandas as pd
import numpy as np
import urllib
import zipfile

In [3]:
import tubular
tubular.__version__

'0.3.0'

# Download dataset from the UCI Machine Learning Repository
## Define data locations

In [4]:
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
zip_filename = "bank-additional.zip"
data_filename = "bank-additional/bank-additional-full.csv"

## Download and unzip data

In [4]:
r = urllib.request.urlretrieve(data_url, zip_filename)

In [5]:
with zipfile.ZipFile(zip_filename, "r") as zip_ref:
    
    zip_ref.extractall(".")

## Load data

In [5]:
df = pd.read_csv(data_filename, sep=";")
df.shape

(41188, 21)

In [6]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


## Data prep

In [7]:
df['y'] = df['y'].map({'no': 0, 'yes': 1})

In [8]:
np.random.seed(1)
df["sample"] = np.random.random_sample(df.shape[0])
test_sample_cut_off = 0.8

In [10]:
X_train = df.loc[df["sample"] < test_sample_cut_off].drop(columns = "y")
X_test = df.loc[df["sample"] >= test_sample_cut_off].drop(columns = "y")
y_train = df.loc[df["sample"] < test_sample_cut_off, "y"]
y_test = df.loc[df["sample"] >= test_sample_cut_off, "y"]
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(32946, 21) (8242, 21) (32946,) (8242,)


# Explore data

In [8]:
from sklearn.pipeline import Pipeline

In [9]:
from tubular.capping import CappingTransformer
from tubular.numeric import CutTransformer, ScalingTransformer
from tubular.mapping import MappingTransformer
from tubular.nominal import GroupRareLevelsTransformer, OneHotEncodingTransformer, MeanResponseTransformer

In [10]:
column_mappings = {
    "default": {"no": 0, "unknown": 1, "yes": 1},
    "housing": {"no": 0, "unknown": 1, "yes": 1},
    "loan": {"no": 0, "unknown": 1, "yes": 1},
    "contact": {"cellular": 0, "telephone": 1},
    "month": {
        "jan": "winter",
        "feb": "winter",
        "mar": "sping",
        "apr": "sping",
        "may": "sping",
        "jun": "summer",        
        "jul": "summer",
        "aug": "summer",
        "sep": "autumn",
        "oct": "autumn",
        "nov": "autumn",
        "dec": "winter",         
    }
}

In [11]:
preprocessing_pipeline = Pipeline(
    steps = [
        (
            "capping",
            CappingTransformer(
                capping_values = {'campaign': [None, 10]}, verbose = False
            )
        ),
        (
            "discretisation",
            CutTransformer(
                column = "emp.var.rate", 
                new_column_name = "emp.var.rate_cut", 
                cut_kwargs = {"bins": 5, "precision": 3}, 
                verbose = False
            )
        ),        
        (
            "standardisation",
            ScalingTransformer(
                columns = ["age"], scaler_type = "standard", verbose = False
            )
        ),
        (
            "mapping",
            MappingTransformer(
                mappings = column_mappings, verbose = False
            )
        ),  
        (
            "rare_category_grouping",
            GroupRareLevelsTransformer(
                columns = ["job", "education"], cut_off_percent = 0.05, verbose = False
            )
        ),    
        (
            "one_hot_encoding",
            OneHotEncodingTransformer(
                columns = ["poutcome", "emp.var.rate_cut"], drop_original = True, verbose = False
            )
        ),
        (
            "mean_response_encoding",
            MeanResponseTransformer(
                columns = ["job", "month", "education", "marital", "day_of_week"]
            )
        )          
    ]
)

In [31]:
preprocessing_pipeline = preprocessing_pipeline.fit(
    X = df.loc[df["sample"] < 0.7].drop(columns = "y"), y = df.loc[df["sample"] < 0.7, "y"]
)



In [33]:
df_transformed_train = preprocessing_pipeline.transform(df.loc[df["sample"]< 0.7].drop(columns = "y"))
df_transformed_test = preprocessing_pipeline.transform(df.loc[df["sample"] >= 0.7].drop(columns = "y"))



In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [22]:
model = RandomForestClassifier(random_state = 10)

In [34]:
model.fit(df_transformed_train, df.loc[df["sample"] < 0.7, "y"])

RandomForestClassifier(random_state=10)

In [35]:
preds = model.predict(df_transformed_test)

In [36]:
roc_auc_score(df.loc[df["sample"] >= 0.7, "y"], preds)

0.726308718809964

In [13]:
preprocessing_and_model_pipeline = Pipeline(
    steps = [
        (
            "preprocessing",
            preprocessing_pipeline
        ),
        (
            "model",
            RandomForestClassifier(random_state = 10)
        )
    ]
)

In [40]:
preprocessing_and_model_pipeline = preprocessing_and_model_pipeline.fit(
    X = df.loc[df["sample"] < 0.7].drop(columns = "y"), y = df.loc[df["sample"] < 0.7, "y"]
)



In [41]:
preprocessing_and_model_pipeline.predict(df.loc[df["sample"] < 0.7].drop(columns = "y"))



array([0, 0, 0, ..., 1, 0, 0])

In [14]:
from sklearn.model_selection import RandomizedSearchCV

In [18]:
param_grid = {
    "preprocessing__rare_category_grouping__cut_off_percent": [0.005, 0.01, 0.02, 0.1],
    "model__max_depth": [2, 5, 7],
    "model__n_estimators": range(60, 220, 40)
}

In [19]:
random_search = RandomizedSearchCV(
    estimator = preprocessing_and_model_pipeline,
    param_distributions = param_grid,
    scoring = 'roc_auc',
    n_jobs = -1,
    cv = 3,
    random_state = 1234,
    verbose = True    
)

In [20]:
random_search.fit(
    X = df.loc[df["sample"] < 0.7].drop(columns = "y"), y = df.loc[df["sample"] < 0.7, "y"]
)

Fitting 3 folds for each of 10 candidates, totalling 30 fits




RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('preprocessing',
                                              Pipeline(steps=[('capping',
                                                               CappingTransformer(capping_values={'campaign': [None,
                                                                                                               10]})),
                                                              ('discretisation',
                                                               CutTransformer(column='emp.var.rate',
                                                                              cut_kwargs={'bins': 5,
                                                                                          'precision': 3},
                                                                              new_column_name='emp.var.rate_cut')),
                                                              ('standardisation',
              

In [23]:
random_search.best_params_

{'preprocessing__rare_category_grouping__cut_off_percent': 0.005,
 'model__n_estimators': 140,
 'model__max_depth': 2}