## Environment

In [3]:
!python -m pip install scikit-learn==0.24.0 imblearn scipy matplotlib==3.3.3 numpy pandas pandas-profiling seaborn==0.10.1

Collecting scikit-learn==0.24.0
  Downloading scikit_learn-0.24.0-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 8.9 MB/s eta 0:00:01     |████████████████▎               | 11.3 MB 4.1 MB/s eta 0:00:03
Collecting matplotlib==3.3.3
  Downloading matplotlib-3.3.3-cp37-cp37m-manylinux1_x86_64.whl (11.6 MB)
[K     |████████████████████████████████| 11.6 MB 6.7 MB/s eta 0:00:01
Collecting seaborn==0.10.1
  Downloading seaborn-0.10.1-py3-none-any.whl (215 kB)
[K     |████████████████████████████████| 215 kB 18.0 MB/s eta 0:00:01
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn, matplotlib, seaborn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.1
    Uninstalling scikit-learn-0.22.1:
      Successfully uninstalled scikit-learn-0.22.1
  Attempting uninstall: matplotlib
    Found existing installat

In [None]:
# A separate notebook kernel install is not required: we are running inside the docker image jupyter/scipy-notebook

#!python -m pip install ipykernel 

In [4]:
# Show the version of the `python` executable found on PATH
!python --version

Python 3.7.6


In [5]:
# Show the working directory; the relative ../models and ../data paths used below resolve against it
!pwd

/home/jovyan/work/notebooks


In [6]:
import numpy as np
import pandas as pd
import pandas_profiling

import pickle

from sklearn import datasets

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import metrics


## Data

In [7]:
# Load the iris dataset and list the names the returned container exposes

data = datasets.load_iris()

print(type(data))
print("\n".join(dir(data)))



<class 'sklearn.utils.Bunch'>
DESCR
data
feature_names
filename
frame
target
target_names


In [8]:
# Detailed examination: print every name reported by dir(data) next to its value
for field in dir(data):
    value = getattr(data, field)
    print(field, value)

DESCR .. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)

## Features

## Model


In [9]:
# Hold out 33% of the iris samples for testing; the fixed seed makes the split repeatable
seed = 7
test_size = 0.33

x_train, x_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=test_size,
    random_state=seed,
)

In [10]:
# Parameter search grid covering several classifier algorithms.
# Each dict carries its own 'classifier' entry, which replaces the pipeline's 'classifier'
# step during the search, so every algorithm listed here is evaluated.
param_grid = [
    {
        'classifier': [LogisticRegression()],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': np.logspace(-4, 4, 20),
        'classifier__solver': ['liblinear'],
    },
    {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
        'classifier__max_features': [2],
    },
]

In [11]:
# Two-step pipeline: keep the 2 best features by chi2 score, then classify
pipeline = Pipeline(steps=[
    ('feature_selection', SelectKBest(chi2, k=2)),
    ('classifier', RandomForestClassifier()),
])

# Grid search with 5-fold cross-validation, using every available core
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=True,
    n_jobs=-1,
)

# Run the search on the training split
model.fit(x_train, y_train)

# Persist the fitted grid search object
with open('../models/model02.pkl', 'wb') as fh:
    pickle.dump(model, fh)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


## Evaluate

In [12]:
# Predict on the held-out split and keep the ground truth alongside it
y_pred = model.predict(x_test)
y_true = y_test


In [13]:
# Confusion matrix as a labelled DataFrame: rows are true classes, columns are predicted classes

unique_label = np.unique([y_true, y_pred])  # every class present in either vector
cmtx = pd.DataFrame(
    data=metrics.confusion_matrix(y_true, y_pred, labels=unique_label),
    index=list(map('true:{:}'.format, unique_label)),
    columns=list(map('pred:{:}'.format, unique_label)),
)
print(cmtx)

        pred:0  pred:1  pred:2
true:0      14       0       0
true:1       0      16       2
true:2       0       1      17


In [14]:
# Per-class precision / recall / f1 on the test split
print(metrics.classification_report(y_true=y_true, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.89      0.91        18
           2       0.89      0.94      0.92        18

    accuracy                           0.94        50
   macro avg       0.95      0.94      0.94        50
weighted avg       0.94      0.94      0.94        50



In [15]:
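# ROC AUC
# For multiclass targets roc_auc_score needs class probabilities (not hard label predictions)
# and an explicit multi_class strategy:
# metrics.roc_auc_score(y_test, model.predict_proba(x_test), multi_class='ovr')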


In [16]:
# Predicted class and class probabilities for two hand-written samples

x_new = [[6, 3, 1.8, 1.4], [4, 3, 1.5, 0.2]]
y_new = model.predict(x_new)
prob_new = model.predict_proba(x_new)

for sample, label, probs in zip(x_new, y_new, prob_new):
    print("X=%s, Predicted=%s, Probabilities=%s" % (sample, label, probs))



X=[6, 3, 1.8, 1.4], Predicted=0, Probabilities=[0.8 0.2 0. ]
X=[4, 3, 1.5, 0.2], Predicted=0, Probabilities=[1. 0. 0.]


In [23]:
# List of attributes of model

#for attr in dir(model):
#    print(attr)


In [19]:
# Load the processed click data set from disk
click_data = pd.read_csv(filepath_or_buffer='../data/processed/Dfx_data_20191107.csv')


In [20]:
# Quick look at the click data: first rows, row count, summary statistics
print(click_data.head())
print('Rows: ' + str(click_data.shape[0]))
print(click_data.describe())



   Unnamed: 0 click_type  Time_of_day weekday  position  device  \
0           1        GTS            3  Friday         4  mobile   
1           2        GTS            3  Friday         5  mobile   
2           3        GTS            3  Friday         6  mobile   
3           4        GTS            3  Friday         7  mobile   
4           5        GTS            6  Friday         1  mobile   

         provider_name  rate  experts_choice           Region  rate_rank  \
0  Newcastle Permanent  3.89            True         Victoria          2   
1                   ME  3.89           False         Victoria          3   
2                  NAB  3.99           False         Victoria          4   
3                  AMP  3.74           False         Victoria          1   
4         loans.com.au  3.64            True  South Australia          3   

   min_interest  TotalImpressions  rate_var  LoanType  Big4Weight  target  \
0          3.74                 4      0.15  Variable        0.

In [21]:
# Build a pandas-profiling report of the click data and write it out as an HTML file
profile = click_data.profile_report(title="Profiling Report")
profile.to_file(output_file='profiling.html')

Summarize dataset:   0%|          | 0/31 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
# Independent variables, grouped by how the preprocessing treats them
numeric_features = [
    'Time_of_day',
    'position',
    'rate_var',
    'Big4Weight',
    'TotalImpressions',
]
categorical_features = [
    'Region',
    'weekday',
    'device',
    'Browser_OS',
    'provider_name',
    'click_type',
    'experts_choice',
    'LoanType',
]

# Name of the column that holds the target variable
target_column = 'target'

In [25]:
# Feature frame (numeric columns first, then categorical) and target series
data = click_data[[*numeric_features, *categorical_features]]
target = click_data[target_column]

In [26]:
# Split the click data into train and test sets (33% held out, fixed seed for repeatability).
# The split is stratified on the target so both sets keep the same class proportions; the
# resampling steps later in this notebook treat the target as imbalanced, and an unstratified
# split could leave the test set with very few minority-class rows.
seed = 7
test_size = 0.33

x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=test_size,
    random_state=seed,
    stratify=target,
)

In [27]:
# Column transformation steps
### to be applied to any dataset during the process

# Numeric columns: fill missing values with the median, then min-max scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    # ('scaler', StandardScaler()),
    ('scaler', MinMaxScaler()),
])

# Categorical columns: fill missing values with the literal 'missing', then one-hot encode
# (handle_unknown='ignore': categories not seen during fit do not raise at transform time)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [28]:
# Resampling steps for the imbalanced target.
# sampling_strategy is the desired minority / majority class ratio after resampling:
# SMOTE over-samples the minority class up to a ratio of 0.1, and the random under-sampler
# then reduces the majority class until the ratio reaches 0.4.
# Both samplers draw random numbers, so they are seeded to make repeated runs resample identically.
oversample = SMOTE(sampling_strategy=0.1, random_state=seed)
undersample = RandomUnderSampler(sampling_strategy=0.4, random_state=seed)


In [29]:
# Parameter search grid covering several classifier algorithms.
# Each dict carries its own 'classifier' entry, which replaces the pipeline's 'classifier'
# step during the search, so every algorithm listed here is evaluated regardless of which
# classifier the pipeline definition names.
# The classifiers are seeded so that repeated searches give the same results.
param_grid = [
    {
        'classifier': [RandomForestClassifier(random_state=seed)],
        'classifier__n_estimators': [100, 120, 300, 500],
        'classifier__min_samples_leaf': [1, 2, 5],
    },
    {
        'classifier': [LogisticRegression(random_state=seed)],
        'classifier__penalty': ['elasticnet'],
        'classifier__solver': ['saga'],
        # penalty='elasticnet' requires an explicit l1_ratio in [0, 1]; with the default
        # (None) every fit raises ValueError. Add more values here to tune the L1/L2 mix.
        'classifier__l1_ratio': [0.5],
    },
]

# also consider using HyperOpt


In [None]:
# Define pipeline

# The pipeline contains SMOTE over-sampling, so it is built with imblearn's
# imbalanced-ready Pipeline rather than the standard sklearn one
from imblearn.pipeline import Pipeline as imbPipeline

pipeline = imbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('over', oversample),
    ('under', undersample),
    # ('feature_selection', SelectKBest(chi2, k=2)),
    ('classifier', LogisticRegression()),
    # ('classifier', RandomForestClassifier())
])

# Grid search with 5-fold cross-validation, using every available core
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=True,
    n_jobs=-1,
)

# Run the search on the training split
model.fit(x_train, y_train)

# Persist the fitted grid search object
with open('../models/model03.pkl', 'wb') as fh:
    pickle.dump(model, fh)

Fitting 5 folds for each of 13 candidates, totalling 65 fits


In [None]:
# Predict on the held-out split and keep the ground truth alongside it
y_pred = model.predict(x_test)
y_true = y_test

In [None]:
# Sanity check: ground truth and predictions should have the same shape
for labels in (y_true, y_pred):
    print(labels.shape)

In [None]:
# Confusion matrix as a labelled DataFrame: rows are true classes, columns are predicted classes

unique_label = np.unique([y_true, y_pred])  # every class present in either vector
cmtx = pd.DataFrame(
    data=metrics.confusion_matrix(y_true, y_pred, labels=unique_label),
    index=list(map('true:{:}'.format, unique_label)),
    columns=list(map('pred:{:}'.format, unique_label)),
)
print(cmtx)

In [None]:
# Per-class precision / recall / f1 on the test split, kept in `p` for later reuse
p = metrics.classification_report(y_true=y_true, y_pred=y_pred)
print(p)

In [None]:
# Gather the artefacts of this experiment: pipeline, fitted search, confusion matrix, report
experimentList = [pipeline, model, cmtx, p]

# Export the fitted grid search to a file
# NOTE(review): experimentList is built but never written -- only `model` is pickled, to the
# same path the fitting cell already wrote. Confirm whether the list was meant to be saved.
with open('../models/model03.pkl', 'wb') as fh:
    pickle.dump(model, fh)