In [73]:
!python -m pip install scikit-learn imblearn scipy matplotlib numpy pandas pandas-profiling seaborn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
[K     |████████████████████████████████| 167 kB 4.1 MB/s 
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.7.0 imblearn-0.0


In [1]:
#notebook support kernel
!python -m pip install ipykernel 



In [1]:
!python --version

Python 3.8.6


In [2]:
!pwd

/Users/aj/projects/sklearn/notebooks


In [3]:
import numpy as np
import pandas as pd
import pandas_profiling

import pickle

from sklearn import datasets

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import metrics


In [3]:
# Load the dataset and examine

data = datasets.load_iris()

print(type(data))
for attr in dir(data):
    print(attr)



<class 'sklearn.utils.Bunch'>
DESCR
data
feature_names
filename
frame
target
target_names


In [4]:
# Detailed examination
for attr in dir(data):
    print(attr, getattr(data,attr))

DESCR .. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)

In [93]:
# split data into train and test sets
seed = 7
test_size = 0.33

x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=test_size, random_state=seed)

In [94]:
# Parameter search grid can have info for several classifier algos; pipeline defintion picks which one
param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']},

    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
    'classifier__max_features' : list(range(2,3))}
]

In [95]:
# Define pipeline
pipeline = Pipeline([
      ('feature_selection', SelectKBest(chi2, k=2)),
      ('classifier', RandomForestClassifier())
    ])

# Create grid search object
model = GridSearchCV(pipeline, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

# Fit grid search on training data
model.fit(x_train, y_train)

# Export the classifier to a file
with open('../models/model02.pkl', 'wb') as model_file:
  pickle.dump(model, model_file)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    2.4s finished


In [96]:
# Make predictions
y_true = y_test
y_pred = model.predict(x_test)


In [97]:
# Show confusion matrix with labels

unique_label = np.unique([y_true, y_pred])
cmtx = pd.DataFrame(
    metrics.confusion_matrix(y_true, y_pred, labels=unique_label), 
    index=['true:{:}'.format(x) for x in unique_label], 
    columns=['pred:{:}'.format(x) for x in unique_label]
)
print(cmtx)

        pred:0  pred:1  pred:2
true:0      14       0       0
true:1       0      16       2
true:2       0       2      16


In [98]:
print(metrics.classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.89      0.89      0.89        18
           2       0.89      0.89      0.89        18

    accuracy                           0.92        50
   macro avg       0.93      0.93      0.93        50
weighted avg       0.92      0.92      0.92        50



In [52]:
# ROC
# Doesn't work for multiclass problems, only binary classifiers
# metrics.roc_auc_score(y_test, y_pred)


In [99]:
# Get predictions & probabilities on a new data set

x_new = [[6, 3, 1.8, 1.4], [4, 3, 1.5, 0.2]]
y_new = model.predict(x_new)
prob_new = model.predict_proba(x_new)

for i in range(len(x_new)):
	print("X=%s, Predicted=%s, Probabilities=%s" % (x_new[i], y_new[i], prob_new[i]))



X=[6, 3, 1.8, 1.4], Predicted=1, Probabilities=[0.4 0.6 0. ]
X=[4, 3, 1.5, 0.2], Predicted=0, Probabilities=[1. 0. 0.]


In [100]:
# List of attributes of model

for attr in dir(model):
    print(attr)


__abstractmethods__
__class__
__delattr__
__dict__
__dir__
__doc__
__eq__
__format__
__ge__
__getattribute__
__getstate__
__gt__
__hash__
__init__
__init_subclass__
__le__
__lt__
__module__
__ne__
__new__
__reduce__
__reduce_ex__
__repr__
__setattr__
__setstate__
__sizeof__
__str__
__subclasshook__
__weakref__
_abc_impl
_check_is_fitted
_check_n_features
_estimator_type
_format_results
_get_param_names
_get_tags
_more_tags
_pairwise
_repr_html_
_repr_html_inner
_repr_mimebundle_
_required_parameters
_run_search
_validate_data
best_estimator_
best_index_
best_params_
best_score_
classes_
cv
cv_results_
decision_function
error_score
estimator
fit
get_params
iid
inverse_transform
multimetric_
n_features_in_
n_jobs
n_splits_
param_grid
pre_dispatch
predict
predict_log_proba
predict_proba
refit
refit_time_
return_train_score
score
scorer_
scoring
set_params
transform
verbose


In [5]:
click_data = pd.read_csv('../data/ready/sample_from_GA.csv', skiprows=6)


In [6]:
print(click_data.head())
print('Rows: ' + str(len(click_data)))
print(click_data.describe())



    Source Device Category    Age                        Region  \
0  Taboola         desktop  25-34               New South Wales   
1   google         desktop  45-54  Australian Capital Territory   
2  MNative         desktop  35-44               New South Wales   
3     bing         desktop  55-64  Australian Capital Territory   
4  MNative         desktop    65+             Western Australia   

           User Type Home Loan GTS (Goal 1 Conversion Rate)  Users  
0        New Visitor                                 33.85%     65  
1  Returning Visitor                                 30.36%     43  
2  Returning Visitor                                 30.36%     56  
3        New Visitor                                 23.21%     56  
4  Returning Visitor                                 21.31%     61  
Rows: 10
           Users
count  10.000000
mean   54.000000
std    10.198039
min    43.000000
25%    43.000000
50%    56.000000
75%    61.000000
max    69.000000


In [18]:
profile = click_data.profile_report(title='Profiling Report')
profile.to_file(output_file="profiling.html")

Summarize dataset: 100%|██████████| 32/32 [00:22<00:00,  1.41it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.26s/it]
Render HTML: 100%|██████████| 1/1 [00:02<00:00,  2.66s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 80.13it/s]


In [77]:
# define the independent variables [lists]
numeric_features = ['Time_of_day', 'position', 'rate_var', 'Big4Weight', 'TotalImpressions']
categorical_features = ['Region', 'weekday', 'device', 'Browser_OS', 'provider_name', 'click_type', 'experts_choice', 'LoanType']
# define the target variable 'string'
target_column = 'target'


In [78]:
# create feature data and target data
data = click_data[numeric_features + categorical_features + ['target']]
target = click_data[target_column]

In [108]:
# split data into train and test sets
seed = 7
test_size = 0.33

x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=test_size, random_state=seed)

In [109]:
# column transformation steps

### TBC Should these be done before the train/test split...???

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    #('scaler', StandardScaler()),
    ('scaler', MinMaxScaler())])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [113]:
# imbalanced class steps

oversample = SMOTE(sampling_strategy=0.1) # resulting minority class / majority class
undersample = RandomUnderSampler(sampling_strategy=0.4) # resulting minority class / majority class


In [121]:
# Parameter search grid can have info for several classifier algos; pipeline defintion picks which one
param_grid = [
    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : [100, 120, 300, 500],
    'classifier__min_samples_leaf' : [1, 2, 5]},
    {'classifier' : [LogisticRegression()],
    'classifier__penalty' : ['elasticnet'],
    'classifier__solver' : ['saga']}
]

In [115]:
# Define pipeline

# we are using SMOTE oversampling, so we need an imbalanced-ready pipeline not the standard sklearn pipeline
from imblearn.pipeline import Pipeline as imbPipeline

pipeline = imbPipeline([
     ('preprocessor', preprocessor),
     ('over', oversample),
     ('under', undersample),
     # ('feature_selection', SelectKBest(chi2, k=2)),
     ('classifier',LogisticRegression())
     # ('classifier', RandomForestClassifier())
    ])

# Create grid search object
model = GridSearchCV(pipeline, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

# Fit grid search on training data
model.fit(x_train, y_train)

# Export the classifier to a file
with open('../models/model03.pkl', 'wb') as model_file:
  pickle.dump(model, model_file)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  9.5min finished


In [116]:
# Make predictions
y_true = y_test
y_pred = model.predict(x_test)

In [117]:
print(y_true.shape)
print(y_pred.shape)

(58579,)
(58579,)


In [118]:
# Show confusion matrix with labels

unique_label = np.unique([y_true, y_pred])
cmtx = pd.DataFrame(
    metrics.confusion_matrix(y_true, y_pred, labels=unique_label), 
    index=['true:{:}'.format(x) for x in unique_label], 
    columns=['pred:{:}'.format(x) for x in unique_label]
)
print(cmtx)

        pred:0  pred:1
true:0   56938     502
true:1    1102      37


In [119]:
p = metrics.classification_report(y_true, y_pred)
print(p)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99     57440
           1       0.07      0.03      0.04      1139

    accuracy                           0.97     58579
   macro avg       0.52      0.51      0.52     58579
weighted avg       0.96      0.97      0.97     58579



In [None]:
experimentList = [pipeline, model, cmtx, p]

# Export the classifier to a file
with open('../models/model03.pkl', 'wb') as model_file:
  pickle.dump(model, model_file)