### High level

1. Read file from GCS bucket
2. Normalize the features
    - data looks kinda normalized already based on other notebooks in kaggle. will skip.
3. Create a sample/hold out (test) set. The test set will be used as future input.
4. Create a simple model (doesn't matter if it's bad).
5. Save model to pickle file to GCS
6. Save model to model registry?
7. Test model on some rows from test set
8. Streamlit

In [1]:
import pickle
import pandas as pd
import numpy as np

from google.cloud import storage
from sklearn.metrics import accuracy_score, auc, balanced_accuracy_score, confusion_matrix, f1_score, precision_score, average_precision_score, roc_auc_score,  recall_score,  precision_recall_curve
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate, train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
# read file
bucket_name = 'machine-learning-workspace'
path = 'cc-fraud/data/creditcard.csv'

client = storage.Client()
bucket = client.bucket(bucket_name)

df = pd.read_csv(f'gs://{bucket_name}/{path}')
print(df.shape)
df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# 80% sample
sdf = df.sample(frac=0.8, replace=False, random_state=1).reset_index(drop=True)
print(sdf.shape)
sdf.head()

(227846, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,119907.0,-0.611712,-0.769705,-0.149759,-0.224877,2.028577,-2.019887,0.292491,-0.52302,0.358468,...,-0.075208,0.045536,0.380739,0.02344,-2.220686,-0.201146,0.066501,0.22118,1.79,0
1,78340.0,-0.814682,1.319219,1.329415,0.027273,-0.284871,-0.653985,0.321552,0.435975,-0.704298,...,-0.128619,-0.368565,0.09066,0.401147,-0.261034,0.080621,0.162427,0.059456,1.98,0
2,82382.0,-0.318193,1.118618,0.969864,-0.127052,0.569563,-0.532484,0.706252,-0.064966,-0.463271,...,-0.305402,-0.774704,-0.123884,-0.495687,-0.018148,0.121679,0.24905,0.092516,0.89,0
3,31717.0,-1.328271,1.018378,1.775426,-1.574193,-0.117696,-0.457733,0.681867,-0.031641,0.383872,...,-0.220815,-0.419013,-0.239197,0.009967,0.232829,0.814177,0.098797,-0.004273,15.98,0
4,80923.0,1.276712,0.61712,-0.578014,0.879173,0.061706,-1.472002,0.373692,-0.287204,-0.084482,...,-0.160161,-0.430404,-0.076738,0.258708,0.55217,0.370701,-0.034255,0.041709,0.76,0


In [5]:
# Create test set
testdf = df[~df.index.isin(sdf.index)].reset_index(drop=True)
print(testdf.shape)
testdf.head()

(56961, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,145249.0,2.152696,-0.036161,-2.231811,0.091766,0.537612,-1.368103,0.613327,-0.455252,0.291814,...,0.017153,0.063242,-0.034561,-0.626866,0.249213,0.773931,-0.137115,-0.090611,14.95,0
1,145249.0,-4.034795,2.305079,-1.461693,-0.729887,-1.52875,-1.225679,-0.893354,1.622522,1.291998,...,-0.392557,-0.787599,0.343468,-0.090331,0.248287,-0.238524,0.266484,-0.062236,7.7,0
2,145249.0,-1.668741,1.168055,0.249642,-1.268497,0.785923,-0.663959,0.859433,0.068111,-0.144183,...,-0.247544,-0.592537,-0.286694,-0.378856,-0.077429,0.067608,-0.278962,-0.064193,6.99,0
3,145250.0,-0.550678,-0.429004,-1.291893,-0.414409,-0.292229,0.071843,2.426068,-0.21273,0.412374,...,0.003032,-0.645783,0.877016,-1.228529,-0.036281,-0.11061,-0.09838,0.095985,460.71,0
4,145250.0,1.84691,0.143301,-1.171846,1.570946,0.076854,-0.85813,0.164378,-0.251494,0.442113,...,-0.018428,0.048949,0.105389,-0.119156,-0.014024,-0.705045,0.044595,0.000922,68.0,0


In [6]:
X = sdf.iloc[:, 1:30]
y = sdf.iloc[:, 30:31]
X.head(), y.head()

(         V1        V2        V3        V4        V5        V6        V7  \
 0 -0.611712 -0.769705 -0.149759 -0.224877  2.028577 -2.019887  0.292491   
 1 -0.814682  1.319219  1.329415  0.027273 -0.284871 -0.653985  0.321552   
 2 -0.318193  1.118618  0.969864 -0.127052  0.569563 -0.532484  0.706252   
 3 -1.328271  1.018378  1.775426 -1.574193 -0.117696 -0.457733  0.681867   
 4  1.276712  0.617120 -0.578014  0.879173  0.061706 -1.472002  0.373692   
 
          V8        V9       V10  ...       V20       V21       V22       V23  \
 0 -0.523020  0.358468  0.070050  ... -0.196039 -0.075208  0.045536  0.380739   
 1  0.435975 -0.704298 -0.600684  ... -0.009041 -0.128619 -0.368565  0.090660   
 2 -0.064966 -0.463271 -0.528357  ...  0.206028 -0.305402 -0.774704 -0.123884   
 3 -0.031641  0.383872  0.334853  ...  0.315823 -0.220815 -0.419013 -0.239197   
 4 -0.287204 -0.084482 -0.696578  ... -0.105286 -0.160161 -0.430404 -0.076738   
 
         V24       V25       V26       V27       V28  

In [7]:
# Let's split our dataset and see what it looks like:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.25)
training_fraud = sum(y_train.values)
training_fraud_pct = sum(y_train.values)/len(y_train.values)*100
test_fraud = sum(y_test.values)
test_fraud_pct = sum(y_test.values)/len(y_test.values)*100
print("X train: {}\nX test:  {}\ny_train: {}\ny test:  {} \nFraud in train set: {},   {:2f}%\nFraud in test set:  {},  {:2f}%\n".format(X_train.shape, 
                                                                                                                                        X_test.shape, 
                                                                                                                                        y_train.shape, 
                                                                                                                                        y_test.shape, 
                                                                                                                                        training_fraud[0], training_fraud_pct[0],
                                                                                                                                        test_fraud[0], test_fraud_pct[0]))

X train: (170884, 29)
X test:  (56962, 29)
y_train: (170884, 1)
y test:  (56962, 1) 
Fraud in train set: 298,   0.174387%
Fraud in test set:  100,  0.175556%



In [9]:
#let's start over...
clf = RandomForestClassifier(n_jobs=-1,n_estimators=10, verbose=1)
# ... but this time, let's fit our model using sklearn.model_selection.cross_val_score
cvs = cross_val_score(clf,X=X_train, y=y_train, scoring='average_precision')

  return fit_method(estimator, *args, **kwargs)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
  return fit_method(estimator, *args, **kwargs)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
  return fit_method(estimator, *args, **kwargs)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]

In [10]:
print(f'Using {len(cvs)} trials:\n {cvs}')
print(f'Average:{np.mean(cvs)}')

Using 5 trials:
 [0.76986674 0.90543282 0.81094597 0.77647823 0.85076658]
Average:0.8226980686425586


In [11]:
# clf = RandomForestClassifier(n_jobs=-1, verbose=0) # these settings use multiple cores, and provide more command line feedback
# ... but this time, let's fit our model using sklearn.model_selection.cross_validate
cvs = cross_validate(clf,X=X_train, y=y_train, scoring=['accuracy','average_precision','balanced_accuracy','f1','precision','recall','roc_auc'])

  return fit_method(estimator, *args, **kwargs)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
  return fit_method(estimator, *args, **kwargs)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


In [12]:
for k, v in cvs.items():
    print(f'{k.replace("test_",""):23}{v}   Avg:{"":4}{np.mean(v):.2f}' )

fit_time               [5.33484316 5.54818225 6.23640347 5.407897   5.63912463]   Avg:    5.63
score_time             [0.12945843 0.13451457 0.12984872 0.1281352  0.12826848]   Avg:    0.13
accuracy               [0.99941481 0.99956111 0.99941481 0.99947333 0.99944405]   Avg:    1.00
average_precision      [0.78311415 0.88784723 0.8031288  0.78236291 0.85910544]   Avg:    0.82
balanced_accuracy      [0.881268   0.88331868 0.85828937 0.85831868 0.84744297]   Avg:    0.87
f1                     [0.81818182 0.85981308 0.81132075 0.82692308 0.81188119]   Avg:    0.83
precision              [0.88235294 0.9787234  0.93478261 0.97727273 0.97619048]   Avg:    0.95
recall                 [0.76271186 0.76666667 0.71666667 0.71666667 0.69491525]   Avg:    0.73
roc_auc                [0.93179727 0.95809885 0.91631078 0.89969077 0.94891507]   Avg:    0.93


In [24]:
# save the best model
import gcsfs

fs = gcsfs.GCSFileSystem(project='machine-learning-workspace')

bucket_name = 'machine-learning-workspace'
# bucket = client.bucket(bucket_name)
path = 'cc-fraud/models'
filename = 'clf_model.pkl'

with fs.open(f'gs://{bucket_name}/{path}/{filename}', 'wb') as f:
    pickle.dump(clf, f)

### Model registry

Referring to this:
https://github.com/kylegallatin/designing-ml-systems/blob/33a25f58fac160b60972aa3b3611d01b68e48964/examples/e2e-vertex/README.md

__Run these commands in cloud shell:__

```
gcloud iam service-accounts create sa-vertex-predictions     --display-name="Service account for registering models with Vertex"     --project=$GOOGLE_CLOUD_PROJECT

gcloud projects add-iam-policy-binding $GOOGLE_CLOUD_PROJECT     --member="serviceAccount:sa-vertex-predictions@$GOOGLE_CLOUD_PROJECT.iam.gserviceaccount.com"     --role="roles/storage.admin"

gcloud iam service-accounts add-iam-policy-binding sa-vertex-predictions@$GOOGLE_CLOUD_PROJECT.iam.gserviceaccount.com   --member="user:${USER_EMAIL}"   --role="roles/iam.serviceAccountUser"
```

__Next register the model, create an endpoint, and deploy the model to that endpoint.__

```
# replace with your own
ENDPOINT_ID=6491457276761604096
MODEL_ID=01234567890

# register the model with the model registry
gcloud ai models upload   --region=us-central1   --display-name=my-scikit-learn-model   --container-image-uri=us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-3:latest   --artifact-uri=gs://${USER}/vertex_models/sklearn

# creating an endpoint
gcloud ai endpoints create   --region=us-central1   --display-name=my-scikit-learn-model-ep

# deploying a model to that endpoint
gcloud ai endpoints deploy-model $ENDPOINT_ID   --region=us-central1   --model=$MODEL_ID   --display-name=my-scikit-learn-model-deploy   --machine-type=n2-standard-4
```

[1;31mERROR:[0m (gcloud.iam.service-accounts.create) The project property is set to the empty string, which is invalid.
To set your project, run:

  $ gcloud config set project PROJECT_ID

or to unset it, run:

  $ gcloud config unset project
[1;31mERROR:[0m (gcloud.projects.add-iam-policy-binding) argument PROJECT_ID: Must be specified.
Usage: gcloud projects add-iam-policy-binding PROJECT_ID --member=PRINCIPAL --role=ROLE [optional flags]
  optional flags may be  --condition | --condition-from-file | --help

For detailed information on this command and its flags, run:
  gcloud projects add-iam-policy-binding --help
[1;31mERROR:[0m (gcloud.iam.service-accounts.add-iam-policy-binding) INVALID_ARGUMENT: Unknown error
