# Model Training Production Notebook

In [None]:
!pip install -U hopsworks --quiet

### Importing and connecting to Hopsworks

In [2]:
import hopsworks
import pandas as pd

# Log in to Hopsworks and fetch the project's feature store.
# `project` and `fs` are read as globals by later cells.
project = hopsworks.login()
fs = project.get_feature_store()

Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated

Paste it here: ··········
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/3342
Connected. Call `.close()` to terminate connection gracefully.




#### Creating a Feature View

In [3]:
feature_view_name = "cc_trans_fraud"


def create_feature_view(feature_view_name, version):
    """Create the fraud feature view from three feature groups.

    Args:
        feature_view_name: Name given to the new feature view. The same string
            is used to look up the transactions feature group, and with a
            ``'_4h'`` suffix the window-aggregation feature group.
        version: Version requested for all three feature groups. No version is
            passed for the feature view itself.

    Returns:
        Whatever ``fs.create_feature_view`` returns for the assembled query,
        with ``fraud_label`` declared as the label.

    Relies on the module-level ``fs`` feature-store handle.
    """
    # Source feature groups.
    transactions = fs.get_feature_group(feature_view_name, version=version)
    window_aggregates = fs.get_feature_group(feature_view_name + '_4h', version=version)
    fraud_labels = fs.get_feature_group('transactions_fraud_label', version=version)

    # Transformation functions registered in the feature store.
    scaler = fs.get_transformation_function(name="min_max_scaler")
    encoder = fs.get_transformation_function(name="label_encoder")

    # "category" is label-encoded; every other listed feature is min-max scaled.
    scaled_features = (
        "amount",
        "trans_volume_mavg",
        "trans_volume_mstd",
        "loc_delta_mavg",
        "trans_freq",
        "loc_delta_t_minus_1",
        "time_delta_t_minus_1",
        "age_at_transaction",
        "days_until_card_expires",
    )
    transformation_functions = {"category": encoder}
    transformation_functions.update({column: scaler for column in scaled_features})

    # Labels first, then the transaction features, then the window aggregates.
    training_query = fraud_labels.select(['tid', 'datetime', 'fraud_label'])
    training_query = training_query.join(
        transactions.select_except(["city", "country", "tid"])
    )
    training_query = training_query.join(
        window_aggregates.select_except(["cc_num", "datetime"])
    )

    return fs.create_feature_view(
        name=feature_view_name,
        query=training_query,
        labels=["fraud_label"],
        transformation_functions=transformation_functions,
    )

In [4]:
# Reuse version 1 of the feature view when it already exists; otherwise create it
# from version 2 of the underlying feature groups (the second argument is the
# feature-group version, not the feature-view version).
try:
    feature_view = fs.get_feature_view(name=feature_view_name, version=1)
except Exception:  # not a bare `except:` -- Ctrl-C / SystemExit must still propagate
    feature_view = create_feature_view(feature_view_name, 2)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/3342/fs/3289/fv/cc_trans_fraud/version/1


In [5]:
# Print the feature view's query rendered as a string (SQL) for inspection.
print(feature_view.query.to_string())

WITH right_fg0 AS (SELECT *
FROM (SELECT `fg2`.`tid` `tid`, `fg2`.`datetime` `datetime`, `fg2`.`fraud_label` `fraud_label`, `fg2`.`cc_num` `join_pk_cc_num`, `fg2`.`datetime` `join_evt_datetime`, `fg0`.`cc_num` `cc_num`, `fg0`.`category` `category`, `fg0`.`amount` `amount`, `fg0`.`age_at_transaction` `age_at_transaction`, `fg0`.`days_until_card_expires` `days_until_card_expires`, `fg0`.`loc_delta_t_minus_1` `loc_delta_t_minus_1`, `fg0`.`time_delta_t_minus_1` `time_delta_t_minus_1`, RANK() OVER (PARTITION BY `fg2`.`cc_num`, `fg2`.`datetime` ORDER BY `fg0`.`datetime` DESC) pit_rank_hopsworks
FROM `data_featurestore`.`transactions_fraud_label_2` `fg2`
INNER JOIN `data_featurestore`.`cc_trans_fraud_2` `fg0` ON `fg2`.`cc_num` = `fg0`.`cc_num` AND `fg2`.`datetime` >= `fg0`.`datetime`) NA
WHERE `pit_rank_hopsworks` = 1), right_fg1 AS (SELECT *
FROM (SELECT `fg2`.`tid` `tid`, `fg2`.`datetime` `datetime`, `fg2`.`fraud_label` `fraud_label`, `fg2`.`cc_num` `join_pk_cc_num`, `fg2`.`datetime` `join_

#### Creating Training Data (splitting the original data into a training set and a test set)

In [6]:
# Materialise a random train/test split (20% held out for testing) as CSV.
# The returned version number is what the next cell uses to read the split back.
td_version, td_job = feature_view.create_train_test_split(
    description = 'transactions fraud batch training dataset',
    data_format = "csv",
    test_size = 0.2,
    coalesce = True,
    seed = 42,  # fixed seed so that a fresh Run All requests the same random split
    statistics_config={
        "enabled": True,
        "histograms": False,
        "correlations": False
    },
)

Training dataset job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/3342/jobs/named/cc_trans_fraud_1_1_create_fv_td_25112022104001/executions




In [7]:
# Read back the features and labels of the training-dataset version held in `td_version`.
X_train, X_test, y_train, y_test = feature_view.get_train_test_split(training_dataset_version=td_version)

In [8]:
# Remove the identifier / event-time columns so only model features remain.
# Dropping by name (instead of slicing off the first three positions) keeps this
# cell idempotent: re-running it no longer strips three real feature columns.
ID_COLUMNS = ["tid", "datetime", "cc_num"]
X_train = X_train.drop(columns=ID_COLUMNS, errors="ignore")
X_test = X_test.drop(columns=ID_COLUMNS, errors="ignore")
X_train

Unnamed: 0,category,amount,age_at_transaction,days_until_card_expires,loc_delta_t_minus_1,time_delta_t_minus_1,trans_volume_mstd,trans_volume_mavg,trans_freq,loc_delta_mavg
0,0,0.000440,0.515553,0.702739,0.003549,0.075166,0.006230,0.006040,0.006040,0.124262
1,0,0.002051,0.898096,0.973872,0.228808,0.045650,0.032012,0.031826,0.031826,0.218505
2,1,0.001235,0.320079,0.355725,0.246717,0.038225,0.005996,0.005806,0.005806,0.235239
3,2,0.002643,0.637510,0.959680,0.100995,0.057440,0.019635,0.019447,0.019447,0.083247
4,3,0.001922,0.345375,0.560612,0.152721,0.000947,0.003516,0.003325,0.003325,0.229926
...,...,...,...,...,...,...,...,...,...,...
132807,2,0.000925,0.577192,0.604891,0.211795,0.000621,0.014771,0.014582,0.014582,0.215288
132808,0,0.000695,0.069399,0.876981,0.020728,0.010423,0.011286,0.011097,0.011097,0.081132
132809,0,0.000381,0.462497,0.552936,0.000007,0.027479,0.029034,0.028848,0.028848,0.081822
132810,5,0.000715,0.799196,0.101194,0.045633,0.000059,0.002977,0.002786,0.002786,0.206333


In [9]:
# Class balance of the held-out labels.
y_test.value_counts()

fraud_label
0              33211
1                 61
dtype: int64

### Model Training

In [10]:
import xgboost as xgb

# Fit an XGBoost classifier using the constructor's default hyperparameters.
clf = xgb.XGBClassifier()

clf.fit(X_train,y_train)



XGBClassifier()

In [11]:
from sklearn.metrics import classification_report

# Predict on the held-out features.
y_pred = clf.predict(X_test)

# Dict form of the report; later cells index it by class label.
report_dict = classification_report(y_test, y_pred, output_dict=True)

# Text form of the same report, for reading.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     33211
           1       1.00      0.10      0.18        61

    accuracy                           1.00     33272
   macro avg       1.00      0.55      0.59     33272
weighted avg       1.00      1.00      1.00     33272



In [12]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the true test labels against the predictions.
cm = confusion_matrix(y_test, y_pred)
cm

array([[33211,     0],
       [   55,     6]])

In [13]:
# Report entry stored under the key '0'.
print(report_dict['0'])

{'precision': 0.9983466602537125, 'recall': 1.0, 'f1-score': 0.9991726461783775, 'support': 33211}


### Register the model in the model registry

In [14]:
def register_python_model(model, name, description, features, labels, metrics):
    """Serialize a fitted model and register it in the project's model registry.

    The model is dumped with joblib to ``<name>_model/<name>_model.pkl`` in the
    current working directory, and that directory is then saved to the registry
    entry called ``name``.

    Args:
        model: Fitted estimator to serialize.
        name: Registry name of the model; also the prefix of the local
            directory and of the pickle file.
        description: Description stored with the registry entry.
        features: Training features. Used to build the input schema, and one
            row drawn with ``features.sample()`` is stored as the input example.
        labels: Training labels, used to build the output schema.
        metrics: Metrics passed through to the registry entry.

    Relies on the module-level ``project`` handle.
    """
    from hsml.schema import Schema
    from hsml.model_schema import ModelSchema
    import os
    import joblib

    mr = project.get_model_registry()

    # Local staging directory; everything inside it is saved with the model.
    model_dir = name + "_model"
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, name + "_model.pkl"))

    input_example = features.sample()
    model_schema = ModelSchema(Schema(features), Schema(labels))

    # Register under the caller-supplied name (previously hard-coded to "cc_fraud",
    # which silently ignored the `name` argument).
    registered_model = mr.python.create_model(
        name=name,
        metrics=metrics,
        model_schema=model_schema,
        input_example=input_example,
        description=description,
    )

    # Save all artifacts in the model directory to the model registry
    registered_model.save(model_dir)


# Register the fitted classifier, attaching the report entry for label '0' as its metrics.
# NOTE(review): '0' is the majority (non-fraud) label in y_test; confirm whether the
# fraud-class entry report_dict['1'] was the intended registry metric.
register_python_model(clf, "cc_fraud", "Credit Card Fraud Predictor", X_train, y_train, report_dict['0'])

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Model created, explore it at https://c.app.hopsworks.ai:443/p/3342/models/cc_fraud/1
