# Remark:
This code needs to be runned once

In [1]:
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import mlflow
import datetime
import pickle
import warnings
import numpy as np
from sklearn.model_selection import train_test_split
from arize.pandas.logger import Client, Schema
import datetime as dt
from arize.utils.types import ModelTypes, Environments
warnings.filterwarnings("ignore")

In [2]:
version = "v2.0"
data_url = "../data/fraud2.csv"

In [3]:
import sys  
sys.path.insert(0, '../backend/src')

import data_preprocessing_training
import clean_data_json

In [11]:
from data_preprocessing_monitoring import transform_data
from clean_data_json import clean_data_json

In [5]:
from dotenv import load_dotenv
import os
load_dotenv("../backend/src/.env")

DagsHub_username = os.getenv("DagsHub_username")
DagsHub_token=os.getenv("DagsHub_token")

In [6]:
import os
os.environ['MLFLOW_TRACKING_USERNAME']= DagsHub_username
os.environ["MLFLOW_TRACKING_PASSWORD"] = DagsHub_token

In [7]:
#setup mlflow
mlflow.set_tracking_uri('https://dagshub.com/.../....mlflow') #your mlfow tracking uri
mlflow.set_experiment("fraud-detector-experiment")

<Experiment: artifact_location='mlflow-artifacts:/b26c3925d9344fc1958b2572d31493fa', creation_time=1716970855752, experiment_id='0', last_update_time=1716970855752, lifecycle_stage='active', name='fraud-detector-experiment', tags={}>

In [8]:
#read the data
raw_train = pd.read_csv(data_url)

In [12]:
raw_train.head(3)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0


In [13]:
#cleaning and preprocessing
X,y = transform_data(raw_train)

In [15]:
#Reading Pandas Dataframe from mlflow
all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
df_mlflow = mlflow.search_runs(experiment_ids=all_experiments,filter_string="metrics.F1_score_test <1")
run_id = df_mlflow.loc[df_mlflow['metrics.F1_score_test'].idxmax()]['run_id']

#let's call the model from the model registry ( in production stage)
import mlflow.pyfunc

logged_model = f'runs:/{run_id}/ML_models'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
print(loaded_model)

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

mlflow.pyfunc.loaded_model:
  artifact_path: ML_models
  flavor: mlflow.xgboost
  run_id: e14ad66e453643a780dee296c0ae421c



## Transform the training data before sending it to Arize AI :

In [16]:
selected_cols = ['category','amt', 'gender', 'zip', 'lat', 'long', 'dob', 'merch_lat', 'merch_long',"trans_date_trans_time" , 'is_fraud']

In [17]:
baseline = raw_train[selected_cols]

In [20]:
baseline["trans_date_trans_time"] = pd.to_datetime(baseline["trans_date_trans_time"])

In [21]:

# Extract age of card holder column
baseline['age'] = dt.date.today().year - pd.to_datetime(baseline['dob']).dt.year
# drop unusefull columns
baseline.drop(["dob"], axis=1, inplace=True)

In [22]:

# create new columns day,month,year
baseline["year"] = baseline["trans_date_trans_time"].dt.year
baseline["month"] = baseline["trans_date_trans_time"].dt.month
baseline["day"] = baseline["trans_date_trans_time"].dt.day
# Extract hour,minute and second
baseline["hour"] = baseline["trans_date_trans_time"].dt.hour
baseline["month"] = baseline["trans_date_trans_time"].dt.month
baseline["sec"] = baseline["trans_date_trans_time"].dt.second

In [23]:
baseline.drop(["trans_date_trans_time"], axis=1, inplace=True)

In [24]:
baseline.rename(columns = {'is_fraud':'actual_label'}, inplace = True)

In [32]:
transform_bin_str = { 0 : 'non_fraud', 1 : 'fraud'}
baseline['actual_label'] = baseline['actual_label'].map(transform_bin_str)

In [18]:
preds = loaded_model.predict(X)

In [26]:
baseline['prediction_label'] = preds

In [33]:
baseline['prediction_label'] = baseline['prediction_label'].map(transform_bin_str)

In [27]:
import uuid
# Prediction ID is required for all datasets
def generate_prediction_ids(X):
    return pd.Series((str(uuid.uuid4()) for _ in range(len(X))), index=X.index)

In [28]:
baseline["prediction_id"]=generate_prediction_ids(baseline)

In [34]:
baseline.head(3)

Unnamed: 0,category,amt,gender,zip,lat,long,merch_lat,merch_long,actual_label,age,year,month,day,hour,sec,prediction_label,prediction_id
0,misc_net,4.97,F,28654,36.0788,-81.1781,36.011293,-82.048315,non_fraud,36,2019,1,1,0,18,non_fraud,b638c4f4-d612-43c2-9b94-6c8af634c44c
1,grocery_pos,107.23,F,99160,48.8878,-118.2105,49.159047,-118.186462,non_fraud,46,2019,1,1,0,44,non_fraud,cc50ea0f-d36e-43cd-9d7f-abd4f957a648
2,entertainment,220.11,M,83252,42.1808,-112.262,43.150704,-112.154481,non_fraud,62,2019,1,1,0,51,non_fraud,37cd7fd3-67f3-4157-aa8a-b6668060f932


## Setup Arize AI :

In [35]:
SPACE_KEY = "..."
API_KEY = "..."

arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY)

model_id = (
    "fraud-detector-model"  # This is the model name that will show up in Arize
)
model_version = "v2"  # Version of model - can be any string

if SPACE_KEY == "SPACE_KEY" or API_KEY == "API_KEY":
    raise ValueError("❌ NEED TO CHANGE SPACE AND/OR API_KEY")
else:
    print("✅ Arize setup complete!")

✅ Arize setup complete!


## Send train data to Arize AI :
the training data will be the reference data later in production

In [36]:
features = feature_column_names=list(baseline.columns.drop(
        ["prediction_id", "prediction_label", "actual_label"]))

In [37]:
# Define a Schema() object for Arize to pick up data from the correct columns for logging
training_schema = Schema(
    prediction_id_column_name="prediction_id",
    prediction_label_column_name="prediction_label",
    actual_label_column_name="actual_label",
    feature_column_names=features)

# Logging Training DataFrame
training_response = arize_client.log(
    dataframe=baseline,
    model_id=model_id,
    model_version=model_version,
    model_type=ModelTypes.SCORE_CATEGORICAL,
    environment=Environments.TRAINING,
    schema=training_schema,
)

# If successful, the server will return a status_code of 200
if training_response.status_code != 200:
    print(
        f"logging failed with response code {training_response.status_code}, {training_response.text}"
    )
else:
    print(f"✅ You have successfully logged training set to Arize")

[38;21m  arize.utils.logging | INFO | Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlvbjo3MjgyOktFcHc=/spaces/U3BhY2U6NzY1Njp5eHY1/models/modelName/fraud-detector-model?selectedTab=dataIngestion[0m
✅ You have successfully logged training set to Arize
