In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
import mlflow

# set tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

#create a new MLflow Experiment
mlflow.set_experiment("MLflow for Fraud detection")

<Experiment: artifact_location='mlflow-artifacts:/668097690514392710', creation_time=1719263205613, experiment_id='668097690514392710', last_update_time=1719263205613, lifecycle_stage='active', name='MLflow for Fraud detection', tags={}>

In [4]:
# Load the datasets
fraud_df = pd.read_csv('../data/old/Fraud_Data.csv')
fraud_df

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,7.327584e+08,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,3.503114e+08,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2.621474e+09,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3.840542e+09,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,4.155831e+08,0
...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,2015-03-29 00:30:47,43,XPSKTWGPWINLR,SEO,Chrome,M,28,3.451155e+09,1
151108,274471,2015-05-15 17:43:29,2015-05-26 12:24:39,35,LYSFABUCPCGBA,SEO,Safari,M,32,2.439047e+09,0
151109,368416,2015-03-03 23:07:31,2015-05-20 07:07:47,40,MEQHCSJUBRBFE,SEO,IE,F,26,2.748471e+09,0
151110,207709,2015-07-09 20:06:07,2015-09-07 09:34:46,46,CMCXFGRHYSTVJ,SEO,Chrome,M,37,3.601175e+09,0


In [5]:
import category_encoders as ce
encoder= ce.OrdinalEncoder(cols=['source'],return_df=True,
                           mapping=[{'col':'source',
'mapping':{'None':0,'SEO':1,'Ads':2,'Direct':3}}])

In [6]:
fraud_df = encoder.fit_transform(fraud_df)

In [7]:
encoder= ce.OrdinalEncoder(cols=['browser'],return_df=True,
                           mapping=[{'col':'browser',
'mapping':{'None':0,'Chrome':1,'IE':2,'Safari':3,'FireFox':4,'Opera':5}}])

In [8]:
fraud_df = encoder.fit_transform(fraud_df)

In [9]:
encoder= ce.OrdinalEncoder(cols=['sex'],return_df=True,
                           mapping=[{'col':'sex',
'mapping':{'F':0,'M':1}}])

In [10]:
fraud_df = encoder.fit_transform(fraud_df)

In [11]:
fraud_df

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,1,1,1,39,7.327584e+08,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,2,1,0,53,3.503114e+08,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,1,5,1,53,2.621474e+09,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,1,3,1,41,3.840542e+09,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,2,3,1,45,4.155831e+08,0
...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,2015-03-29 00:30:47,43,XPSKTWGPWINLR,1,1,1,28,3.451155e+09,1
151108,274471,2015-05-15 17:43:29,2015-05-26 12:24:39,35,LYSFABUCPCGBA,1,3,1,32,2.439047e+09,0
151109,368416,2015-03-03 23:07:31,2015-05-20 07:07:47,40,MEQHCSJUBRBFE,1,2,0,26,2.748471e+09,0
151110,207709,2015-07-09 20:06:07,2015-09-07 09:34:46,46,CMCXFGRHYSTVJ,1,1,1,37,3.601175e+09,0


In [13]:
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'])
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])

fraud_df['signup_hour'] = fraud_df['signup_time'].apply(lambda x: x.hour)
fraud_df['signup_day'] = fraud_df['signup_time'].apply(lambda x: x.day)
fraud_df['purchase_hour'] = fraud_df['purchase_time'].apply(lambda x: x.hour)
fraud_df['purchase_day'] = fraud_df['purchase_time'].apply(lambda x: x.day)

fraud_df = fraud_df.drop(columns=['signup_time', 'purchase_time', 'device_id'])


In [130]:
fraud_df

Unnamed: 0,user_id,purchase_value,source,browser,sex,age,ip_address,class,hour_of_day,day_of_week,transaction_frequency,velocity,signup_hour,signup_day,purchase_hour,purchase_day
0,22058,34,1,1,1,39,732758368,0,2,18,0.055556,0.003086,22,24,2,18
1,333320,16,2,1,0,53,350311387,0,1,8,0.125000,0.015625,20,7,1,8
2,1359,15,1,5,1,53,2621473820,1,18,1,1.000000,1.000000,18,1,18,1
3,150084,44,1,3,1,41,3840542443,0,13,4,0.250000,0.062500,21,28,13,4
4,221365,39,2,3,1,45,415583117,0,18,9,0.111111,0.012346,7,21,18,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,43,1,1,1,28,3451154526,1,0,29,0.034483,0.001189,3,27,0,29
151108,274471,35,1,3,1,32,2439047221,0,12,26,0.038462,0.001479,17,15,12,26
151109,368416,40,1,2,0,26,2748470523,0,7,20,0.050000,0.002500,23,3,7,20
151110,207709,46,1,1,1,37,3601174708,0,9,7,0.142857,0.020408,20,9,9,7


In [14]:
from sklearn.model_selection import train_test_split
fraud_x = fraud_df.drop(columns=['class'])
fraud_y = fraud_df['class']
fraud_x_train, fraud_x_test, fraud_y_train, fraud_y_test = train_test_split(fraud_x, fraud_y, test_size=0.3, random_state=42)

In [15]:
# Define preprocessing for numerical and categorical features
numeric_features = ['purchase_value', 'age']
categorical_features = ['source', 'browser', 'sex', 'signup_hour', 'signup_day', 'purchase_hour', 'purchase_day']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

In [16]:

# Model Selection
models = {
    'LogisticRegression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'MLP': MLPClassifier()
}

# Function to train and evaluate models for fraud data
def train_evaluate_model_fraud(model_name, model, x_train, x_test, y_train, y_test):
    # Create a pipeline with preprocessing and model
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor), # Change the variable name to model_pipeline
                               ('classifier', model)])
    # Train the model
    model_pipeline.fit(x_train, y_train)

    # Make predictions
    y_pred = model_pipeline.predict(x_test) # Use the new variable name here as well

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Infer model signature
    signature = mlflow.models.infer_signature(x_train, y_pred) # Call infer_signature from mlflow
    #signature = infer_signature(x_train, y_pred)
    # log metrics and model with MLflow
    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.set_tag("Training Info", "All models for Fraud detection")
        model_info = mlflow.sklearn.log_model(
            sk_model=model_pipeline, # And here
            artifact_path="fraud_model",
            signature=signature,
            input_example=x_train,
            registered_model_name=f"{model_name}_fraud_detection",
        )
    return{
        'model': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }

In [17]:
fraud_x_train, fraud_x_test, fraud_y_train, fraud_y_test = train_test_split(fraud_x, fraud_y, test_size=0.3, random_state=42)
# Train and evaluate models on fraud data

fraud_results = []
for model_name, model in models.items():
    fraud_results.append(train_evaluate_model_fraud(model_name, model, fraud_x_train, fraud_x_test, fraud_y_train, fraud_y_test))
# Display results
fraud_results_df = pd.DataFrame(fraud_results)
print("Fraud Data Results:\n", fraud_results_df)

  _warn_prf(average, modifier, msg_start, len(result))
Successfully registered model 'LogisticRegression_fraud_detection'.
2024/06/27 19:08:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression_fraud_detection, version 1
Created version '1' of model 'LogisticRegression_fraud_detection'.
Successfully registered model 'Decision Tree_fraud_detection'.
2024/06/27 19:08:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Decision Tree_fraud_detection, version 1
Created version '1' of model 'Decision Tree_fraud_detection'.
Successfully registered model 'Random Forest_fraud_detection'.
2024/06/27 19:13:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest_fraud_detection, version 1
Created version '1' of model 'Random Forest_fraud_detection

Fraud Data Results:
                 model  accuracy  precision    recall  f1_score
0  LogisticRegression  0.906979   0.000000  0.000000  0.000000
1       Decision Tree  0.913619   0.533889  0.562248  0.547702
2       Random Forest  0.955089   0.954186  0.543277  0.692354
3   Gradient Boosting  0.945891   0.817266  0.538772  0.649421
4                 MLP  0.937111   0.705971  0.555134  0.621532


Created version '1' of model 'MLP_fraud_detection'.


In [19]:
# save merged dataframe
import sys, os
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from file_handler import FileHandler
file_handler = FileHandler()
file_handler.to_csv(fraud_df, '../data/raw/Fraud_Data.csv')