In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import catboost as ctb
import lightgbm as lgb

import mlflow
import mlflow.sklearn
import mlflow.xgboost

import os

pd.pandas.set_option('display.max_columns', None)
%matplotlib inline

In [2]:
remote_server_uri = 'https://dagshub.com/adam.zabek/Credit_Card_Fraud_Detection.mlflow' ### insert url to remote server
mlflow.set_tracking_uri(remote_server_uri)

In [3]:
os.environ['MLFLOW_TRACKING_USERNAME'] = 'adam.zabek' ### insert name
os.environ['MLFLOW_TRACKING_PASSWORD'] = 'e3c5d562249c76008f52ba8e9f3e0c7b416512db' ### insert password

In [4]:
mlflow.set_experiment("credit_card_fraud_detection")

<Experiment: artifact_location='mlflow-artifacts:/02432f41fc9a40cca9546662b2719817', creation_time=1696352191511, experiment_id='0', last_update_time=1696352191511, lifecycle_stage='active', name='credit_card_fraud_detection', tags={}>

In [5]:
df = pd.read_csv('./df_selected.csv')

In [6]:
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,-0.701082,-0.041687,1.680101,0.976623,-0.24702,0.348012,0.1937,0.084434,0.333534,0.085688,-0.541662,-0.620391,-0.99655,-0.32705,1.603615,-0.539734,0.246646,0.02899,0.497011,0.326273,-0.024777,0.383483,-0.177444,0.110157,0.247059,-0.392622,0.333033,-0.06585,0
1,0.608792,0.164138,0.109279,0.318998,0.042258,-0.06098,-0.065656,0.072903,-0.231703,-0.153784,1.582896,1.071659,0.490735,-0.151252,0.693541,0.529653,-0.136468,-0.220781,-0.178907,-0.089963,-0.311372,-0.881454,0.162081,-0.561503,0.321175,0.260854,-0.027154,0.043219,0
2,-0.700336,-0.811337,1.17427,0.270648,-0.366756,1.352655,0.643223,0.210788,-1.381169,0.194242,0.612829,0.067156,0.719981,-0.174539,2.56295,-3.309216,1.31726,-0.146738,-2.780497,0.681564,0.343094,1.065068,1.457772,-1.138484,-0.628161,-0.288861,-0.144325,-0.183824,0
3,-0.499064,-0.109972,1.187383,-0.608355,-0.008814,0.937245,0.192079,0.320843,-1.264664,-0.049713,-0.222524,0.179901,0.509483,-0.302638,-0.691296,-1.214165,-0.812176,2.345732,-1.515112,-0.270428,-0.149093,0.007299,-0.305465,-1.941446,1.242487,-0.460694,0.154039,0.185687,0
4,-0.597606,0.535539,1.02547,0.287092,-0.297036,0.072873,0.481517,-0.228725,0.747917,0.700958,-0.807922,0.541797,1.351427,-1.176125,0.190272,-0.518043,-0.281545,-0.047422,0.988165,0.530343,-0.012516,1.10178,-0.220709,0.232904,-0.3948,1.041677,0.550001,0.654234,0


In [7]:
df['Class'].value_counts()

0    283253
1       473
Name: Class, dtype: int64

In [8]:
black_list = ['Class']

In [9]:
def get_feats(df, black_list):
    feats = df.columns
    return [x for x in feats if x not in black_list]

In [10]:
def get_X(df, feats):
    
    X = df[feats]
    return X

def get_y(df, target_var):
    return df[target_var].values

## Undersampling - random selection Non-Fraud observation to train model in ratio 1:4 Fraud:Non-Fraud.

In [11]:
def select_sample(df, target, class_name, number_of_sample):
    
    select = df[df[target] == class_name]
    selected_sample = select.sample(n = number_of_sample, replace = False)
    return selected_sample

In [12]:
non_fraud = select_sample(df, 'Class', 0, 1892)

In [13]:
fraud = select_sample(df, 'Class', 1, 473)

In [14]:
selected_sample = pd.concat([fraud, non_fraud])

In [15]:
print(selected_sample['Class'].value_counts()/len(selected_sample))

0    0.8
1    0.2
Name: Class, dtype: float64


## Take feats to train model.

In [16]:
feats = get_feats(selected_sample, black_list)

In [17]:
X = get_X(selected_sample, feats)

In [18]:
y = get_y(selected_sample, target_var = 'Class')

## Split data into train/test dataset.

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size = .8, shuffle = True, stratify = y)

## Train model

In [20]:
model_param = [
    {
        'model_type': 'DecisionTree',
        'params': {
            'criterion': 'gini',
            'max_depth': 10,
            'random_state': 0
        }
    },
    {
        'model_type': 'RandomForest',
        'params': {
            'n_estimators': 100,
            'max_depth': 10,
            'random_state': 0
        }
    },
    {
        'model_type': 'CatBoost',
        'params': {
            #'iterations': 2000,
            'depth': 10,
            'learning_rate': 0.01,
            'n_estimators': 1000,
            'loss_function': 'Logloss',
            'early_stopping_rounds': 50,
            'random_state': 0
        }
    },
    {
        'model_type': 'LightGBM',
        'params': {
            'boosting_type': 'dart',
            'num_leaves': 20,
            'max_depth': 10,
            'n_estimators': 100,
            'learning_rate': 0.01,
            'subsample': 0.9,
            'colsample_bytree': 0.8,
            'random_state': 0
        }
    },
    {
        'model_type': 'XGBoost',
        'params': {
            'max_depth': 10,
            'learning_rate': 0.01,
            'n_estimators': 100,
            'subsample': 0.9,
            'colsample_bytree': 0.8,
            'random_state': 0
        }
    }
]

In [21]:
def train_models(models, model_param, X_train, X_test, y_train, y_test):
    mlflow.sklearn.autolog(disable=True)
    
    labels = ['Non Fraud', 'Fraud']
    
    for model_name in models:
        model_params  = next((params for params in model_param if params['model_type'] == model_name), None)
        
        if model_params is None:
            print(f"Model '{model_name}' not found in the parameters list.")
            continue
        
        with mlflow.start_run(run_name=f'{model_name}_Model'):
            if model_name == 'DecisionTree':
                
                mlflow.set_tag("model_name", "DT")
                mlflow.log_params(model_params)
            
                model = DecisionTreeClassifier(**model_params['params'])
                model.fit(X_train, y_train)
            
                y_pred = model.predict(X_test)
            
                auc = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)
                rec = recall_score(y_test, y_pred)
                prec = precision_score(y_test, y_pred)
                
                print(classification_report(y_test, y_pred, target_names=labels))
                
                mlflow.log_metric("test_auc", auc)
                mlflow.log_metric("test_f1", f1)
                mlflow.log_metric("test_rec", rec)
                mlflow.log_metric("test_prec", prec)

                mlflow.sklearn.log_model(model, f"{model_name}_Model")
            
            elif model_name == 'RandomForest':

                mlflow.set_tag("model_name", "RF")
                mlflow.log_params(model_params)
                        
                model = RandomForestClassifier(**model_params['params'])
                model.fit(X_train, y_train)
            
                y_pred = model.predict(X_test)
            
                auc = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)
                rec = recall_score(y_test, y_pred)
                prec = precision_score(y_test, y_pred)
                
                print(classification_report(y_test, y_pred, target_names=labels))

                mlflow.log_metric("test_auc", auc)
                mlflow.log_metric("test_f1", f1)
                mlflow.log_metric("test_rec", rec)
                mlflow.log_metric("test_prec", prec)

                mlflow.sklearn.log_model(model, f"{model_name}_Model")
            
            elif model_name == 'CatBoost':

                mlflow.set_tag("model_name", "CB")
                mlflow.log_params(model_params)
                                    
                model = ctb.CatBoostClassifier(**model_params['params'])
                model.fit(X_train, y_train)
            
                y_pred = model.predict(X_test)
            
                auc = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)
                rec = recall_score(y_test, y_pred)
                prec = precision_score(y_test, y_pred)
                
                print(classification_report(y_test, y_pred, target_names=labels))
                
                mlflow.log_metric("test_auc", auc)
                mlflow.log_metric("test_f1", f1)
                mlflow.log_metric("test_rec", rec)
                mlflow.log_metric("test_prec", prec)

                mlflow.sklearn.log_model(model, f"{model_name}_Model")
            
            elif model_name == 'LightGBM':

                mlflow.set_tag("model_name", "LGB")
                mlflow.log_params(model_params)
                                    
                model = lgb.LGBMClassifier(**model_params['params'])
                model.fit(X_train, y_train)
            
                y_pred = model.predict(X_test)
            
                auc = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)
                rec = recall_score(y_test, y_pred)
                prec = precision_score(y_test, y_pred)
                
                print(classification_report(y_test, y_pred, target_names=labels))
                
                mlflow.log_metric("test_auc", auc)
                mlflow.log_metric("test_f1", f1)
                mlflow.log_metric("test_rec", rec)
                mlflow.log_metric("test_prec", prec)

                mlflow.sklearn.log_model(model, f"{model_name}_Model")
            
            elif model_name == 'XGBoost':

                mlflow.set_tag("model_name", "XGB")
                mlflow.log_params(model_params)
                                    
                model = xgb.XGBClassifier(**model_params['params'])
                model.fit(X_train, y_train)
            
                y_pred = model.predict(X_test)
            
                auc = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)
                rec = recall_score(y_test, y_pred)
                prec = precision_score(y_test, y_pred)
                
                print(classification_report(y_test, y_pred, target_names=labels))
                
                mlflow.log_metric("test_auc", auc)
                mlflow.log_metric("test_f1", f1)
                mlflow.log_metric("test_rec", rec)
                mlflow.log_metric("test_prec", prec)

                mlflow.sklearn.log_model(model, f"{model_name}_Model")

In [23]:
models = ['DecisionTree','RandomForest','CatBoost','LightGBM','XGBoost']
train_models(models, model_param, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

   Non Fraud       0.96      0.97      0.97       378
       Fraud       0.89      0.85      0.87        95

    accuracy                           0.95       473
   macro avg       0.93      0.91      0.92       473
weighted avg       0.95      0.95      0.95       473

              precision    recall  f1-score   support

   Non Fraud       0.96      1.00      0.98       378
       Fraud       0.99      0.82      0.90        95

    accuracy                           0.96       473
   macro avg       0.97      0.91      0.94       473
weighted avg       0.96      0.96      0.96       473

0:	learn: 0.6762895	total: 58.3ms	remaining: 58.3s
1:	learn: 0.6602934	total: 122ms	remaining: 1m
2:	learn: 0.6442081	total: 188ms	remaining: 1m 2s
3:	learn: 0.6282370	total: 252ms	remaining: 1m 2s
4:	learn: 0.6138506	total: 310ms	remaining: 1m 1s
5:	learn: 0.5984205	total: 370ms	remaining: 1m 1s
6:	learn: 0.5844199	total: 437ms	remaining: 1m 2

151:	learn: 0.0800843	total: 10.4s	remaining: 57.9s
152:	learn: 0.0796053	total: 10.4s	remaining: 57.9s
153:	learn: 0.0790554	total: 10.5s	remaining: 57.8s
154:	learn: 0.0786343	total: 10.6s	remaining: 57.8s
155:	learn: 0.0781130	total: 10.7s	remaining: 57.8s
156:	learn: 0.0775221	total: 10.7s	remaining: 57.7s
157:	learn: 0.0768695	total: 10.8s	remaining: 57.7s
158:	learn: 0.0764055	total: 10.9s	remaining: 57.6s
159:	learn: 0.0759602	total: 11s	remaining: 57.6s
160:	learn: 0.0754766	total: 11s	remaining: 57.6s
161:	learn: 0.0749875	total: 11.1s	remaining: 57.5s
162:	learn: 0.0744593	total: 11.2s	remaining: 57.5s
163:	learn: 0.0739145	total: 11.3s	remaining: 57.5s
164:	learn: 0.0733654	total: 11.4s	remaining: 57.6s
165:	learn: 0.0728132	total: 11.4s	remaining: 57.5s
166:	learn: 0.0723728	total: 11.5s	remaining: 57.5s
167:	learn: 0.0719613	total: 11.6s	remaining: 57.4s
168:	learn: 0.0715677	total: 11.7s	remaining: 57.4s
169:	learn: 0.0711173	total: 11.7s	remaining: 57.3s
170:	learn: 0.07

310:	learn: 0.0350250	total: 22.9s	remaining: 50.8s
311:	learn: 0.0348564	total: 23s	remaining: 50.8s
312:	learn: 0.0347148	total: 23.1s	remaining: 50.8s
313:	learn: 0.0345718	total: 23.2s	remaining: 50.7s
314:	learn: 0.0344275	total: 23.3s	remaining: 50.7s
315:	learn: 0.0343184	total: 23.4s	remaining: 50.6s
316:	learn: 0.0341854	total: 23.5s	remaining: 50.6s
317:	learn: 0.0340481	total: 23.5s	remaining: 50.5s
318:	learn: 0.0339104	total: 23.6s	remaining: 50.4s
319:	learn: 0.0337516	total: 23.7s	remaining: 50.4s
320:	learn: 0.0336103	total: 23.8s	remaining: 50.3s
321:	learn: 0.0334601	total: 23.9s	remaining: 50.3s
322:	learn: 0.0333685	total: 24s	remaining: 50.2s
323:	learn: 0.0332136	total: 24.1s	remaining: 50.2s
324:	learn: 0.0330793	total: 24.2s	remaining: 50.2s
325:	learn: 0.0329670	total: 24.3s	remaining: 50.2s
326:	learn: 0.0328340	total: 24.4s	remaining: 50.1s
327:	learn: 0.0327260	total: 24.5s	remaining: 50.1s
328:	learn: 0.0326102	total: 24.5s	remaining: 50.1s
329:	learn: 0.03

471:	learn: 0.0203009	total: 35.4s	remaining: 39.6s
472:	learn: 0.0202421	total: 35.4s	remaining: 39.5s
473:	learn: 0.0201759	total: 35.5s	remaining: 39.4s
474:	learn: 0.0201207	total: 35.6s	remaining: 39.4s
475:	learn: 0.0200613	total: 35.7s	remaining: 39.3s
476:	learn: 0.0200018	total: 35.8s	remaining: 39.2s
477:	learn: 0.0199360	total: 35.8s	remaining: 39.1s
478:	learn: 0.0198698	total: 35.9s	remaining: 39.1s
479:	learn: 0.0198025	total: 36s	remaining: 39s
480:	learn: 0.0197458	total: 36.1s	remaining: 38.9s
481:	learn: 0.0196929	total: 36.1s	remaining: 38.8s
482:	learn: 0.0196386	total: 36.2s	remaining: 38.8s
483:	learn: 0.0195855	total: 36.3s	remaining: 38.7s
484:	learn: 0.0195201	total: 36.4s	remaining: 38.6s
485:	learn: 0.0194627	total: 36.4s	remaining: 38.5s
486:	learn: 0.0194131	total: 36.5s	remaining: 38.5s
487:	learn: 0.0193690	total: 36.6s	remaining: 38.4s
488:	learn: 0.0193281	total: 36.7s	remaining: 38.3s
489:	learn: 0.0192691	total: 36.7s	remaining: 38.2s
490:	learn: 0.01

631:	learn: 0.0134035	total: 47.7s	remaining: 27.8s
632:	learn: 0.0133723	total: 47.8s	remaining: 27.7s
633:	learn: 0.0133406	total: 47.9s	remaining: 27.6s
634:	learn: 0.0133105	total: 47.9s	remaining: 27.6s
635:	learn: 0.0132785	total: 48s	remaining: 27.5s
636:	learn: 0.0132454	total: 48.1s	remaining: 27.4s
637:	learn: 0.0132090	total: 48.2s	remaining: 27.3s
638:	learn: 0.0131836	total: 48.2s	remaining: 27.2s
639:	learn: 0.0131463	total: 48.3s	remaining: 27.2s
640:	learn: 0.0131216	total: 48.4s	remaining: 27.1s
641:	learn: 0.0130948	total: 48.4s	remaining: 27s
642:	learn: 0.0130652	total: 48.5s	remaining: 26.9s
643:	learn: 0.0130429	total: 48.6s	remaining: 26.9s
644:	learn: 0.0130135	total: 48.7s	remaining: 26.8s
645:	learn: 0.0129839	total: 48.8s	remaining: 26.7s
646:	learn: 0.0129572	total: 48.8s	remaining: 26.6s
647:	learn: 0.0129276	total: 48.9s	remaining: 26.6s
648:	learn: 0.0128882	total: 49s	remaining: 26.5s
649:	learn: 0.0128638	total: 49s	remaining: 26.4s
650:	learn: 0.012839

790:	learn: 0.0096186	total: 59.8s	remaining: 15.8s
791:	learn: 0.0096042	total: 59.9s	remaining: 15.7s
792:	learn: 0.0095887	total: 60s	remaining: 15.6s
793:	learn: 0.0095692	total: 1m	remaining: 15.6s
794:	learn: 0.0095495	total: 1m	remaining: 15.5s
795:	learn: 0.0095377	total: 1m	remaining: 15.4s
796:	learn: 0.0095228	total: 1m	remaining: 15.3s
797:	learn: 0.0094987	total: 1m	remaining: 15.3s
798:	learn: 0.0094824	total: 1m	remaining: 15.2s
799:	learn: 0.0094605	total: 1m	remaining: 15.1s
800:	learn: 0.0094427	total: 1m	remaining: 15s
801:	learn: 0.0094207	total: 1m	remaining: 15s
802:	learn: 0.0094006	total: 1m	remaining: 14.9s
803:	learn: 0.0093832	total: 1m	remaining: 14.8s
804:	learn: 0.0093640	total: 1m	remaining: 14.7s
805:	learn: 0.0093488	total: 1m	remaining: 14.7s
806:	learn: 0.0093312	total: 1m 1s	remaining: 14.6s
807:	learn: 0.0093135	total: 1m 1s	remaining: 14.5s
808:	learn: 0.0092920	total: 1m 1s	remaining: 14.4s
809:	learn: 0.0092693	total: 1m 1s	remaining: 14.4s
810:	

949:	learn: 0.0073385	total: 1m 11s	remaining: 3.77s
950:	learn: 0.0073280	total: 1m 11s	remaining: 3.7s
951:	learn: 0.0073159	total: 1m 11s	remaining: 3.62s
952:	learn: 0.0073009	total: 1m 11s	remaining: 3.55s
953:	learn: 0.0072887	total: 1m 12s	remaining: 3.47s
954:	learn: 0.0072767	total: 1m 12s	remaining: 3.4s
955:	learn: 0.0072651	total: 1m 12s	remaining: 3.32s
956:	learn: 0.0072537	total: 1m 12s	remaining: 3.25s
957:	learn: 0.0072414	total: 1m 12s	remaining: 3.17s
958:	learn: 0.0072296	total: 1m 12s	remaining: 3.09s
959:	learn: 0.0072193	total: 1m 12s	remaining: 3.02s
960:	learn: 0.0072046	total: 1m 12s	remaining: 2.94s
961:	learn: 0.0071896	total: 1m 12s	remaining: 2.87s
962:	learn: 0.0071808	total: 1m 12s	remaining: 2.79s
963:	learn: 0.0071690	total: 1m 12s	remaining: 2.72s
964:	learn: 0.0071595	total: 1m 12s	remaining: 2.64s
965:	learn: 0.0071501	total: 1m 12s	remaining: 2.56s
966:	learn: 0.0071393	total: 1m 12s	remaining: 2.49s
967:	learn: 0.0071306	total: 1m 13s	remaining: 2