In [3]:
import pandas as pd
from pathlib import Path
from datetime import datetime

import numpy as np
import math
import random
import matplotlib.pyplot as plt


current_date = datetime.now()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Import dataset.
# The original cell loaded fraudTest.csv into Train_df and fraudTrain.csv into
# Test_df; each frame is now read from the file whose name matches it.
# NOTE(review): the path looks like a mounted Google Drive folder - adjust it
# when running outside that environment.
path='/content/drive/MyDrive/dash_competition'
Train_df = pd.read_csv(path+'/fraudTrain.csv')
Test_df = pd.read_csv(path+'/fraudTest.csv')

In [5]:
# Copy of the dataset: later cells work on these copies rather than on the
# frames returned by read_csv.
rawTrain_df, rawTest_df = Train_df.copy(), Test_df.copy()

In [6]:

'''
DATASET CLEANER
'''

def df_cleaner(df: pd.DataFrame, name: str, reference_date=None) -> pd.DataFrame:
    """Return a cleaned copy of a raw transactions frame and save it as ``{name}.csv``.

    Cleaning steps:
      * ``age``  - whole years between ``dob`` and ``reference_date``
        (days // 365, the same approximation the notebook always used).
      * ``hour`` - hour of day taken from ``trans_date_trans_time``.
      * identifier / location / raw date columns are dropped.
      * ``amt`` -> ``amount`` and ``city_pop`` -> ``city_population``.

    The input frame is left untouched, so calling the function again on the
    same raw frame works.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame containing every column referenced above.
    name : str
        Output file stem; the cleaned frame is written to ``f'{name}.csv'``.
    reference_date : datetime, optional
        Date against which ages are computed. Defaults to the notebook-level
        ``current_date``; pass a fixed date for reproducible ages.

    Returns
    -------
    pd.DataFrame
        The cleaned copy.

    Raises
    ------
    ValueError
        If ``df`` is not a DataFrame, or a date/age conversion fails.
    TypeError
        If a date/age conversion fails with a type error.
    KeyError
        If an expected column is missing (raised by pandas).
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError('Error: File must be pandas DataFrames.')

    if reference_date is None:
        # Resolved lazily so an explicit reference_date never needs the global.
        reference_date = current_date

    dropped_columns = ['Unnamed: 0', 'cc_num', 'merchant', 'first', 'last', 'street', 'city', 'zip',
                       'lat', 'long', 'job', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'dob',
                       'trans_date_trans_time']  # columns removed as not generalizable

    cleaned = df.copy()  # never mutate the caller's frame
    try:
        birth_dates = pd.to_datetime(cleaned['dob'])
        # Vectorised floor(days / 365); astype(int) raises ValueError on missing dates.
        cleaned['age'] = ((reference_date - birth_dates).dt.days // 365).astype(int)

        cleaned['hour'] = pd.to_datetime(cleaned['trans_date_trans_time']).dt.hour

        cleaned = (
            cleaned
            .drop(columns=dropped_columns)
            .rename(columns={'amt': 'amount', 'city_pop': 'city_population'})
        )

        cleaned.to_csv(f'{name}.csv')

    except (TypeError, ValueError) as err:
        # Report, then propagate: returning a half-cleaned frame would only
        # surface later as a confusing failure in the modelling cells.
        print(f"Error cleaning DataFrame: {str(err)}")
        raise

    return cleaned

In [7]:
# Keeping cleaned dataset: run the cleaner on each raw copy. The second
# argument is the file stem df_cleaner uses when it writes the CSV.
train_df, test_df = (
    df_cleaner(raw_frame, file_stem)
    for raw_frame, file_stem in ((rawTrain_df, 'Train_df'), (rawTest_df, 'Test_df'))
)

In [10]:
# The problem of an unbalanced dataset: apply encoding and undersampling.
#
# Ordinal encoding is used instead of dummy columns.
#
# Undersampling was chosen because oversampling would introduce fictitious
# data, and class weights did not perform as well as undersampling.
# The number of fraud rows is large enough, so the same number of non-fraud
# rows is drawn at random.

UNDERSAMPLE_SEED = 0  # fixed seed so the balanced sample is the same on every run
categorical_columns = ['category', 'gender', 'state', 'hour']

df = pd.concat([train_df, test_df], axis=0)

# One encoder fitted on all four columns at once (previously it was refit per
# column, leaving it fitted on 'hour' only). The encoded values are unchanged.
encoder = OrdinalEncoder()
df[categorical_columns] = encoder.fit_transform(df[categorical_columns])

df_fraud = df[df['is_fraud'] == 1]
df_nofraud1 = df[df['is_fraud'] == 0]

# Draw as many non-fraud rows as there are fraud rows. The count is derived
# from the data instead of being hard-coded, and capped by the available rows.
n_muestra = min(len(df_fraud), len(df_nofraud1))
muestra_aleatoria = random.Random(UNDERSAMPLE_SEED).sample(range(len(df_nofraud1)), n_muestra)

# select rows of random sample (positional indices, hence iloc)
df_nofraud = df_nofraud1.iloc[muestra_aleatoria]

df_undersampled = pd.concat([df_fraud, df_nofraud], axis=0)

In [11]:
# Applying StandardScaler: hold out 20% of the balanced sample (stratified on
# the label), fit the scaler on the training part only and apply it to both.

y = df_undersampled['is_fraud']  # Target
X = df_undersampled.drop(columns=['is_fraud'])  # Features
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [14]:
# Testing many models: fit each classifier with default settings on the scaled
# training split and score it on the scaled hold-out split.

clfs = [LogisticRegression(),
            Perceptron(),
            SVC(),
            AdaBoostClassifier(),
            GradientBoostingClassifier(),
            XGBClassifier(),
            XGBRFClassifier(),
            RandomForestClassifier(),
            KNeighborsClassifier()
            ]
# Display names, in the same order as clfs
names = ['Logistic Regression',
             'Perceptron',
             'SVC(Support Vector Classifier)',
             'Ada Boost Classifier',
             'Gradient Boosting Classifier',
             'XGB Classifier',
             'XGBRF Classifier',
             'Random Forest Classifier',
             'K Neighbors Classifier'
             ]

ranking_dic = {}   # model name -> hold-out accuracy
modelos_dic = {}   # model name -> {'conf_matrix': ..., 'accuracy': ...}


for clf, name in zip(clfs, names):

    print(name)

    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    conf_matrix = confusion_matrix(y_test, y_pred)
    clas_report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    ranking_dic[name] = accuracy
    modelos_dic[name] = {'conf_matrix': conf_matrix, 'accuracy': accuracy}

    # The value comes from accuracy_score, so it is labelled as plain accuracy
    # (it used to be printed as "Balanced Accuracy Score").
    print(f'{clas_report}\n\n Confusion Matrix \n {conf_matrix} \n\n Accuracy Score: {accuracy}')
    print('--------------------------------------------------------------------------------------------------')

Logistic Regression
              precision    recall  f1-score   support

           0       0.79      0.94      0.86      1931
           1       0.93      0.75      0.83      1930

    accuracy                           0.85      3861
   macro avg       0.86      0.85      0.85      3861
weighted avg       0.86      0.85      0.85      3861


 Confusion Matrix 
 [[1820  111]
 [ 475 1455]] 

 Balanced Accuracy Score: 0.8482258482258482
--------------------------------------------------------------------------------------------------
Perceptron
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      1931
           1       0.70      0.73      0.71      1930

    accuracy                           0.71      3861
   macro avg       0.71      0.71      0.71      3861
weighted avg       0.71      0.71      0.71      3861


 Confusion Matrix 
 [[1317  614]
 [ 520 1410]] 

 Balanced Accuracy Score: 0.7062937062937062
--------------------------

In [15]:
# Making the ranking: (name, score) pairs, highest score first.
# dict(...) accepts both the original dict and the list of pairs this cell
# produces, so re-running the cell no longer fails on `.items()`.
ranking_dic = sorted(dict(ranking_dic).items(), key=lambda x: x[1], reverse=True)

In [16]:
# Display the current contents of ranking_dic as the cell output.
ranking_dic

[('XGB Classifier', 0.9787619787619788),
 ('Random Forest Classifier', 0.9764309764309764),
 ('Gradient Boosting Classifier', 0.9658119658119658),
 ('XGBRF Classifier', 0.957005957005957),
 ('Ada Boost Classifier', 0.9399119399119399),
 ('K Neighbors Classifier', 0.9127169127169127),
 ('SVC(Support Vector Classifier)', 0.8852628852628852),
 ('Logistic Regression', 0.8482258482258482),
 ('Perceptron', 0.7062937062937062)]

In [17]:
# Cross-validation of every classifier on the scaled training split, to get
# more robust results than a single hold-out score.

FOLDS = 5  # number of cross-validation folds
cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=10)

# NumPy arrays, so the fold indices returned by cv.split can index them directly
x_train2 = np.array(X_train_scaled)
y_train2 = np.array(y_train)

defec_accu = []        # mean fold score per model, as a percentage rounded to 2 decimals
ranking_dic_avg = {}   # model name -> mean fold score

# Iterate over each classifier and perform cross-validation
for clfi, name in zip(clfs, names):

    print(f'{name} :\n')
    avg_accuracy = 0

    for fold, (train_idx, val_idx) in enumerate(cv.split(x_train2, y_train2)):

        # Fit on the training part of the fold, predict its validation part
        clfi.fit(x_train2[train_idx], y_train2[train_idx])
        fold_predictions = clfi.predict(x_train2[val_idx])

        balanced_acc = balanced_accuracy_score(y_train2[val_idx], fold_predictions)
        avg_accuracy += balanced_acc
        print(f"Precisión test fold {fold + 1}: {balanced_acc}")

    avg_accuracy /= FOLDS
    ranking_dic_avg[name] = avg_accuracy

    defec_accu.append(round(avg_accuracy * 100, 2))
    print(f'Avg. accuracy = {avg_accuracy * 100.0 :.2f}\n')


Logistic Regression :

Precisión test fold 1: 0.863414491003907
Precisión test fold 2: 0.8516839378238342
Precisión test fold 3: 0.858160621761658
Precisión test fold 4: 0.8474740932642487
Precisión test fold 5: 0.846502590673575
Avg. accuracy = 85.34

Perceptron :

Precisión test fold 1: 0.6237426010698056
Precisión test fold 2: 0.7490284974093264
Precisión test fold 3: 0.7363989637305699
Precisión test fold 4: 0.7697538860103628
Precisión test fold 5: 0.6881476683937824
Avg. accuracy = 71.34

SVC(Support Vector Classifier) :

Precisión test fold 1: 0.8922076479366836
Precisión test fold 2: 0.8856865284974094
Precisión test fold 3: 0.8895725388601037
Precisión test fold 4: 0.8792098445595855
Precisión test fold 5: 0.873380829015544
Avg. accuracy = 88.40

Ada Boost Classifier :

Precisión test fold 1: 0.9394633365192749
Precisión test fold 2: 0.939119170984456
Precisión test fold 3: 0.9436528497409327
Precisión test fold 4: 0.9387953367875648
Precisión test fold 5: 0.9394430051813472
A

In [18]:
# Rank models by mean cross-validation score, highest first.
# dict(...) accepts both the original dict and the list of pairs this cell
# produces, so re-running the cell no longer fails on `.items()`.
ranking_dic_avg = sorted(dict(ranking_dic_avg).items(), key=lambda x: x[1], reverse=True)

In [19]:
# Side-by-side view: hold-out ranking | cross-validation ranking.
# zip() follows however many models were ranked instead of assuming exactly nine.
for holdout_entry, cv_entry in zip(ranking_dic, ranking_dic_avg):
  print(f'{holdout_entry}    |     {cv_entry}')

('XGB Classifier', 0.9787619787619788)    |     ('XGB Classifier', 0.9775275835471268)
('Random Forest Classifier', 0.9764309764309764)    |     ('Random Forest Classifier', 0.97318854067106)
('Gradient Boosting Classifier', 0.9658119658119658)    |     ('Gradient Boosting Classifier', 0.9628912420141859)
('XGBRF Classifier', 0.957005957005957)    |     ('XGBRF Classifier', 0.956933028153663)
('Ada Boost Classifier', 0.9399119399119399)    |     ('Ada Boost Classifier', 0.940094739842715)
('K Neighbors Classifier', 0.9127169127169127)    |     ('K Neighbors Classifier', 0.9002002112782334)
('SVC(Support Vector Classifier)', 0.8852628852628852)    |     ('SVC(Support Vector Classifier)', 0.8840114777738652)
('Logistic Regression', 0.8482258482258482)    |     ('Logistic Regression', 0.8534471469054445)
('Perceptron', 0.7062937062937062)    |     ('Perceptron', 0.7134143233227694)


In [20]:
# Candidate values sampled by the randomized search, one grid per model.

# XGBClassifier
xgb_param_grid = dict(
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[100, 150, 200, 250],
    max_depth=[3, 5, 7, 9],
    min_child_weight=[1, 3, 5, 7],
    subsample=[0.8, 0.9, 1.0],
    colsample_bynode=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    reg_lambda=[0, 1, 5, 10],
    reg_alpha=[0, 1, 5, 10],
)

# GradientBoostingClassifier
gradient_boosting_param_grid = dict(
    learning_rate=[0.1, 0.05, 0.01, 0.001],
    n_estimators=[100, 150, 200, 250],
    max_depth=[3, 5, 7, 9],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 4, 8],
    subsample=[0.8, 0.9, 1.0],
    max_features=['sqrt', 'log2', None],
)

# RandomForestClassifier
random_forest_param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
)

# Untuned estimators to be searched, built from their classes in a fixed order
clfs_bests = [
    estimator_class()
    for estimator_class in (XGBClassifier, RandomForestClassifier, GradientBoostingClassifier)
]

# Display names, in the same order as clfs_bests
names_bests = ['XGB Classifier', 'Random Forest Classifier', 'Gradient Boosting Classifier']


In [22]:
# List of random search grids for each model, in the same order as
# clfs_bests / names_bests
grid_params_best_models = [xgb_param_grid,
                           random_forest_param_grid,
                           gradient_boosting_param_grid
                          ]
best_params = []  # best hyperparameters found for each model, in loop order

# Perform random search with cross-validation for each model
for clfi, name, params in zip(clfs_bests, names_bests, grid_params_best_models):
    print(f'{name} :\n')

    # random_state fixes which 5 candidates are sampled from the grid, so the
    # same configurations are compared on every run. The estimators themselves
    # are still created without a seed.
    random_search = RandomizedSearchCV(estimator=clfi, param_distributions=params,
                                       n_iter=5, cv=5, random_state=0)
    random_search.fit(X_train_scaled, y_train)

    # Print the best hyperparameters and best score for each model
    print(f"Best hyperparameters for  {name}:")

    print(random_search.best_params_,'\n')
    best_params.append(random_search.best_params_)

    print(f"Best cross-validation score: {random_search.best_score_ * 100.0 :.2f}")

    # Get the model refitted with the best hyperparameters
    best_model_classifier = random_search.best_estimator_

    # Evaluate that model on the hold-out set
    accuracy = best_model_classifier.score(X_test_scaled, y_test)
    print(f"Test set accuracy:  {accuracy * 100.0 :.2f}\n")


XGB Classifier :

Best hyperparameters for  XGB Classifier:
{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 5, 'n_estimators': 150, 'min_child_weight': 3, 'max_depth': 9, 'learning_rate': 0.1, 'colsample_bytree': 0.8, 'colsample_bynode': 1.0} 

Best cross-validation score: 97.48
Test set accuracy:  97.38

Random Forest Classifier :

Best hyperparameters for  Random Forest Classifier:
{'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': None, 'criterion': 'gini', 'bootstrap': False} 

Best cross-validation score: 97.31
Test set accuracy:  97.46

Gradient Boosting Classifier :

Best hyperparameters for  Gradient Boosting Classifier:
{'subsample': 0.9, 'n_estimators': 100, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 7, 'learning_rate': 0.05} 

Best cross-validation score: 97.45
Test set accuracy:  97.33



In [23]:
# Define the provided hyperparameters
# NOTE(review): these values differ from the best_params_ printed by the search
# output shown in this notebook (e.g. reg_lambda, min_child_weight) -
# presumably copied from a different run; confirm which set is intended.

xgb_params = {'subsample': 0.8, 'reg_lambda': 0, 'reg_alpha': 5, 'n_estimators': 150, 'min_child_weight': 7, 'max_depth': 9, 'learning_rate': 0.1, 'colsample_bytree': 0.9, 'colsample_bynode': 1.0}
rf_params = {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}
gb_params = {'subsample': 0.8, 'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 8, 'max_features': None, 'max_depth': 7, 'learning_rate': 0.05}

# Initialize the classifiers with the custom hyperparameters.
# This rebinds clfs / names / ranking_dic from the earlier comparison cells.
clfs = [
    XGBClassifier(**xgb_params),
    RandomForestClassifier(**rf_params),
    GradientBoostingClassifier(**gb_params)
]
names = [
    'XGB Classifier BEST',
    'Random Forest Classifier BEST',
    'Gradient Boosting Classifier BEST',
]

ranking_dic = {}  # model name -> hold-out accuracy for the tuned models

for clf, name in zip(clfs, names):

    print(name)

    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    conf_matrix = confusion_matrix(y_test, y_pred)
    clas_report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    ranking_dic[name] = accuracy

    # Added next to the baseline entries already stored in modelos_dic
    modelos_dic[name] = {'conf_matrix': conf_matrix, 'accuracy': accuracy}

    # The value comes from accuracy_score, so it is labelled as plain accuracy
    # (it used to be printed as "Balanced Accuracy Score").
    print(f'{clas_report}\n\n Confusion Matrix \n {conf_matrix} \n\n Accuracy Score: {accuracy}')
    print('--------------------------------------------------------------------------------------------------')


XGB Classifier BEST
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      1931
           1       0.97      0.98      0.97      1930

    accuracy                           0.97      3861
   macro avg       0.97      0.97      0.97      3861
weighted avg       0.97      0.97      0.97      3861


 Confusion Matrix 
 [[1875   56]
 [  44 1886]] 

 Balanced Accuracy Score: 0.9740999740999741
--------------------------------------------------------------------------------------------------
Random Forest Classifier BEST
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      1931
           1       0.97      0.98      0.97      1930

    accuracy                           0.97      3861
   macro avg       0.97      0.97      0.97      3861
weighted avg       0.97      0.97      0.97      3861


 Confusion Matrix 
 [[1873   58]
 [  42 1888]] 

 Balanced Accuracy Score: 0.9740999740999741
-------

In [24]:
# Display the full modelos_dic mapping as the cell output.
modelos_dic

{'Logistic Regression': {'conf_matrix': array([[1820,  111],
         [ 475, 1455]]),
  'accuracy': 0.8482258482258482},
 'Perceptron': {'conf_matrix': array([[1317,  614],
         [ 520, 1410]]),
  'accuracy': 0.7062937062937062},
 'SVC(Support Vector Classifier)': {'conf_matrix': array([[1780,  151],
         [ 292, 1638]]),
  'accuracy': 0.8852628852628852},
 'Ada Boost Classifier': {'conf_matrix': array([[1814,  117],
         [ 115, 1815]]),
  'accuracy': 0.9399119399119399},
 'Gradient Boosting Classifier': {'conf_matrix': array([[1867,   64],
         [  68, 1862]]),
  'accuracy': 0.9658119658119658},
 'XGB Classifier': {'conf_matrix': array([[1881,   50],
         [  32, 1898]]),
  'accuracy': 0.9787619787619788},
 'XGBRF Classifier': {'conf_matrix': array([[1853,   78],
         [  88, 1842]]),
  'accuracy': 0.957005957005957},
 'Random Forest Classifier': {'conf_matrix': array([[1883,   48],
         [  43, 1887]]),
  'accuracy': 0.9764309764309764},
 'K Neighbors Classifier

In [25]:
# Spot check: look up the stored 'accuracy' entry of a single model.
modelos_dic['Logistic Regression']['accuracy']

0.8482258482258482

In [26]:
import json
import numpy as np

# Convert the confusion matrices to nested lists before writing JSON.
# np.asarray() accepts both an array and an already-converted list, so the
# cell can be re-run (a bare .tolist() fails on a list).
for classifier in modelos_dic:
  modelos_dic[classifier]['conf_matrix'] = np.asarray(modelos_dic[classifier]['conf_matrix']).tolist()

# Save the dictionary as JSON
with open('modelos_dic.json', 'w') as json_file:
  json.dump(modelos_dic, json_file, indent=4)

In [27]:
# Load the saved dictionary back from the JSON file

with open('modelos_dic.json') as saved_results:
    modelos_dic_cargado = json.load(saved_results)

In [28]:
# Parallel lists of model names and their stored accuracies, in file order
MODELS_DATA_NAMES = list(modelos_dic_cargado)
MODELS_DATA_ACCURACY = [entrada['accuracy'] for entrada in modelos_dic_cargado.values()]

# (name, accuracy) pairs, then the same pairs ordered from highest accuracy down
modelo_precisión = list(zip(MODELS_DATA_NAMES, MODELS_DATA_ACCURACY))
modelo_precisión_ordenado = sorted(modelo_precisión, key=lambda par: par[1], reverse=True)

In [29]:
# Print every stored model with its accuracy.
# zip() follows the actual number of models instead of assuming exactly twelve.
for model_name, model_accuracy in zip(MODELS_DATA_NAMES, MODELS_DATA_ACCURACY):
  print(model_name, model_accuracy)

Logistic Regression 0.8482258482258482
Perceptron 0.7062937062937062
SVC(Support Vector Classifier) 0.8852628852628852
Ada Boost Classifier 0.9399119399119399
Gradient Boosting Classifier 0.9658119658119658
XGB Classifier 0.9787619787619788
XGBRF Classifier 0.957005957005957
Random Forest Classifier 0.9764309764309764
K Neighbors Classifier 0.9127169127169127
XGB Classifier BEST 0.9740999740999741
Random Forest Classifier BEST 0.9740999740999741
Gradient Boosting Classifier BEST 0.9725459725459725


In [30]:
# Order the (name, metrics) entries by stored accuracy, from highest to lowest
modelos_ordenados = sorted(
    modelos_dic_cargado.items(),
    key=lambda entrada: entrada[1]['accuracy'],
    reverse=True,
)

# Keep only (name, accuracy) for each entry, preserving the sorted order
modelo_precision_ordenado2 = [
    (entrada[0], entrada[1]['accuracy']) for entrada in modelos_ordenados
]

print("Modelos ordenados por precisión:")
print(modelo_precision_ordenado2)


Modelos ordenados por precisión:
[('XGB Classifier', 0.9787619787619788), ('Random Forest Classifier', 0.9764309764309764), ('XGB Classifier BEST', 0.9740999740999741), ('Random Forest Classifier BEST', 0.9740999740999741), ('Gradient Boosting Classifier BEST', 0.9725459725459725), ('Gradient Boosting Classifier', 0.9658119658119658), ('XGBRF Classifier', 0.957005957005957), ('Ada Boost Classifier', 0.9399119399119399), ('K Neighbors Classifier', 0.9127169127169127), ('SVC(Support Vector Classifier)', 0.8852628852628852), ('Logistic Regression', 0.8482258482258482), ('Perceptron', 0.7062937062937062)]
