# Credit Fraud Detection 

## loading the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# loading the data
df= pd.read_csv('creditcard.csv')

## understanding the data

In [3]:
df.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [4]:
df.shape

(284807, 31)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

## data preprocessing

In [6]:
# splitting features and target variable
X = df.drop(['Class'], axis=1)
y = df['Class']

In [7]:
from sklearn.ensemble import IsolationForest, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, StackingClassifier

In [8]:
iso_forest = IsolationForest(contamination=0.01, random_state=42)
outliers = iso_forest.fit_predict(X)

In [9]:
# filtering out the outliers
mask = outliers != -1
X, y =X[mask], y[mask]

## model creation

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as catb

In [11]:
#standardizing the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [13]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

In [14]:
from imblearn.over_sampling import SMOTE

In [15]:
pd.options.display.max_columns=None

In [16]:
# Applying SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [17]:
# function to evaluate model 
def evaluate_model(model, X_train, y_train, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    return accuracy, precision, recall, f1, tn, fp, fn, tp

# Initialize results list
results = []

### decision tree

In [19]:
dt_params = {'max_depth': [5, 10, 15]}
dt_grid_search = GridSearchCV(DecisionTreeClassifier(), dt_params, scoring='accuracy', cv=3, n_jobs=-1)
dt_grid_search.fit(X_train_resampled, y_train_resampled)
dt_best_model = dt_grid_search.best_estimator_
dt_accuracy, dt_precision, dt_recall, dt_f1, dt_tn, dt_fp, dt_fn, dt_tp = evaluate_model(dt_best_model, X_train_resampled, y_train_resampled, X_test, y_test)


In [20]:
results.append({
    'Model': 'DecisionTree',
    'Accuracy': dt_accuracy,
    'Precision': dt_precision,
    'Recall': dt_recall,
    'F1 Score': dt_f1,
    'True Negative': dt_tn,
    'False Positive': dt_fp,
    'False Negative': dt_fn,
    'True Positive': dt_tp
})

### random forest

In [21]:
rf_params = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
rf_grid_search = GridSearchCV(RandomForestClassifier(), rf_params, scoring='accuracy', cv=3, n_jobs=-1)
rf_grid_search.fit(X_train_resampled, y_train_resampled)
rf_best_model = rf_grid_search.best_estimator_
rf_accuracy, rf_precision, rf_recall, rf_f1, rf_tn, rf_fp, rf_fn, rf_tp = evaluate_model(rf_best_model, X_train_resampled, y_train_resampled, X_test, y_test)

In [22]:
results.append({
    'Model': 'RandomForest',
    'Accuracy': rf_accuracy,
    'Precision': rf_precision,
    'Recall': rf_recall,
    'F1 Score': rf_f1,
    'True Negative': rf_tn,
    'False Positive': rf_fp,
    'False Negative': rf_fn,
    'True Positive': rf_tp
})

### ada boost

In [23]:
ab_params = {'n_estimators': [50, 100]}
ab_grid_search = GridSearchCV(AdaBoostClassifier(), ab_params, scoring='accuracy', cv=3, n_jobs=-1)
ab_grid_search.fit(X_train_resampled, y_train_resampled)
ab_best_model = ab_grid_search.best_estimator_
ab_accuracy, ab_precision, ab_recall, ab_f1, ab_tn, ab_fp, ab_fn, ab_tp = evaluate_model(ab_best_model, X_train_resampled, y_train_resampled, X_test, y_test)



In [24]:
results.append({
    'Model': 'AdaBoost',
    'Accuracy': ab_accuracy,
    'Precision': ab_precision,
    'Recall': ab_recall,
    'F1 Score': ab_f1,
    'True Negative': ab_tn,
    'False Positive': ab_fp,
    'False Negative': ab_fn,
    'True Positive': ab_tp
})

### bagging

In [25]:
bg_params = {'n_estimators': [10, 20]}
bg_grid_search = GridSearchCV(BaggingClassifier(), bg_params, scoring='accuracy', cv=3, n_jobs=-1)
bg_grid_search.fit(X_train_resampled, y_train_resampled)
bg_best_model = bg_grid_search.best_estimator_
bg_accuracy, bg_precision, bg_recall, bg_f1, bg_tn, bg_fp, bg_fn, bg_tp = evaluate_model(bg_best_model, X_train_resampled, y_train_resampled, X_test, y_test)

In [26]:
results.append({
    'Model': 'Bagging',
    'Accuracy': bg_accuracy,
    'Precision': bg_precision,
    'Recall': bg_recall,
    'F1 Score': bg_f1,
    'True Negative': bg_tn,
    'False Positive': bg_fp,
    'False Negative': bg_fn,
    'True Positive': bg_tp
})

### XGBoost

In [27]:
xgb_params = {'n_estimators': [100, 200], 'max_depth': [3, 6]}
xgb_grid_search = GridSearchCV(xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params, scoring='accuracy', cv=3, n_jobs=-1)
xgb_grid_search.fit(X_train_resampled, y_train_resampled)
xgb_best_model = xgb_grid_search.best_estimator_
xgb_accuracy, xgb_precision, xgb_recall, xgb_f1, xgb_tn, xgb_fp, xgb_fn, xgb_tp = evaluate_model(xgb_best_model, X_train_resampled, y_train_resampled, X_test, y_test)

Parameters: { "use_label_encoder" } are not used.



In [28]:
results.append({
    'Model': 'XGBoost',
    'Accuracy': xgb_accuracy,
    'Precision': xgb_precision,
    'Recall': xgb_recall,
    'F1 Score': xgb_f1,
    'True Negative': xgb_tn,
    'False Positive': xgb_fp,
    'False Negative': xgb_fn,
    'True Positive': xgb_tp
})

### lightGBM

In [29]:
lgb_params = {'n_estimators': [100, 200], 'num_leaves': [31, 63]}
lgb_grid_search = GridSearchCV(lgb.LGBMClassifier(), lgb_params, scoring='accuracy', cv=3, n_jobs=-1)
lgb_grid_search.fit(X_train_resampled, y_train_resampled)
lgb_best_model = lgb_grid_search.best_estimator_
lgb_accuracy, lgb_precision, lgb_recall, lgb_f1, lgb_tn, lgb_fp, lgb_fn, lgb_tp = evaluate_model(lgb_best_model, X_train_resampled, y_train_resampled, X_test, y_test)

[LightGBM] [Info] Number of positive: 225404, number of negative: 225404
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049926 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 450808, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [30]:
results.append({
    'Model': 'LightGBM',
    'Accuracy': lgb_accuracy,
    'Precision': lgb_precision,
    'Recall': lgb_recall,
    'F1 Score': lgb_f1,
    'True Negative': lgb_tn,
    'False Positive': lgb_fp,
    'False Negative': lgb_fn,
    'True Positive': lgb_tp
})

### CatBoost

In [31]:
cb_params = {'iterations': [100, 200], 'depth': [4, 6]}
cb_grid_search = GridSearchCV(catb.CatBoostClassifier(verbose=0), cb_params, scoring='accuracy', cv=3, n_jobs=-1)
cb_grid_search.fit(X_train_resampled, y_train_resampled)
cb_best_model = cb_grid_search.best_estimator_
cb_accuracy, cb_precision, cb_recall, cb_f1, cb_tn, cb_fp, cb_fn, cb_tp = evaluate_model(cb_best_model, X_train_resampled, y_train_resampled, X_test, y_test)

In [32]:
results.append({
    'Model': 'CatBoost',
    'Accuracy': cb_accuracy,
    'Precision': cb_precision,
    'Recall': cb_recall,
    'F1 Score': cb_f1,
    'True Negative': cb_tn,
    'False Positive': cb_fp,
    'False Negative': cb_fn,
    'True Positive': cb_tp
})

## evaluating the models

In [33]:
result_df = pd.DataFrame(results)
result_df.sort_values(by='Accuracy', ascending=False, inplace=True)

In [34]:
result_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,True Negative,False Positive,False Negative,True Positive
5,LightGBM,0.999539,0.692308,0.658537,0.675,56339,12,14,27
4,XGBoost,0.999468,0.634146,0.634146,0.634146,56336,15,15,26
1,RandomForest,0.999078,0.40678,0.585366,0.48,56316,35,17,24
3,Bagging,0.999042,0.381818,0.512195,0.4375,56317,34,20,21
6,CatBoost,0.997305,0.146497,0.560976,0.232323,56217,134,18,23
0,DecisionTree,0.990938,0.049808,0.634146,0.092362,55855,496,15,26
2,AdaBoost,0.974819,0.022837,0.804878,0.044415,54939,1412,8,33
