# Fraud detection

In [1]:
#load packages
import sys
import pandas as pd
import matplotlib 
import numpy as np 
import scipy as sp 
import sklearn 
#misc libraries
import random
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.ensemble import RandomForestClassifier
#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection as ms
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import preprocessing
from sklearn.random_projection import GaussianRandomProjection
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
%config InlineBackend.figure_format='retina'

In [3]:
df = pd.read_csv('creditcard.csv')
df.info()
df.head() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.918649e-15,5.682686e-16,-8.761736e-15,2.811118e-15,-1.552103e-15,2.04013e-15,-1.698953e-15,-1.893285e-16,-3.14764e-15,...,1.47312e-16,8.042109e-16,5.282512e-16,4.456271e-15,1.426896e-15,1.70164e-15,-3.662252e-16,-1.217809e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [5]:
df.drop(['Time'], axis=1, inplace=True)

In [6]:
df.shape

(284807, 30)

In [7]:
selected_vars =['V4','V14','V10','V16','V9','V12','V11','V17','V7','V21','V18','V8','V1','V3','V6','V2','V27','V5','V26']

In [8]:
df1=df[selected_vars]
Target=df['Class']
df.drop('Class',axis=1, inplace=True)

In [9]:
df1.head()

Unnamed: 0,V4,V14,V10,V16,V9,V12,V11,V17,V7,V21,V18,V8,V1,V3,V6,V2,V27,V5,V26
0,1.378155,-0.311169,0.090794,-0.470401,0.363787,-0.617801,-0.5516,0.207971,0.239599,-0.018307,0.025791,0.098698,-1.359807,2.536347,0.462388,-0.072781,0.133558,-0.338321,-0.189115
1,0.448154,-0.143772,-0.166974,0.463917,-0.255425,1.065235,1.612727,-0.114805,-0.078803,-0.225775,-0.183361,0.085102,1.191857,0.16648,-0.082361,0.266151,-0.008983,0.060018,0.125895
2,0.37978,-0.165946,0.207643,-2.890083,-1.514654,0.066084,0.624501,1.109969,0.791461,0.247998,-0.121359,0.247676,-1.358354,1.773209,1.800499,-1.340163,-0.055353,-0.503198,-0.139097
3,-0.863291,-0.287924,-0.054952,-1.059647,-1.387024,0.178228,-0.226487,-0.684093,0.237609,-0.1083,1.965775,0.377436,-0.966272,1.792993,1.247203,-0.185226,0.062723,-0.010309,-0.221929
4,0.403034,-1.11967,0.753074,-0.451449,0.817739,0.538196,-0.822843,-0.237033,0.592941,-0.009431,-0.038195,-0.270533,-1.158233,1.548718,0.095921,0.877737,0.219422,-0.407193,0.502292


Final Dataframe

In [10]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 19 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   V4      284807 non-null  float64
 1   V14     284807 non-null  float64
 2   V10     284807 non-null  float64
 3   V16     284807 non-null  float64
 4   V9      284807 non-null  float64
 5   V12     284807 non-null  float64
 6   V11     284807 non-null  float64
 7   V17     284807 non-null  float64
 8   V7      284807 non-null  float64
 9   V21     284807 non-null  float64
 10  V18     284807 non-null  float64
 11  V8      284807 non-null  float64
 12  V1      284807 non-null  float64
 13  V3      284807 non-null  float64
 14  V6      284807 non-null  float64
 15  V2      284807 non-null  float64
 16  V27     284807 non-null  float64
 17  V5      284807 non-null  float64
 18  V26     284807 non-null  float64
dtypes: float64(19)
memory usage: 41.3 MB


In [11]:
sampling_res=pd.DataFrame(columns=['name','Train shape','normal percent','fraud percent','accuracy','precision','recall','f1-score','tn', 'fp', 'fn', 'tp'])
sampling_c=0

In [108]:
#original dataset
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(df, Target, test_size = 0.3,random_state=0)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(199364, 29) (199364,)
(85443, 29) (85443,)


In [13]:
gbm_grid = {
    'is_unbalance':[False],
    'n_estimators': [200,500,1000,2000,3000,5000],
    'learning_rate': [0.1,0.05, 0.01],
    'max_depth': [2,4, 6,14,16,18],
    'colsample_bytree': np.linspace(0.6, 1.0, num=5),
    'reg_alpha': np.linspace(0., 1.0, num=5),
    'subsample': np.linspace(0.6, 0.9, num=4), 
    'boosting_type': ['gbdt'],
    'objective': ['binary'],
    'metric':['f1'],
    'num_leaves': [5,10,15], 
    'max_depth': [4],  
    'min_child_samples': [50,100,150],
    'max_bin': [50,100,150],  
    'subsample_freq': [1],  
    'colsample_bytree': [0.7],  
    'min_child_weight': [0,0.001,0.01],   
    'verbose': [0], 
}

In [14]:
grid = GridSearchCV(LGBMClassifier(n_jobs=-1, random_state=0), param_grid=gbm_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
print(f'Best parameters: {grid.best_params_}')

print(grid.best_estimator_

Fitting 5 folds for each of 720 candidates, totalling 3600 fits
You can set `force_col_wise=true` to remove the overhead.
Best parameters: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'is_unbalance': False, 'learning_rate': 0.01, 'max_bin': 100, 'max_depth': 4, 'metric': 'f1', 'min_child_samples': 100, 'min_child_weight': 0, 'min_split_gain': 0, 'n_estimators': 1000, 'num_leaves': 7, 'num_threads': -1, 'objective': 'binary', 'reg_alpha': 1.0, 'scale_pos_weight': 150, 'subsample': 0.7, 'subsample_freq': 1, 'verbose': 0}
LGBMClassifier(colsample_bytree=0.7, is_unbalance=False, learning_rate=0.01,
               max_bin=100, max_depth=4, metric='f1', min_child_samples=100,
               min_child_weight=0, min_split_gain=0, n_estimators=1000,
               num_leaves=7, num_threads=-1, objective='binary', random_state=0,
               reg_alpha=1.0, scale_pos_weight=150, subsample=0.7,
               subsample_freq=1, verbose=0)


In [167]:
%%time
LGBM = LGBMClassifier(colsample_bytree=0.7, is_unbalance=False, learning_rate=0.01,num_iterations=200,
               max_bin=100, max_depth=16, metric='f1', min_child_samples=100,
               min_child_weight=0,n_estimators=5000,
               num_leaves=1000, random_state=0,
                subsample_freq=0);
LGBM.fit(X_train, y_train)
preds=LGBM.predict(X_test)
print(classification_report(y_test, preds))
#1       0.95      0.76      0.85

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.97      0.77      0.86       147

    accuracy                           1.00     85443
   macro avg       0.98      0.88      0.93     85443
weighted avg       1.00      1.00      1.00     85443

Wall time: 13.4 s


In [14]:
X_train = X_train[selected_vars] #select the variables from variable selection proces
X_test = X_test[selected_vars]

In [15]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(199364, 19) (199364,)
(85443, 19) (85443,)


In [19]:
grid = GridSearchCV(LGBMClassifier(n_jobs=-1, random_state=0), param_grid=gbm_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
print(f'Best parameters: {grid.best_params_}')

print(grid.best_estimator_)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits
You can set `force_col_wise=true` to remove the overhead.
Best parameters: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'is_unbalance': False, 'learning_rate': 0.01, 'max_bin': 100, 'max_depth': 4, 'metric': 'f1', 'min_child_samples': 100, 'min_child_weight': 0, 'min_split_gain': 0, 'n_estimators': 1000, 'num_leaves': 7, 'num_threads': -1, 'objective': 'binary', 'reg_alpha': 0.75, 'scale_pos_weight': 150, 'subsample': 0.7, 'subsample_freq': 1, 'verbose': 0}
LGBMClassifier(colsample_bytree=0.7, is_unbalance=False, learning_rate=0.01,
               max_bin=100, max_depth=4, metric='f1', min_child_samples=100,
               min_child_weight=0, min_split_gain=0, n_estimators=1000,
               num_leaves=7, num_threads=-1, objective='binary', random_state=0,
               reg_alpha=0.75, scale_pos_weight=150, subsample=0.7,
               subsample_freq=1, verbose=0)


In [106]:
%%time
LGBM = LGBMClassifier(colsample_bytree=0.7, is_unbalance=False, learning_rate=0.01,
               max_bin=100, max_depth=16, metric='f1', min_child_samples=100,
               min_child_weight=0.01, n_estimators=750,
               num_leaves=10, random_state=0,
               reg_alpha=0.75,subsample=0.7,subsample_freq=1);
LGBM.fit(X_train, y_train)

Wall time: 7.57 s


LGBMClassifier(colsample_bytree=0.7, is_unbalance=False, learning_rate=0.01,
               max_bin=100, max_depth=16, metric='f1', min_child_samples=100,
               min_child_weight=0.01, n_estimators=750, num_leaves=10,
               random_state=0, reg_alpha=0.75, subsample=0.7, subsample_freq=1)

In [107]:
preds=LGBM.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.94      0.79      0.86       147

    accuracy                           1.00     85443
   macro avg       0.97      0.89      0.93     85443
weighted avg       1.00      1.00      1.00     85443

