In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the credit-card transactions CSV into a DataFrame and preview the first rows.
csv_path = '/kaggle/input/creditcardfraud/creditcard.csv'
df = pd.read_csv(csv_path, encoding='utf-8')
df.head()

In [None]:
# Summarise the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Keep a separate copy of the frame as loaded, taken before duplicates are dropped from df.
# NOTE(review): `data` is not referenced again in the visible cells; confirm it is still needed.
data = df.copy()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows. Rebinding df (instead of inplace=True) avoids mutating
# the object that earlier cells displayed and keeps this cell safe to re-run.
df = df.drop_duplicates()
df.shape

In [None]:
# Descriptive statistics for every numeric column of the de-duplicated frame.
df.describe()

In [None]:
# Annotated heatmap of the pairwise column correlations on a wide canvas.
corr_matrix = df.corr()
plt.figure(figsize=(30, 10))
sns.heatmap(corr_matrix, annot=True, cmap="Blues")

In [None]:
# One box plot per column, arranged on a 7x7 grid of subplots.
df.plot.box(subplots=True, figsize=(20, 20), layout=(7, 7))

In [None]:
# Row count per value of the target column, to see how balanced the classes are.
df['Class'].value_counts()

## Modeling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix , classification_report
from mlxtend.plotting import plot_confusion_matrix
from sklearn.model_selection import KFold , cross_val_score #cross validation
from sklearn.model_selection import GridSearchCV

# Separate the feature columns from the 'Class' target.
x = df.drop(['Class'], axis = 1)
y = df['Class']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=42)

# Fit the scaler on the training rows only and reuse those statistics for the test rows.
# Calling fit_transform on the test set (as before) re-fitted the scaler, so train and
# test were standardised differently and test-set statistics leaked into evaluation.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

### Random UnderSampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Undersample until both classes have the same number of rows (ratio 1).
# random_state is fixed (same seed as the train/test split) so the sampled rows,
# and every score computed from them, are reproducible across runs.
rus = RandomUnderSampler(sampling_strategy=1, random_state=42)
x_res, y_res = rus.fit_resample(x_train, y_train)

#before Random undersampling
print(y_train.value_counts())
#after Random undersampling
print(y_res.value_counts())

### Random OverSampling

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Oversample the minority class. The sampler is named `ros` so that it no longer
# rebinds `os`, which the first cell imports as the standard-library module.
# random_state is fixed so the duplicated rows are the same on every run.
ros = RandomOverSampler(sampling_strategy='minority', random_state=42)
x_ran, y_ran = ros.fit_resample(x_train, y_train)

#before random oversampling
print(y_train.value_counts())
#after random oversampling
print(y_ran.value_counts())

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=2000)

# Fit on the original, oversampled and undersampled training sets in turn and print
# the test-set accuracy after each fit, separated by a blank line.
training_sets = [(x_train, y_train), (x_ran, y_ran), (x_res, y_res)]
for idx, (features, labels) in enumerate(training_sets):
    if idx:
        print()
    logreg.fit(features, labels)
    print(accuracy_score(y_test, logreg.predict(x_test)))

In [None]:
#cross validation
# Imported here because only this cell needs the pipeline helper.
from sklearn.pipeline import make_pipeline

K_fold = KFold(n_splits=3, shuffle=True, random_state=42)
scoring = "accuracy"

# x is the unscaled feature frame (only x_train/x_test were standardised), so the
# scaler is chained in front of the model: each fold then fits the scaler on its own
# training part, matching how logreg is trained elsewhere in this notebook.
scaled_logreg = make_pipeline(StandardScaler(), logreg)
score = cross_val_score(scaled_logreg, x, y, cv=K_fold, scoring=scoring)
print(score)
print(round(np.mean(score)*100 , 2))

In [None]:
#Grid Search
# C is the inverse regularisation strength and must be strictly positive, so the
# candidates start at 1; the previous range(10) included the invalid value 0.
pram_grid = {'C': list(range(1, 10)), 'fit_intercept': [True, False]}
pram_grid

In [None]:
# Search every combination in pram_grid with 5-fold CV on the training split.
grid = GridSearchCV(logreg, pram_grid, cv=5)
grid_result = grid.fit(x_train, y_train)
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)

In [None]:
# Take the best estimator found by the search and show its test-set predictions.
model = grid_result.best_estimator_
test_predictions = model.predict(x_test)
test_predictions

In [None]:
# Predict once and reuse the result for both the confusion matrix and the report.
y_pred = model.predict(x_test)

#confusion matrix
con = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(con)

#classification report
report = classification_report(y_test, y_pred)
print(report)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Small forest: 5 trees, depth 5, 17 candidate features per split.
# random_state is fixed (same seed as the train/test split) so the bootstrap samples
# and feature subsets, and therefore the printed accuracies, are reproducible.
rf = RandomForestClassifier(n_estimators=5, max_depth=5, max_features=17, random_state=42)

#RF - original training set
rf.fit(x_train, y_train)
print(accuracy_score(y_test, rf.predict(x_test)))
print()

#OverSampling
rf.fit(x_ran, y_ran)
print(accuracy_score(y_test, rf.predict(x_test)))
print()

#UnderSampling
rf.fit(x_res, y_res)
print(accuracy_score(y_test, rf.predict(x_test)))


In [None]:
#cross validation
# 3-fold shuffled CV of the random forest on the full feature frame, scored by accuracy.
scoring = "accuracy"
K_fold = KFold(n_splits=3, shuffle=True, random_state=42)
score = cross_val_score(rf, x, y, cv=K_fold, scoring=scoring)
print(score)
mean_pct = np.mean(score) * 100
print(round(mean_pct, 2))

In [None]:
#Grid Search
# Candidate depths and per-split feature counts for the random forest search.
pram_grid = dict(max_depth=[1, 2, 3], max_features=[7, 8, 9])
pram_grid

In [None]:
# 5-fold grid search over pram_grid for the random forest, fitted on the training split.
grid = GridSearchCV(rf, pram_grid, cv=5)
grid_result = grid.fit(x_train, y_train)
best_score, best_params = grid_result.best_score_, grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

In [None]:
# Best random forest from the search; display its predictions for the test rows.
model = grid_result.best_estimator_
rf_test_predictions = model.predict(x_test)
rf_test_predictions

In [None]:
# Compute the test predictions a single time and feed them to both metrics.
y_pred = model.predict(x_test)

#confusion matrix
con = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(con)

#classification report
print(classification_report(y_test, y_pred))

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Depth-5 tree that considers 17 features per split. Because only a subset of the
# features is examined at each split, the fitted tree depends on the random state;
# it is fixed (same seed as the train/test split) so results are reproducible.
dt = DecisionTreeClassifier(max_depth=5, max_features=17, random_state=42)

#DT - original training set
dt.fit(x_train, y_train)
print(accuracy_score(y_test, dt.predict(x_test)))
print()

#OverSampling
dt.fit(x_ran, y_ran)
print(accuracy_score(y_test, dt.predict(x_test)))
print()

#UnderSampling
dt.fit(x_res, y_res)
print(accuracy_score(y_test, dt.predict(x_test)))

In [None]:
#cross validation
# Accuracy of the decision tree over 3 shuffled folds of the full feature frame.
K_fold = KFold(n_splits=3, shuffle=True, random_state=42)
scoring = "accuracy"
score = cross_val_score(dt, x, y, scoring=scoring, cv=K_fold)
print(score)
print(round(100 * np.mean(score), 2))

In [None]:
#Grid Search
# Depths 1-3 and 7-9 features per split are the candidates for the decision tree.
pram_grid = {
    'max_depth': list(range(1, 4)),
    'max_features': list(range(7, 10)),
}
pram_grid

In [None]:
# 5-fold grid search over pram_grid for the decision tree, fitted on the training split.
grid = GridSearchCV(estimator=dt, param_grid=pram_grid, cv=5)
grid_result = grid.fit(x_train, y_train)
search_outcome = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % search_outcome)

In [None]:
# Evaluate the best decision tree from the grid search on the held-out test rows.
model = grid_result.best_estimator_

# The test predictions used to be computed three times, the first result being
# discarded; compute them once and reuse them for both metrics.
y_pred = model.predict(x_test)

#confusion matrix
con = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(con)
#classification report
print(classification_report(y_test, y_pred))

In [None]:
#feature importance
# Horizontal bar chart of the importances stored on `dt`, labelled with the feature names.
# NOTE(review): `dt` was last fitted on the undersampled set (x_res, y_res), and it is
# presumably a different object from the grid-searched `model`; confirm which one is intended.
importances = pd.Series(dt.feature_importances_, index=x.columns)
importances.plot.barh()

### XGBoost

In [None]:
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=15, max_depth=3)

# Refit on the original, oversampled and undersampled training sets, collecting the
# test accuracy of each fit, then print the three values separated by blank lines.
xgb_scores = []
for features, labels in ((x_train, y_train), (x_ran, y_ran), (x_res, y_res)):
    xgb.fit(features, labels)
    xgb_scores.append(accuracy_score(y_test, xgb.predict(x_test)))
print(*xgb_scores, sep="\n\n")

In [None]:
#cross validation
# 3-fold shuffled CV of the XGBoost model on the full feature frame, scored by accuracy.
scoring = "accuracy"
K_fold = KFold(n_splits=3, shuffle=True, random_state=42)
score = cross_val_score(xgb, x, y, cv=K_fold, scoring=scoring)
print(score)
average_accuracy = np.mean(score)
print(round(average_accuracy * 100, 2))

In [None]:
#Grid Search
# Candidate tree depths and boosting-round counts for the XGBoost search.
pram_grid = dict(max_depth=[1, 2, 3], n_estimators=[7, 8, 9])
pram_grid

In [None]:
# Show the current parameter settings of the XGBoost estimator.
xgb.get_params()

In [None]:
# 5-fold grid search over pram_grid for the XGBoost model, fitted on the training split.
grid = GridSearchCV(xgb, param_grid=pram_grid, cv=5)
grid_result = grid.fit(x_train, y_train)
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

In [None]:
# Evaluate the best XGBoost model from the grid search on the held-out test rows.
model = grid_result.best_estimator_

# Predict once: the earlier version discarded one prediction and then recomputed
# it separately for the confusion matrix and for the report.
y_pred = model.predict(x_test)

#confusion matrix
con = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(con)
#classification report
print(classification_report(y_test, y_pred))

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier()

# Same comparison as for the other models: original, oversampled, then undersampled
# training data, printing test accuracy after each fit with a blank line in between.
train_features = (x_train, x_ran, x_res)
train_labels = (y_train, y_ran, y_res)
for position, (features, labels) in enumerate(zip(train_features, train_labels)):
    if position > 0:
        print()
    ada.fit(features, labels)
    print(accuracy_score(y_test, ada.predict(x_test)))

In [None]:
#cross validation
# Accuracy of AdaBoost over 3 shuffled folds of the full feature frame.
K_fold = KFold(n_splits=3, shuffle=True, random_state=42)
scoring = "accuracy"
score = cross_val_score(ada, x, y, scoring=scoring, cv=K_fold)
print(score)
mean_percentage = np.mean(score) * 100
print(round(mean_percentage, 2))

In [None]:
#Grid Search
# Boosting algorithm variants and estimator counts to try for AdaBoost.
# NOTE(review): 'SAMME.R' is reported as deprecated/removed in recent scikit-learn
# releases; confirm it is accepted by the installed version.
pram_grid = dict(algorithm=['SAMME', 'SAMME.R'], n_estimators=[7, 8, 9])
pram_grid

In [None]:
# Show the current parameter settings of the AdaBoost estimator.
ada.get_params()

In [None]:
# 5-fold grid search over pram_grid for AdaBoost, fitted on the training split.
grid = GridSearchCV(ada, pram_grid, cv=5)
grid_result = grid.fit(x_train, y_train)
winner = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % winner)

In [None]:
# Evaluate the best AdaBoost model from the grid search on the held-out test rows.
model = grid_result.best_estimator_

# A single predict call replaces the discarded call and the two repeated calls
# that previously fed the confusion matrix and the report.
y_pred = model.predict(x_test)

#confusion matrix
con = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(con)
#classification report
print(classification_report(y_test, y_pred))

### Voting

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
# Base estimators for the voting ensemble, configured like the standalone models above:
# the logistic regression gets the same max_iter=2000 as `logreg`, and the random
# forest is seeded so the ensemble's scores are reproducible between runs.
clf1 = LogisticRegression(max_iter=2000)
clf2 = AdaBoostClassifier()
clf3 = XGBClassifier(n_estimators = 15, max_depth = 3)
clf4 = RandomForestClassifier(n_estimators=5, max_depth=5, max_features=17, random_state=42)

In [None]:
# Soft-voting ensemble of the four base estimators, fitted on the training split.
soft_members = [('logreg', clf1), ('ada', clf2), ('xgb', clf3), ('RF', clf4)]
v_clf = VotingClassifier(estimators=soft_members, voting='soft')
v_clf.fit(x_train, y_train)

In [None]:
#soft voting
# Score of the fitted ensemble on the training split, then on the test split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(v_clf.score(features, labels))

In [None]:
# Hard-voting ensemble of the same four base estimators, fitted on the training split.
hard_members = [('logreg', clf1), ('ada', clf2), ('xgb', clf3), ('RF', clf4)]
v_clf = VotingClassifier(estimators=hard_members, voting='hard')
v_clf.fit(x_train, y_train)

In [None]:
#hard voting
# Score of the fitted ensemble on the training split, then on the test split.
train_score = v_clf.score(x_train, y_train)
print(train_score)
test_score = v_clf.score(x_test, y_test)
print(test_score)