In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read creditcard.csv (path is relative to the current working directory)
# into the DataFrame that the rest of the notebook works on.
csv_path = 'creditcard.csv'
df = pd.read_csv(csv_path)

In [None]:
# Null values: number of missing entries in each column.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Class distribution: one bar per distinct value of the `Class` column.
# The column is passed as `x=` because seaborn >= 0.12 accepts only `data`
# positionally; the keyword form works on older releases as well.
sns.countplot(x='Class', data=df)
plt.title('class distribution')

In [None]:
# Distribution of transaction time and transaction amount, side by side.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in
# favour of histplot/displot; it is kept here so the rendered figure does
# not change. Consider migrating once the seaborn version is pinned.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 4))

time_values = df['Time'].values
amount_values = df['Amount'].values

sns.distplot(time_values, ax=axes[0])
axes[0].set_title('Distribution of Transaction Time')
# ndarray.min()/.max() are vectorised; the builtin min()/max() walk the
# array element by element in Python.
axes[0].set_xlim([time_values.min(), time_values.max()])

sns.distplot(amount_values, ax=axes[1], color='r')
axes[1].set_title('Distribution of Transaction Amount')
axes[1].set_xlim([amount_values.min(), amount_values.max()])

plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler,RobustScaler
# Robust-scale the raw Amount and Time columns into new columns, then drop
# the unscaled originals. The single scaler instance is refit for each
# column, so after this cell it holds the statistics of 'Time'.
rob_scaler = RobustScaler()
for raw_name, scaled_name in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    # the scaler expects a 2-D (n_samples, 1) array
    column_2d = df[raw_name].values.reshape(-1, 1)
    df[scaled_name] = rob_scaler.fit_transform(column_2d)

df.drop(columns=['Time', 'Amount'], inplace=True)

In [None]:
# Move the two scaled columns to the front of the frame:
# pop() removes each column and hands back its Series, which is then
# re-inserted at position 0 (scaled_amount) and 1 (scaled_time).
scaled_amount = df.pop('scaled_amount')
scaled_time = df.pop('scaled_time')

for position, series in enumerate((scaled_amount, scaled_time)):
    df.insert(position, series.name, series)

df.head()

In [None]:
# Correlation of every column with `Class`, sorted from most positive to
# most negative. Computed once and reused for both the bar lengths and the
# bar labels (the original evaluated the same corrwith/sort twice).
class_corr = df.corrwith(df['Class']).sort_values(ascending=False)
index = class_corr.index
plt.figure(figsize=(25, 30))
sns.barplot(x=class_corr, y=index)

In [None]:
# Box plots, split by Class, of the four features this notebook singles out
# as having the highest negative correlation with the label.
f, axes = plt.subplots(ncols=2, nrows=2, figsize=(20, 20))

# axes.flat walks the 2x2 grid row by row: V17, V14 on top; V12, V10 below.
for panel, feature in zip(axes.flat, ('V17', 'V14', 'V12', 'V10')):
    sns.boxplot(x="Class", y=feature, data=df, ax=panel)
    panel.set_title(feature + ' vs Class Negative Correlation')

plt.show()

In [None]:
# V14 outlier removal (the notebook treats V14 as strongly negatively
# correlated with the label). Quartiles are taken from the Class == 1 rows
# only; the 1.5 * IQR fences derived from them are then applied to all rows.
v14_fraud = df.loc[df['Class'] == 1, 'V14'].values
q25, q75 = np.percentile(v14_fraud, [25, 75])
v14_iqr = q75 - q25

v14_cut_off = 1.5 * v14_iqr
v14_lower = q25 - v14_cut_off
v14_upper = q75 + v14_cut_off

# Class == 1 values lying outside the fences; not used again in this cell.
outliers = list(v14_fraud[(v14_fraud < v14_lower) | (v14_fraud > v14_upper)])

# Rebind df to a copy without the rows whose V14 falls outside the fences.
beyond_fences = (df['V14'] > v14_upper) | (df['V14'] < v14_lower)
df = df.drop(index=df[beyond_fences].index)


In [None]:
# V12 outlier removal (the notebook treats V12 as strongly negatively
# correlated with the label). Quartiles are taken from the Class == 1 rows
# only; the 1.5 * IQR fences derived from them are then applied to all rows.
v12_fraud = df.loc[df['Class'] == 1, 'V12'].values
q25, q75 = np.percentile(v12_fraud, [25, 75])
v12_iqr = q75 - q25

v12_cut_off = 1.5 * v12_iqr
v12_lower = q25 - v12_cut_off
v12_upper = q75 + v12_cut_off

# Class == 1 values lying outside the fences; not used again in this cell.
outliers = list(v12_fraud[(v12_fraud < v12_lower) | (v12_fraud > v12_upper)])

# Rebind df to a copy without the rows whose V12 falls outside the fences.
beyond_fences = (df['V12'] > v12_upper) | (df['V12'] < v12_lower)
df = df.drop(index=df[beyond_fences].index)


In [None]:
# V10 outlier removal (the notebook treats V10 as strongly negatively
# correlated with the label). Quartiles are taken from the Class == 1 rows
# only; the 1.5 * IQR fences derived from them are then applied to all rows.
v10_fraud = df.loc[df['Class'] == 1, 'V10'].values
q25, q75 = np.percentile(v10_fraud, [25, 75])
v10_iqr = q75 - q25

v10_cut_off = 1.5 * v10_iqr
v10_lower = q25 - v10_cut_off
v10_upper = q75 + v10_cut_off

# Class == 1 values lying outside the fences; not used again in this cell.
outliers = list(v10_fraud[(v10_fraud < v10_lower) | (v10_fraud > v10_upper)])

# Rebind df to a copy without the rows whose V10 falls outside the fences.
beyond_fences = (df['V10'] > v10_upper) | (df['V10'] < v10_lower)
df = df.drop(index=df[beyond_fences].index)


In [None]:
# Box plots of V14, V12 and V10 by Class, drawn after the outlier-removal
# cells have filtered df. One stacked panel per feature.
f, axes = plt.subplots(ncols=1, nrows=3, figsize=(20, 20))

for panel, feature in zip(axes, ('V14', 'V12', 'V10')):
    sns.boxplot(x="Class", y=feature, data=df, ax=panel)
    panel.set_title(feature + ' vs Class Negative Correlation')

plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
import xgboost
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import GridSearchCV
# Results registry filled in by the model cells below:
# model name -> [accuracy score, confusion matrix].
accuracy = dict()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit,StratifiedKFold

# Features are every column except the label.
X = df.drop('Class', axis=1)
y = df['Class']

# Unshuffled, stratified 5-fold splitter.
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# The original loop sliced X and y for all five folds but each pass
# overwrote the previous one, so only the final fold was ever used.
# Take that last (train, test) index pair directly and slice once.
train_index, test_index = list(sss.split(X, y))[-1]
original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

# Plain NumPy arrays for the estimators in the following cells.
X_train = original_Xtrain.values
X_test = original_Xtest.values
y_train = original_ytrain.values
y_test = original_ytest.values

In [None]:
# Compare the share of Class == 1 labels in the train and test splits.
fraud_share = [
    (labels == 1).sum() / labels.shape[0]
    for labels in (y_train, y_test)
]
sns.barplot(x=['train', 'test'], y=fraud_share)

In [None]:
# Logistic regression with default settings: fit on the training split,
# predict on the test split, and record [accuracy, confusion matrix].
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
acc, con = accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

accuracy.update({'logisticRegression': [acc, con]})

In [None]:
# Linear classifier trained with stochastic gradient descent, otherwise
# default settings. random_state is pinned because SGDClassifier shuffles
# the training data between epochs; without a seed the recorded score
# differs from one run of the notebook to the next.
sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
acc = accuracy_score(y_test, y_pred)
con = confusion_matrix(y_test, y_pred)
# Stored as [accuracy, confusion matrix], like the other model cells.
accuracy['sgdClassifier'] = [acc, con]


In [None]:
# Gradient-boosted trees via XGBoost (100 trees of depth 3): fit on the
# training split, predict on the test split, record the results.
xgb = xgboost.XGBClassifier(n_estimators=100, max_depth=3)
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)
acc, con = accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

accuracy.update({'xgbclassifier': [acc, con]})

In [None]:
# Random forest with default hyper-parameters. random_state is pinned
# because the forest draws bootstrap samples and random feature subsets;
# without a seed the recorded score is not reproducible across runs.
randomforestclassifier = RandomForestClassifier(random_state=42)
randomforestclassifier.fit(X_train, y_train)
y_pred = randomforestclassifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
con = confusion_matrix(y_test, y_pred)
# Stored as [accuracy, confusion matrix], like the other model cells.
accuracy['randomforest'] = [acc, con]

In [None]:
# AdaBoost with default settings: fit on the training split, predict on
# the test split, and record [accuracy, confusion matrix].
adaboost = AdaBoostClassifier()
adaboost.fit(X_train, y_train)

y_pred = adaboost.predict(X_test)
acc, con = accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

accuracy.update({'adaboost': [acc, con]})


In [None]:
# Gradient boosting with default settings: fit on the training split,
# predict on the test split, and record [accuracy, confusion matrix].
gradientboosting = GradientBoostingClassifier()
gradientboosting.fit(X_train, y_train)

y_pred = gradientboosting.predict(X_test)
acc, con = accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

accuracy.update({'gradientboosting': [acc, con]})

In [None]:
# Show the collected results for all models:
# model name -> [accuracy score, confusion matrix].
accuracy

In [None]:
# Start from an empty list; the following cell rebinds `acc` to the
# per-model accuracy scores.
acc = list()

In [None]:
# Accuracy score (first element of each stored [accuracy, confusion matrix]
# pair) for every model, in the order used by the comparison plot.
acc = [
    accuracy[model_name][0]
    for model_name in (
        'logisticRegression',
        'sgdClassifier',
        'xgbclassifier',
        'randomforest',
        'adaboost',
        'gradientboosting',
    )
]

In [None]:
# Bar chart comparing the accuracy of the six fitted models.
model_labels = [
    'logisticRegression',
    'sgdClassifier',
    'xgbclassifier',
    'randomforest',
    'adaboost',
    'gradientboosting',
]
sns.barplot(x=model_labels, y=acc)

In [None]:
import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation
# Dense comes from the public `keras.layers` namespace. The former
# `keras.layers.core` path is an internal module that is not exposed in
# Keras 3, while `keras.layers.Dense` is available in every release.
from keras.layers import Dense
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy

# One input unit per feature column of the training matrix.
n_inputs = X_train.shape[1]

# Small fully connected binary classifier:
#   hidden layer as wide as the input (ReLU) -> 32 units (ReLU)
#   -> a single sigmoid unit whose output is thresholded later on.
model = Sequential([
    Dense(n_inputs, input_shape=(n_inputs, ), activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [None]:
# Adam optimiser with binary cross-entropy loss, tracking accuracy.
# `learning_rate` replaces the `lr` alias, which is deprecated and no
# longer accepted by current Keras optimizers; the value is unchanged.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs in mini-batches of 25, holding out 20% of the
# training data for validation; verbose=2 logs one line per epoch.
fit_options = dict(
    validation_split=0.2,
    batch_size=25,
    epochs=20,
    shuffle=True,
    verbose=2,
)
model.fit(X_train, y_train, **fit_options)

In [None]:
# Sigmoid outputs for the test split, then a boolean label per row:
# True where the output is strictly above 0.5.
y_pred = model.predict(X_test, batch_size=200)
y_p = np.greater(y_pred, 0.5)

In [None]:
# Confusion matrix of the neural network on the test split.
# fmt='d' prints the cell counts as whole integers; with annot=True the
# default format ('.2g') would show large counts in truncated scientific
# notation (e.g. 5.7e+04), hiding the exact numbers.
nn_confusion = confusion_matrix(y_test, y_p)
class_names = ['No Fraud', 'Fraud']
sns.heatmap(
    nn_confusion,
    xticklabels=class_names,
    yticklabels=class_names,
    annot=True,
    fmt='d',
    cbar=True,
    cmap=plt.cm.Oranges,
)

In [None]:
# Accuracy of the thresholded neural-network predictions on the test split.
accuracy_score(y_true=y_test, y_pred=y_p)