In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [3]:
# Read the credit-card transactions CSV into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='../input/creditcardfraud/creditcard.csv')

In [4]:
# Preview the first five rows, shaded with a per-column colour gradient.
df.head(5).style.background_gradient()

In [None]:
#df.isnull().sum()

In [5]:
# Number of rows per value of the `Class` label.
df.Class.value_counts()

In [None]:
#df.info()

In [6]:
# Bar chart of how many rows fall into each `Class` value.
sns.countplot(data=df, x='Class', palette='GnBu')

In [7]:
# Bar plot of `Amount` grouped by `Class`.
sns.barplot(data=df, x='Class', y='Amount', palette='PuRd')

In [8]:
# Bar plot of `Time` grouped by `Class`.
sns.barplot(data=df, x='Class', y='Time', palette='Purples')

In [10]:
# Separate the rows by label for analysis: Class == 0 -> normal, Class == 1 -> fraud.
normal = df.loc[df['Class'] == 0]
fraud = df.loc[df['Class'] == 1]

In [11]:
# Show (rows, columns) for each subset, one per line.
print(normal.shape, fraud.shape, sep='\n')

In [12]:
# Summary statistics of the Amount column for the normal subset.
normal['Amount'].describe()

In [13]:
# Summary statistics of the Amount column for the fraud subset.
fraud['Amount'].describe()

# Under Sampling

Under-sampling can also be done manually with pandas, by drawing a random sample of the normal rows:

    normal_sample = normal.sample(n=492)

and then combining the two DataFrames row-wise:

    data = pd.concat([normal_sample, fraud], axis=0)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [15]:
# Feature matrix: every column except the label. Target: the `Class` column.
X = df.drop(columns='Class')
y = df['Class']

In [None]:
#X1 = np.array(X)
#y1 = np.array(y)  

In [16]:
# Hold out a test split; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
)

In [17]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler 

In [18]:
# Randomly under-sample the majority class of the TRAINING split until the
# minority/majority ratio reaches 0.8.
# `sampling_strategy` must be passed by keyword: current imbalanced-learn
# releases declare it keyword-only, so `RandomUnderSampler(0.8)` raises TypeError.
# A fixed random_state makes the drawn subset (and every metric below) repeatable.
ns = RandomUnderSampler(sampling_strategy=0.8, random_state=101)
X_train_ns, y_train_ns = ns.fit_resample(X_train, y_train)
print('the number of classes before the fit {}'.format(Counter(y_train)))
print('the number of classes after the fit {}'.format(Counter(y_train_ns)))

In [19]:
from sklearn.ensemble import RandomForestClassifier as RFC
# Random forest trained on the under-sampled training split.
# The forest is stochastic (bootstrap rows, random feature subsets); seeding it
# keeps the reported scores stable across Restart & Run All.
cls = RFC(random_state=101)
cls.fit(X_train_ns, y_train_ns)

In [20]:
# Under-sample the test split with the same 0.8 ratio.
# `sampling_strategy` is keyword-only in current imbalanced-learn releases, and
# a fixed random_state keeps the evaluation subset repeatable.
# NOTE(review): scoring on a resampled test set does not reflect the real class
# balance; consider evaluating on the untouched X_test / y_test as well.
ns = RandomUnderSampler(sampling_strategy=0.8, random_state=101)
X_test_ns, y_test_ns = ns.fit_resample(X_test, y_test)

In [21]:
# Shapes of the resampled test labels and features, separated by blank lines.
print(y_test_ns.shape, '\n\n', X_test_ns.shape, sep='\n')

In [22]:
# Predict with `cls` on the resampled test split, then report per-class
# metrics followed by the overall accuracy.
y_pred1 = cls.predict(X=X_test_ns)
print(classification_report(y_true=y_test_ns, y_pred=y_pred1))
print(accuracy_score(y_true=y_test_ns, y_pred=y_pred1))

In [23]:
# Display names for the two label values, in label order (0, 1).
classes = 'Normal', 'Fraud'
classes

In [57]:
# Confusion matrix of the predictions currently held in `y_pred1`.
# sklearn's confusion_matrix puts the TRUE labels on the rows and the PREDICTED
# labels on the columns. The heatmap draws rows along the y axis and columns
# along the x axis, so x is "Predicted" and y is "Actual" (the labels were
# previously swapped).
cm = confusion_matrix(y_test_ns, y_pred1)
fig, ax = plt.subplots(figsize=(10, 6))
a = sns.color_palette("winter_r")  # the "_r" suffix reverses the palette
sns.heatmap(
    cm,
    annot=True,
    fmt='g',
    linewidths=1,
    linecolor='white',
    robust=True,
    annot_kws={"size": 18},  # annotation text settings
    cmap=a,
    ax=ax,  # draw on the axes created above rather than the implicit current axes
)
ax.xaxis.set_ticklabels(classes)
ax.yaxis.set_ticklabels(classes)
plt.yticks(va="center")
plt.title('Confusion Matrix', fontsize=18, pad=18)
plt.xlabel('Predicted class', labelpad=22, fontsize=14)
plt.ylabel('Actual class', labelpad=22, fontsize=14);

In [74]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
# Bernoulli naive Bayes classifier with default settings; fitted in a later cell.
bnb = BernoulliNB()

In [66]:
# Train the naive Bayes model on the under-sampled training split.
bnb.fit(X=X_train_ns, y=y_train_ns)

In [68]:
# Hard-label predictions from naive Bayes.
# Note: this rebinds `y_pred1`, which previously held the random-forest predictions.
y_pred1 = bnb.predict(X=X_test_ns)

In [69]:
# Per-class precision / recall / F1 for the predictions in `y_pred1`.
print(classification_report(y_true=y_test_ns, y_pred=y_pred1))

In [70]:
# Build an ensemble score: collect each model's predicted probability for the
# positive class (column 1 of predict_proba) and average them row-wise.
pred = []
for model in [cls, bnb]:
    pred.append(pd.Series(model.predict_proba(X_test_ns)[:, 1]))

# Average once, after every model has contributed (previously recomputed and
# printed on each loop iteration).
final_prediction = pd.concat(pred, axis=1).mean(axis=1)

# Score the averaged probabilities themselves; the earlier code scored
# `y_pred1`, i.e. the hard labels of a single model, not the ensemble.
print('Test ROC-AUC: {}'.format(roc_auc_score(y_test_ns, final_prediction)))

In [71]:
# Side-by-side view of the per-model positive-class probabilities.
pd.concat(pred, axis='columns')

In [72]:
# Display the row-wise mean of the per-model probabilities.
final_prediction

In [73]:
# ROC curve of the averaged probabilities; keep the candidate thresholds
# for the accuracy sweep and display them.
fpr, tpr, thresholds = roc_curve(y_true=y_test_ns, y_score=final_prediction)
thresholds

In [77]:
# Accuracy obtained at each candidate threshold returned by roc_curve.
# roc_curve treats a sample as positive when score >= threshold, so the same
# comparison is used here (a strict `>` would misclassify samples that sit
# exactly on a threshold and would not match the ROC points).
accuracy = pd.DataFrame({
    'thresholds': thresholds,
    'accuracy': [
        accuracy_score(y_test_ns, np.where(final_prediction >= thres, 1, 0), normalize=True)
        for thres in thresholds
    ],
})
# Best thresholds first; return a new frame instead of mutating in place so the
# cell stays idempotent on re-run.
accuracy = accuracy.sort_values(by='accuracy', ascending=False)
accuracy.head()

In [82]:
# ROC plotting helper built on the fpr / tpr arrays.
def plot_curve(fpr, tpr):
    """Plot a ROC curve with a diagonal reference line and show the figure.

    Parameters
    ----------
    fpr : sequence of false positive rates (x values).
    tpr : sequence of true positive rates (y values).
    """
    plt.plot(fpr, tpr, label='ROC', color='orange')
    # Diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], color='darkblue')
    plt.title('Receiver Operating Characteristic Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

In [83]:
# Draw the ROC curve computed for the averaged probabilities.
plot_curve(fpr=fpr, tpr=tpr)

# OverSampling

In [32]:
from sklearn.utils import resample

In [33]:
# Fresh 75/25 split for the over-sampling experiment (seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=41,
    test_size=0.25,
)

In [34]:
# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

In [35]:
#Seperating fraud and normal transaction
not_fraud = X[X.Class==0]
fraud = X[X.Class==1]

In [36]:
# Show (rows, columns) for each class subset, one per line.
print(not_fraud.shape, fraud.shape, sep='\n')

In [37]:
# Draw fraud rows with replacement until there are as many as not_fraud rows;
# the seed makes the draw repeatable.
fraud_upsampled = resample(
    fraud,
    n_samples=len(not_fraud),
    replace=True,
    random_state=27,
)

In [38]:
# Stack the not_fraud rows and the up-sampled fraud rows into one frame.
upsampled = pd.concat([not_fraud, fraud_upsampled], axis=0)

In [39]:
# Class counts after up-sampling.
upsampled['Class'].value_counts()

In [40]:
# (rows, columns) of the combined up-sampled frame.
upsampled.shape

In [41]:
# Split the up-sampled frame back into features and label.
# Note: this rebinds X_train / y_train from the earlier split.
X_train = upsampled.drop(columns='Class')
y_train = upsampled['Class']

In [42]:
# Refit the existing `cls` estimator on the up-sampled training data.
cls.fit(X=X_train, y=y_train)

In [43]:
# Predict labels for the untouched test split.
y_pred = cls.predict(X=X_test)

In [44]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

# Using SMOTE

In [46]:
from imblearn.over_sampling import SMOTE

In [47]:
# Rebuild features and label from the original DataFrame for the SMOTE run.
X = df.drop(columns='Class')
y = df['Class']

In [48]:
# 75/25 split for the SMOTE experiment (seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=41,
    test_size=0.25,
)

In [49]:
# SMOTE over-sampler. SMOTE draws random neighbours when synthesising samples,
# so it is seeded to keep the resampled training set repeatable across runs.
oversample = SMOTE(random_state=41)

In [50]:
# Over-sample the training split only; the test split is left untouched.
# Note: this rebinds X_train / y_train, so re-running the cell resamples
# already-resampled data.
X_train, y_train = oversample.fit_resample(X=X_train, y=y_train)

In [51]:
from sklearn.ensemble import RandomForestClassifier
# Fresh random forest for the SMOTE experiment, seeded so the reported metrics
# are reproducible. Other knobs tried earlier: max_depth=2, n_estimators=10.
cls = RandomForestClassifier(random_state=42)

In [52]:
# Train the forest on the SMOTE-resampled training data.
cls.fit(X=X_train, y=y_train)

In [53]:
# Predict labels for the untouched test split.
y_pred = cls.predict(X=X_test)

In [54]:
# Per-class metrics followed by overall accuracy, mirroring the report printed
# for the under-sampling experiment. The stray `''''''` wrappers made this cell
# a SyntaxError (an empty string literal directly adjacent to a call).
print(classification_report(y_test, y_pred))
print('\n\n')
print(accuracy_score(y_test, y_pred))

In [56]:
#print("Train accuracy",rfc.score(X_train,y_train))