In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns #for drawing highly attractive and informative statistical graphics

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

In [None]:
# Location of the credit-card transactions CSV.
# Set the CREDITCARD_CSV environment variable to point at your own copy of the
# file; when it is not set, the original author's local path is used as before.
DATA_PATH = os.environ.get('CREDITCARD_CSV', r'G:\ML project skyfi\creditcardfraud\creditcard.csv')
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Summary statistics for every column, shown to two decimal places
data.describe().round(2)

In [None]:
# Show the column names of the dataset and how many columns there are
column_names = data.columns.tolist()
print('Columns : ', column_names)
print('no. of columns : ', len(column_names))

In [None]:
# Count genuine (Class == 0) and fraud (Class == 1) transactions,
# then show the two counts as a pie chart
n_genuine = int(data['Class'].eq(0).sum())
n_fraud = int(data['Class'].eq(1).sum())
print('No. of genuine transaction= ', n_genuine)
print('No. of fraud transaction= ', n_fraud)

plt.pie(
    [n_genuine, n_fraud],
    labels=['Genuine', 'Fraud'],
    radius=1.5,
)
plt.show()

In [None]:
# Separate the features from the labels: the last column holds the labels,
# every column before it is a feature
label_position = data.shape[1] - 1
x = data.iloc[:, :label_position]  # feature columns
y = data.iloc[:, label_position]   # labels
x.head()

In [None]:
# Preview the first entries of the label column y
y.head()

In [None]:
# Feature selection with SelectKBest, scoring each feature with f_classif
# NOTE(review): the selector is fitted on the full dataset, and in this notebook
# its result is only used for plotting; the model cells below train on all
# columns of x.

k = 10  # number of top-scoring features to keep
k_best = SelectKBest(score_func=f_classif, k=k).fit(x, y)
k_best

In [None]:
# Ask the fitted selector which columns of x it kept:
# mask has one entry per feature column, True for the k selected ones
mask = k_best.get_support()
mask

In [None]:
# Complement of the selection mask: True for the features that were NOT selected
not_mask = ~mask
not_mask

In [None]:
# Split the feature names into the selected ("best") and rejected ("bad") groups
all_features = np.array(x.columns.tolist())

best_features, bad_features = all_features[mask], all_features[not_mask]

print('Best Features : ', best_features)
print('Bad Features : ', bad_features)

In [None]:
#visualizing the best and bad features using seaborn

def plot_fraud_genuine(features, data, max_genuine=10000, sample_step=100):
    """Overlay the fraud and genuine distributions of each feature, one subplot per feature.

    Parameters
    ----------
    features : iterable of str
        Column names of ``data`` to plot.
    data : pandas.DataFrame
        Must contain every column named in ``features`` plus a ``'Class'``
        column in which 1 marks fraud rows and 0 marks genuine rows.
    max_genuine : int, default 10000
        The genuine rows are thinned only when there are more than this many.
    sample_step : int, default 100
        When thinning, every ``sample_step``-th genuine row is kept.

    Returns
    -------
    None
        The plots are drawn on a new matplotlib figure.
    """
    features = list(features)
    n_cols = 5
    # Keep the original 5x5 grid, but add rows when more than 25 features are
    # passed so plt.subplot does not fail on an out-of-range index.
    n_rows = max(5, -(-len(features) // n_cols))

    # The row selectors do not depend on the feature, so build them once.
    is_fraud = data['Class'] == 1
    is_genuine = data['Class'] == 0

    plt.figure(figsize=(10, 10))
    plt.subplots_adjust(top=0.99, bottom=0.01, hspace=1.5, wspace=0.4)

    for plt_index, feature in enumerate(features, start=1):
        fraud = data.loc[is_fraud, feature]
        genuine = data.loc[is_genuine, feature]
        # Compare the row COUNT with the threshold. The previous
        # `len(genuine>10000)` measured the length of a boolean frame, so it
        # was truthy for any non-empty data regardless of its size.
        if len(genuine) > max_genuine:
            genuine = genuine[::sample_step]
        plt.subplot(n_rows, n_cols, plt_index)
        # NOTE(review): sns.distplot is deprecated in newer seaborn releases;
        # it is kept here so the figures look the same as before.
        sns.distplot(fraud)
        sns.distplot(genuine)
        plt.title(feature)

In [None]:
# Draw the fraud-vs-genuine distribution plots for the features in best_features
plot_fraud_genuine(best_features, data)

In [None]:
# Split the data into training (80%) and testing (20%) sets.
# random_state makes the split reproducible across kernel restarts, and
# stratify=y keeps the fraud/genuine ratio the same in both splits, which
# matters because fraud rows are a very small share of the data.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Cross-validate Gaussian Naive Bayes on the training split. Recall is the
# scoring metric because fraudulent transactions are very rare compared to
# genuine ones.
nb = GaussianNB()
cv_results = cross_validate(nb, x_train, y_train, cv=10, scoring='recall', return_train_score=True, return_estimator=True)

print('Training scores from each fold: ', cv_results['train_score'])
print('Validation scores from each fold: ', cv_results['test_score'])

# Keep the fold estimator with the best recall on its held-out fold.
# Ranking by the training score (as before) favours the estimator that fits
# its own training rows best, not the one that generalises best.
max_score_index = np.argmax(cv_results['test_score'])
best_estimator = cv_results['estimator'][max_score_index]

In [None]:
#function for plotting the confusion matrix
def plot_confusion_matrix( cm, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are drawn as true labels, columns as
        predicted labels.
    classes : sequence of str
        Tick labels, used for both axes.
    normalize : bool, default False
        When True, each row is divided by its sum so the cells show the
        fraction of that true class. Rows that sum to zero stay all-zero.
    title : str or None, default None
        Title passed to the axes.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap for the heat map.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    cm = np.asarray(cm)
    if normalize:
        # Actually normalise the rows; previously the flag only switched the
        # number format and the raw counts were printed as floats.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    # Integer counts are printed as-is; anything else (normalised or float
    # input) gets two decimals, because the 'd' format rejects floats.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    # white text on dark cells, black text on light cells
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
# function for displaying the results using the plot_confusion_matrix function
def display_results(estimator, x, y):
    """Print a classification report and plot the confusion matrix for `estimator` on (x, y).

    The estimator's predictions for `x` are compared against the true labels
    `y`; the matrix is drawn with the class names 'Genuine' and 'Fraud'.
    """
    y_pred = estimator.predict(x)
    matrix = confusion_matrix(y, y_pred)
    print(classification_report(y, y_pred))
    plot_confusion_matrix(
        matrix,
        title='Fraud detection',
        classes=['Genuine', 'Fraud'],
    )
    

In [None]:
# Report and plot how the selected estimator performs on the training split
display_results(best_estimator, x_train, y_train)

In [2]:
# Report and plot how the selected estimator performs on the held-out test split.
# This call needs display_results, best_estimator, x_test and y_test to exist in
# the current kernel session, so the cells that define them must be run first.
display_results(best_estimator, x_test, y_test)

NameError: name 'display_results' is not defined