In [1]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
#Loading dataset
def load_dataset():
    dataset = pd.read_csv('creditcard.csv')
    return dataset

In [3]:
#Shape of dataset
def dataframe_shape():
    return load_dataset().shape

In [4]:
#Check null values
def sum_of_null_values():
    return load_dataset().isnull().sum()

In [5]:
#Check datatypes
def check_datatypes():
    return load_dataset().dtypes

In [6]:
#Describe data
def data_describe():
    return load_dataset().describe()

In [7]:
#Check count of target variable
def check_count_of_target_variable():
    return load_dataset()['Class'].value_counts()

In [8]:
#Correlation matrix
def corr_matrix():
    return load_dataset().corr()

In [9]:
#Plot target count
def plot_target_count():
    return sns.countplot(data=load_dataset(), x='Class')

In [10]:
#Feature scaling
def feature_scaling_amount():
    data = load_dataset()
    sc = StandardScaler()
    scaled = sc.fit_transform(data['Amount'].values.reshape(-1,1))
    data['Amount'] = scaled
    return data

In [11]:
#Drop unnecessary columns
def drop_unnecessary_columns():
    data = load_dataset()
    data.drop('Time',axis=1,inplace=True)
    data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1,1))
    return data

In [12]:
#Drop duplicates
def drop_duplicate_data():
    data = drop_unnecessary_columns()
    data = data.drop_duplicates()
    return data

In [13]:
#Feature separating
def feature_separating_x_y():
    data = drop_duplicate_data()
    X = data.iloc[:,:-1]
    y = data.iloc[:,-1]
    return X,y

In [14]:
#Data balancing
def data_balancing_smote():
    X,y = feature_separating_x_y()
    X_res, y_res = SMOTE().fit_resample(X,y)
    return X_res, y_res

In [15]:
#Splitting dataset
def splitting_dataset():
    X_res, y_res = data_balancing_smote()
    X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.2, random_state = 4)
    return X_train, X_test, y_train, y_test

In [16]:
#Logistic Regression
def fit_logistic_regression():
    X_train, X_test, y_train, y_test = splitting_dataset()
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy, classification_report(y_test, y_pred)

In [17]:
#Linear Discriminant Analysis
def fit_lda():
    X_train, X_test, y_train, y_test = splitting_dataset()
    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, y_train)
    y_pred = lda.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy, classification_report(y_test, y_pred)

In [18]:
#KNN
def fit_knn():
    X_train, X_test, y_train, y_test = splitting_dataset()
    knn = KNeighborsClassifier(n_neighbors=4)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy, classification_report(y_test, y_pred)

In [19]:
#Decision Tree
def fit_decision_tree():
    X_train, X_test, y_train, y_test = splitting_dataset()
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy, classification_report(y_test, y_pred)

In [20]:
#Gaussian Naive Bayes
def fit_GaussianNB():
    X_train, X_test, y_train, y_test = splitting_dataset()
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy, classification_report(y_test, y_pred)

In [21]:
#Random Forest
def fit_random_forest():
    X_train, X_test, y_train, y_test = splitting_dataset()
    rf_classifier = RandomForestClassifier()
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy, classification_report(y_test, y_pred)

In [22]:
#Calling all Classification algorithms
print('Scores of Logistic Regression is',fit_logistic_regression())
print('Scores of Linear Discriminant Analysis is',fit_lda())
print('Scores of K Nearest Neighbor is',fit_knn())
print('Scores of Decision Tree is',fit_decision_tree())
print('Scores of Gaussian Naive Bayes is',fit_GaussianNB())
print('Scores of Random Forest Classifier is',fit_random_forest())

Scores of Logistic Regression is (0.9445655728769214, '              precision    recall  f1-score   support\n\n           0       0.92      0.97      0.95     55147\n           1       0.97      0.91      0.94     54929\n\n    accuracy                           0.94    110076\n   macro avg       0.95      0.94      0.94    110076\nweighted avg       0.95      0.94      0.94    110076\n')
Scores of Linear Discriminant Analysis is (0.9154856644500163, '              precision    recall  f1-score   support\n\n           0       0.86      0.99      0.92     55147\n           1       0.98      0.84      0.91     54929\n\n    accuracy                           0.92    110076\n   macro avg       0.92      0.92      0.92    110076\nweighted avg       0.92      0.92      0.92    110076\n')
Scores of K Nearest Neighbor is (0.9992823140375741, '              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00     55147\n           1       1.00      1.00      1.