In [3]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from plotly import express as px

from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics

In [4]:
def load_datasest(path):
    """Read a CSV source into a DataFrame.

    Parameters
    ----------
    path : str or file-like
        Local path, URL, or open buffer; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``read_csv`` defaults.
    """
    frame = pd.read_csv(path)
    return frame
# Download the CSV from the public bucket into `df`; this cell needs network access.
df = load_datasest('https://storage.googleapis.com/qwasar-public/track-ds/my_paypal_creditcard.csv')

In [None]:
def summarize_dataset(df):
    """Print the frame's shape followed by its ``DataFrame.info()`` report.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    header_lines = (
        ("Dataset's shape: ", df.shape),
        ("Summarize datasest:",),
    )
    for parts in header_lines:
        print(*parts)
    df.info()
# Show the shape and per-column info of the freshly loaded frame.
summarize_dataset(df)

In [6]:
def clean_dataset(df):
    """Return a cleaned copy of ``df`` with incomplete and duplicate rows removed.

    Rows containing any missing value are dropped first, then exact duplicate
    rows are dropped (the first occurrence is kept). The input frame is left
    untouched, so re-running the calling cell always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame; the original row labels of the surviving rows are kept.
    """
    # No inplace=True: mutating the caller's frame makes earlier cell output stale.
    return df.dropna().drop_duplicates()
# Rebind `df` to the result of clean_dataset (rows with missing values and duplicate rows removed).
df = clean_dataset(df)

In [None]:
# Amount against Time, coloured by Class and split into one panel per Class value.
fig = px.scatter(
    df, x='Time', y='Amount', color='Class', facet_col='Class', template='plotly_dark'
)
fig.show()

In [None]:
# Heatmap of absolute pairwise correlations between all columns.
# update_layout returns the figure itself, so the sizing can be chained.
fig = px.imshow(df.corr().abs()).update_layout(width=1000, height=1000)
fig.show()

In [None]:
# Row count per Class value; the labels are cast to str for the x axis.
fig = px.histogram(
    df,
    x=df['Class'].astype(str),
    color='Class',
    text_auto=True,
).update_layout(height=600, width=600)
fig.show()

In [None]:
# Distribution of the Time column.
fig = px.histogram(data_frame=df, x='Time')
fig.show()

In [None]:
# Count how often each distinct Amount value occurs and keep only the values
# that appear more than 500 times, then plot those values weighted by their count.
# NOTE(review): the > 500 filter applies to the occurrence count, not to the
# monetary amount — confirm that is the intent.
amount = df['Amount'].value_counts().loc[lambda counts: counts > 500]
fig = px.histogram(amount, x=amount.index, y=amount.values)
fig.show()

# Preprocessing for Machine Learning

In [12]:
# Separate the predictor columns (everything except Class) from the target.
x = df.drop(columns='Class')
y = df.loc[:, 'Class']

In [18]:
# Resample x / y with SMOTE; the fixed seed keeps the resampled rows repeatable.
# NOTE(review): this runs before the train/test split further down, so rows
# synthesised from points that later land in the test set can end up in the
# training set — consider splitting first and resampling only the training part.
smote = SMOTE(random_state=42)

x_balanced, y_balanced = smote.fit_resample(x, y)

In [19]:
# Draw the 10,000-row working sample from the feature frame first, then attach
# the labels for just those rows. Row selection depends only on the number of
# rows and the seed, so the same rows are picked as when the label column was
# added beforehand, but x_balanced is no longer modified in place and only the
# small sample gains the Class column.
sample = x_balanced.sample(n=10000, random_state=42)
sample['Class'] = y_balanced.loc[sample.index]

In [20]:
# Drop the large intermediates that are no longer needed. Each name is removed
# independently: a single `del a, b, c` stops at the first missing name and
# leaves the rest alive, and a bare `except` would also hide unrelated errors.
for _stale_name in ('x_balanced', 'y_balanced', 'df'):
    globals().pop(_stale_name, None)
del _stale_name

In [21]:
# Rebind x and y to the sampled rows: predictors without Class, and Class as target.
x = sample.drop(columns='Class')
y = sample.loc[:, 'Class']

In [None]:
# Row count per Class value within the sample; labels are cast to str for the x axis.
fig = px.histogram(
    sample,
    x=sample['Class'].astype(str),
    color='Class',
    text_auto=True,
).update_layout(height=600, width=600)
fig.show()

In [23]:
# Sanity check: (rows, columns) of the sampled feature matrix.
x.shape

(10000, 30)

In [24]:
# Hold out 10% of the sampled rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
)

# Machine Learning

In [25]:
def confusion_matrix(y_true, y_pred):
    """Display the confusion matrix of ``y_pred`` against ``y_true`` as a small heatmap.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    # Use the argument rather than the notebook-level `y_test`, so the helper
    # is correct for whatever label pair the caller passes in.
    conf_matrix = metrics.confusion_matrix(y_true, y_pred)
    fig = px.imshow(conf_matrix, text_auto=True)
    # Compact figure without colour bar: the cell annotations carry the counts.
    fig.update_layout(height=300, width=300, margin=dict(t=10, b=10, l=10, r=10), coloraxis_showscale=False)
    fig.show()

In [26]:
def fit_model(model):
    """Fit ``model`` on the training split and print a short evaluation report.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    The report contains the model's string form, test accuracy, test mean
    squared error, the mean 10-fold cross-validation score on the training
    split, and the test confusion matrix. All scores are printed as
    percentages with two decimals.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)`` and accepted by
        ``cross_val_score``.

    Returns
    -------
    None
        The model is fitted in place; results are printed and plotted.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    # NOTE(review): if Class is a 0/1 label this equals the misclassification
    # rate, which is presumably why it is shown as a percentage — confirm.
    mse = metrics.mean_squared_error(y_test, y_pred)
    cv_mean = np.mean(cross_val_score(model, x_train, y_train, cv=10))

    print('Model name:', str(model))
    # Scale first and format afterwards: rounding before multiplying by 100
    # prints float artefacts such as 7.000000000000001, and int() truncation
    # hides sub-percent differences between models.
    print(f'Accuracy score: {accuracy * 100:.2f}%')
    print(f"Mean squared error: {mse * 100:.2f}%")
    print(f"Cross Validation Score: {cv_mean * 100:.2f}%")
    print("Confusion Matrix: \n")
    confusion_matrix(y_test, y_pred)

In [None]:
# Linear baseline. The solver's default cap of 100 iterations can stop before
# convergence, and the blanket warnings filter at the top of the notebook would
# hide the resulting ConvergenceWarning, so allow more iterations.
# NOTE(review): no feature scaling is applied before this point; scaling the
# inputs may be the better remedy — confirm.
model = LogisticRegression(max_iter=1000)
fit_model(model)

In [None]:
# Seed the tree so the reported scores are repeatable across runs, using the
# same seed as the resampling, sampling and split steps.
model = DecisionTreeClassifier(random_state=42)
fit_model(model)

In [None]:
# Seed the forest (bootstrap draws and feature sub-sampling) so the reported
# scores are repeatable across runs, using the same seed as the earlier steps.
model = RandomForestClassifier(random_state=42)
fit_model(model)

In [None]:
# Support-vector classifier with scikit-learn's default settings.
model = SVC()
fit_model(model=model)

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
model = KNeighborsClassifier()
fit_model(model=model)