# Data Exploration

In [None]:
import random 
import xgboost

import numpy as np 
import pandas as pd 
import plotly.express as px
import seaborn as sns

import matplotlib.pyplot as plt
import plotly.graph_objects as go

from pandas_datareader import data
from scipy import stats


from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import matplotlib.ticker as ticker
from matplotlib.ticker import FixedFormatter, FixedLocator
from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score

# Let pandas display up to 100 rows before truncating a table.
pd.options.display.max_rows = 100

In [None]:
# Load the credit-card transactions CSV into `df`.
df = pd.read_csv("../input/creditcardfraud/creditcard.csv")
# Light green colormap; also used by the styled tables in later cells.
cm = sns.light_palette("green", as_cmap=True)
# Display the first 30 rows with a green background gradient.
df.head(30).style.background_gradient(cmap=cm)

In [None]:
# Show the column names of `df` as a single-column table called "name".
pd.DataFrame(df.columns, columns=["name"])

In [None]:
# Show the dtype of every column of `df` in a table with one "type" column.
pd.DataFrame(df.dtypes, columns=["type"])

In [None]:
# Summary statistics for every column except 'Time', shaded with the green colormap.
df.loc[:, df.columns!='Time'].describe().style.background_gradient(cmap=cm)

# Data visualizations

In [None]:
def get_random_color():
    """Return a random colour as an uppercase ``#RRGGBB`` hex string."""
    # Draw the three channels in order (red, green, blue), each in 0..255.
    red, green, blue = [random.randint(0, 255) for _ in range(3)]
    return '#%02X%02X%02X' % (red, green, blue)


def get_histplot_central_tendency(df: pd.DataFrame, fields: list):
    """Plot a histogram with KDE for each field and mark its mean, median and mode.

    A new 9x4 figure is created for every entry of ``fields``; the histogram
    is drawn in a random colour and three vertical lines mark the central
    tendency measures.

    Args:
        df: Table that holds the columns to plot.
        fields: Names of the columns of ``df`` to draw, one figure each.
    """
    for field in fields:
        column = df[field]
        _, ax1 = plt.subplots(1, 1, figsize=(9, 4))
        sns.histplot(column.values, ax=ax1, color=get_random_color(), kde=True)

        # (legend label, value, line colour, line style) for each marker.
        # Series.mode() may return several values; only the first is drawn.
        markers = (
            ("Mean", column.mean(), 'r', '--'),
            ("Median", column.median(), 'g', '-'),
            ("Mode", column.mode().values[0], 'b', '-'),
        )
        for label, value, color, linestyle in markers:
            ax1.axvline(value, color=color, linestyle=linestyle, label=label)

        ax1.legend()
        ax1.grid()
        ax1.set_title(f"{field} - Histogram analysis")

In [None]:
# Column names "V1" through "V28"; reused by later cells as the feature list.
features_v = [f"V{i}" for i in range(1, 29)]
# One histogram figure per V column.
get_histplot_central_tendency(df, features_v)

In [None]:
# Number of rows for each distinct value of the 'Class' column.
pd.DataFrame(df["Class"].value_counts())

In [None]:
# Missing-value count per column, largest first, shown as a shaded "count" table.
pd.DataFrame(df.isnull().sum().sort_values(ascending=False), columns=["count"]).style.background_gradient(cmap=cm)

# Correlation heatmap

In [None]:
def get_headmap_credit(df: pd.DataFrame):
    """Draw an annotated heatmap of the pairwise column correlations of ``df``.

    Args:
        df: Table whose columns are correlated with ``DataFrame.corr()``.
    """
    corr = df.corr()
    # Large canvas so that the per-cell annotations stay readable.
    plt.figure(figsize=(35, 35))
    sns.heatmap(corr, annot=True, cmap="YlGnBu", linewidths=0.1, annot_kws={"fontsize": 10})
    # The previous title mentioned house prices, which this data does not contain.
    plt.title("Correlation heatmap - credit card transactions")


In [None]:
# Correlation heatmap over all columns of `df`.
get_headmap_credit(df)

In [None]:
# Scatter plot of 'Amount' (x) against 'Class' (y) in a random colour.
df.plot.scatter(x="Amount", y='Class', color=get_random_color())

In [None]:
from sklearn import preprocessing

def preprocessor_min_max(df: dict, fields: list):
    """Add a min-max scaled ``MinMax_<field>`` column to ``df`` for each field.

    Every field gets its own freshly fitted ``MinMaxScaler``. ``df`` is
    modified in place and also returned.
    """
    for column in fields:
        # The scaler expects a 2-D input, hence the single-column reshape.
        column_values = df[column].values.reshape(-1, 1)
        scaler = preprocessing.MinMaxScaler()
        df[f"MinMax_{column}"] = scaler.fit_transform(column_values)
    return df

In [None]:
# Add a MinMax_<name> column to `df` for each of the V features.
df = preprocessor_min_max(df, features_v)

In [None]:
# Preview the first 10 rows of `df`.
df.head(10)

In [None]:
def get_boxplot_price(df: dict, fields: list):
    """Show one boxplot per field, drawn from its ``MinMax_<field>`` column.

    Each field gets its own 6x4 figure in a random colour, and the figure is
    shown before the next one is created.
    """
    for name in fields:
        scaled_column = f"MinMax_{name}"
        plt.subplots(figsize=(6, 4))
        sns.boxplot(data=df, y=scaled_column, color=get_random_color())
        plt.xticks(rotation=90)
        plt.title(f"Boxplot - {name}")
        plt.show()

# One boxplot per V feature, drawn from the MinMax_<name> columns.
get_boxplot_price(df, features_v)

In [None]:
from imblearn.over_sampling import SMOTE as Smote

# Target is the 'Class' column; predictors are the unscaled V features plus 'Amount'.
y = df['Class']
X = df[features_v + ["Amount"]]


def unbalanced_smote(X_train, y_train, random_state=None):
    """Oversample the training data with SMOTE and return the resampled pair.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed forwarded to ``SMOTE``. The default ``None`` keeps
            the previous unseeded behaviour.

    Returns:
        The ``(X_resampled, y_resampled)`` pair produced by ``fit_resample``.
    """
    return Smote(random_state=random_state).fit_resample(X_train, y_train)


# Split first and oversample only the training part, so the test set
# contains no synthetic samples.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Seed SMOTE like the split; otherwise every run trains on different data.
x_train, y_train = unbalanced_smote(x_train, y_train, random_state=42)
print(x_train.shape, y_train.shape)

In [None]:
# Bar chart of how many training labels fall in each class.
y_train.value_counts().plot(kind='bar', color='orange')
plt.title('Target count')

# Model training and evaluation

In [None]:
from xgboost import plot_importance
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import warnings
# Suppress every UserWarning raised from here on.
warnings.simplefilter(action='ignore', category=UserWarning)

def logistic_regression():
    """Return a one-entry dict mapping "LR" to a new lbfgs LogisticRegression."""
    classifier = LogisticRegression(solver="lbfgs")
    return {"LR": classifier}

def random_forest():
    """Return a one-entry dict mapping "random_forest" to a new RandomForestClassifier."""
    forest_settings = {
        "criterion": 'gini',
        "max_depth": None,
        "max_features": 8,
        "max_leaf_nodes": None,
        "n_estimators": 100,
    }
    return {"random_forest": RandomForestClassifier(**forest_settings)}
def xgboost():
    """Return a one-entry dict mapping "xgboost" to a configured XGBClassifier.

    NOTE: this definition rebinds the name ``xgboost``, which the
    ``import xgboost`` at the top of the file had bound to the module.
    """
    booster_settings = {
        "use_label_encoder": False,
        "learning_rate": 0.3,
        "n_estimators": 300,
        "max_depth": 14,
    }
    return {"xgboost": XGBClassifier(**booster_settings)}



# Take the classifier out of the one-entry dict returned by the xgboost() factory.
default_model = xgboost()["xgboost"]
# `eval_metric` is no longer passed to fit(): 'rmse' is a regression metric,
# no `eval_set` is supplied for it to be evaluated on, and newer xgboost
# releases deprecate and then remove this fit() keyword.
default_model.fit(x_train, y_train)

# Feature importance

In [None]:
# Default figure size for this and all later figures.
plt.rcParams["figure.figsize"] = (10, 8)
# Feature importances of the fitted default model, capped at 100 features.
plot_importance(default_model, max_num_features=100)
plt.show()

# Fit models

In [None]:
#models = {}
#models.update(xgboost())
#models.update(random_forest())
#models.update(logistic_regression())

#print(models)

In [None]:
from sklearn.metrics import log_loss

# Build the name -> estimator mapping here so the loop does not depend on a
# `models` variable that no active cell defines.
models = {**xgboost(), **random_forest(), **logistic_regression()}

# Column-oriented results: one entry per evaluated model in each list.
model_value = {"model": [], "log_loss": [], "acc": []}
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # Log loss is computed from class probabilities, accuracy from hard predictions.
    score_loss = log_loss(y_test, model.predict_proba(x_test))
    score_acc = accuracy_score(y_test, y_pred)
    model_value["model"].append(name)
    model_value["log_loss"].append(score_loss)
    model_value["acc"].append(score_acc)

In [None]:
# One row per model with its log loss and accuracy on the test split.
pd.DataFrame(model_value)