In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Data loading: read creditcard.csv into a DataFrame and print its first
# five rows as a quick sanity check.
datos = pd.read_csv(filepath_or_buffer="creditcard.csv")
print(datos.head(n=5))

In [None]:
# Tally how many rows carry each value of the "Class" column, most frequent
# value first, and print the resulting counts.
class_column = datos["Class"]
nr_classes = class_column.value_counts(sort=True, ascending=False)
print(nr_classes)

In [None]:
# Bar chart of the per-class row counts.
ax = nr_classes.plot.bar(rot=0)

# Tick labels are assigned by bar position, not by class value.
# NOTE(review): this assumes the first bar is class 0 (i.e. class 0 is the
# most frequent value in nr_classes) -- confirm against the printed counts.
ax.set_xticks(range(2))
ax.set_xticklabels(["Normal", "Fraudulento"])

ax.set_title("Distribución de los Datos")
ax.set_xlabel("Clases")
ax.set_ylabel("Cantidad")
plt.show()

In [None]:
# Split the rows by label: Class == 0 -> normales, Class == 1 -> fraudulentos.
normales = datos.loc[datos["Class"] == 0]
fraudulentos = datos.loc[datos["Class"] == 1]

# Scatter Amount against Time for both groups on the same axes.
# Time is divided by 3600 so the x axis reads in hours (per the axis label).
for subset, color, etiqueta in (
    (normales, "blue", "Normales"),
    (fraudulentos, "red", "Fraudulentos"),
):
    plt.scatter(
        subset["Time"] / 3600,
        subset["Amount"],
        alpha=0.5,
        c=color,
        label=etiqueta,
        s=3,
    )

plt.xlabel("Tiempo desde la primera transaccion (h)")
plt.ylabel("Cantidad (euros)")
plt.legend(loc="upper right")
plt.show()

In [None]:
import numpy as np

# Compare the distribution of transaction amounts for both classes, looking
# only at amounts between 200 and 2500 (100 edges -> 99 equal-width bins).
bins = np.linspace(200, 2500, 100)

# density=True normalises each histogram independently to unit area, so the
# y axis is a probability density, not a percentage of transactions.
plt.hist(normales.Amount, bins, alpha=1, density=True, label="Normales", color="blue")
plt.hist(fraudulentos.Amount, bins, alpha=0.6, density=True, label="Fraudulentos", color="red")
plt.legend(loc="upper right")
plt.xlabel("Cantidad (euros)")  # closing parenthesis was missing
plt.ylabel("Densidad de transacciones")  # was labelled as a percentage
plt.show()

In [None]:
import matplotlib.gridspec as gs
import matplotlib.colors
import seaborn as sns

# Names of the columns at positions 1..28 of `datos`; one figure is drawn
# per column.
var = datos.iloc[:, 1:29].columns
# NOTE(review): `grid` is not used by the loop below (sns.displot creates its
# own figure); kept only in case another cell relies on it.
grid = gs.GridSpec(28, 1)

# Iterate over the column names directly instead of building a temporary
# `datos[var]` copy and enumerating it with an index that was never used.
for cn in var:
    # Per-class density histogram of this column. common_norm=False
    # normalises each class on its own.
    sns.displot(data=datos, x=cn, hue="Class", bins=50, stat="density", common_norm=False)
    plt.title("Histograma Caracteristica: " + str(cn))
    plt.show()

In [None]:
#Procesamiento de Datos

from sklearn.preprocessing import StandardScaler

# Remove the "Time" column. Rebinding `datos` with errors="ignore" (instead of
# an in-place drop) keeps this cell re-runnable: a second execution no longer
# raises KeyError because "Time" is already gone.
datos = datos.drop(columns=["Time"], errors="ignore")

# Standardise "Amount" (StandardScaler defaults: zero mean, unit variance).
# The scaler needs a 2-D input, hence the reshape to a single column.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook -- confirm that is intended.
datos["Amount"] = StandardScaler().fit_transform(datos["Amount"].values.reshape(-1, 1))

In [None]:
#Train y Test

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (fixed random_state for a repeatable split).
train_df, test_df = train_test_split(datos, test_size=0.2, random_state=42)

# Training matrix: only rows with Class == 0, with the label column removed.
solo_normales = train_df[train_df["Class"] == 0]
x_train = solo_normales.drop(columns=["Class"]).values

# Test set keeps every row; labels go to y_test, features to x_test.
y_test = test_df["Class"]
x_test = test_df.drop(columns=["Class"]).values

In [None]:
import random

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import Model
from tensorflow.keras.optimizers import SGD

# Seed Python's, NumPy's and TensorFlow's random number generators.
# Seeding NumPy alone does not cover the generators TensorFlow/Keras draw
# from when initialising layer weights, so runs were not repeatable.
# This is best-effort: some ops may remain non-deterministic.
random.seed(5)
np.random.seed(5)
tf.random.set_seed(5)

# Width of the network's input and output: the number of feature columns
# in the training matrix.
dim_entrada = x_train.shape[1]

capa_entrada = Input(shape=(dim_entrada,))

# Encoder: dim_entrada -> 20 -> 14 units.
encoder = Dense(20, activation="tanh")(capa_entrada)
encoder = Dense(14, activation="relu")(encoder)

# Decoder: 14 -> 20 -> dim_entrada units. The output width is tied to
# dim_entrada instead of a hard-coded 29, so the model still reconstructs
# its input if the number of feature columns changes.
decoder = Dense(20, activation="tanh")(encoder)
# NOTE(review): a relu output layer can only emit values >= 0, while the
# standardised "Amount" column contains negative values that therefore cannot
# be reconstructed. Left unchanged because the threshold chosen later in the
# notebook was tuned against this architecture -- consider a linear output.
decoder = Dense(dim_entrada, activation="relu")(decoder)

autoencoder = Model(inputs=capa_entrada, outputs=decoder)

# Optimizer instance intended for autoencoder.compile().
sgd = SGD(learning_rate=0.01)




In [None]:
# Fit the model: the autoencoder is trained to reproduce its own input, so
# x_train is passed as both features and target.
# Pass the configured `sgd` instance rather than the string "sgd"; the string
# makes Keras build a fresh default optimizer and silently ignores any
# settings applied to `sgd` where it is defined.
autoencoder.compile(optimizer=sgd, loss="mse")
# x_test is supplied as validation data, so its reconstruction loss is
# reported after every epoch.
autoencoder.fit(
    x_train,
    x_train,
    epochs=100,
    batch_size=32,
    shuffle=True,
    validation_data=(x_test, x_test),
    verbose=1,
)

In [None]:
# Reconstruct the test set with the trained autoencoder and score every row
# by its mean squared reconstruction error (averaged across columns).
x_pred = autoencoder.predict(x_test)
diferencia = x_test - x_pred
mse = np.power(diferencia, 2).mean(axis=1)
print(x_pred.shape)

In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_curve

# Precision and recall obtained when the reconstruction error `mse` is used
# as the anomaly score, for every candidate threshold.
precision, recall, umbral = precision_recall_curve(y_test, mse)

# precision and recall have one element more than the thresholds: element i
# corresponds to umbral[i], and the extra LAST element (precision=1, recall=0)
# has no threshold. Dropping the last element ([:-1]) keeps the curves aligned
# with their thresholds; the previous [1:] slice shifted them by one position.
plt.plot(umbral, precision[:-1], label="Precision")
plt.plot(umbral, recall[:-1], label="Recall")
plt.title("Precision y Recall para distintos Umbrales")
plt.xlabel("Umbral")
plt.ylabel("Precision/Recall")
plt.legend()
plt.show()

In [None]:
# False positives are acceptable but false negatives are not, so the threshold
# is chosen with that priority from the precision/recall curves.
# NOTE(review): the original note was partly garbled -- confirm the exact
# criterion used to arrive at 0.75.
umbral = 0.75

# Label each row from its reconstruction error: 1 (fraudulent) when the error
# is strictly above the threshold, otherwise 0 (normal).
y_pred = np.where(mse > umbral, 1, 0).tolist()

conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
