In [121]:
# Importar las librerías necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [122]:
# URL del archivo en el bucket público de AWS
url = "https://hybridge-education-machine-learning-datasets.s3.us-east-1.amazonaws.com/Fraud.csv"
df = pd.read_csv(url)

In [123]:
df.head(10)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
5,1,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0
6,1,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0
7,1,PAYMENT,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,0,0
8,1,PAYMENT,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,0,0
9,1,DEBIT,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,0,0


In [135]:
len(df)

6362620

# ¡¡¡6,362,620 OBSERVACIONES!!!

In [136]:
df_copy = df.copy()
#copiamos el dataframe para no modificar el original
df_copy.isnull().sum() #contamos si hay valores vacios

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [137]:
#EDA
fraude_info= df_copy[df_copy['isFraud'] == 1].groupby('type').count()
print(fraude_info)
#podemos ver que la mayoria de fraudes son del tipo 'TRANSFER' y 'CASH_OUT' podemos filtrar del dataframe original solo estos tipos de transacciones para entrenar nuestro modelo
df_copy = df_copy[df_copy['type'].isin(['TRANSFER', 'CASH_OUT'])]


print('transacciones Payment o debit, ', df_copy['type'].isin(['PAYMENT', 'DEBIT']).sum())  #verificamos que no haya transacciones de tipo PAYMENT o DEBIT


          step  amount  nameOrig  oldbalanceOrg  newbalanceOrig  nameDest  \
type                                                                        
CASH_OUT  4116    4116      4116           4116            4116      4116   
TRANSFER  4097    4097      4097           4097            4097      4097   

          oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
type                                                               
CASH_OUT            4116            4116     4116            4116  
TRANSFER            4097            4097     4097            4097  
transacciones Payment o debit,  0


In [138]:
#Drop columnas que no sirven (nameOrig, nameDest, step, isFlaggedFraud)
df_copy.head()
df_copy = df_copy.drop(['nameOrig', 'nameDest', 'step', 'isFlaggedFraud'], axis=1)
df_copy.head()





Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
2,TRANSFER,181.0,181.0,0.0,0.0,0.0,1
3,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1
15,CASH_OUT,229133.94,15325.0,0.0,5083.0,51513.44,0
19,TRANSFER,215310.3,705.0,0.0,22425.0,0.0,0
24,TRANSFER,311685.89,10835.0,0.0,6267.0,2719172.89,0


In [139]:
#para el typo de transacción type hacemos one hot encoding
df_copy = pd.get_dummies(df_copy, columns=['type'], drop_first=True)
df_copy['type_TRANSFER'] = df_copy['type_TRANSFER'].astype(int)
df_copy.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_TRANSFER
2,181.0,181.0,0.0,0.0,0.0,1,1
3,181.0,181.0,0.0,21182.0,0.0,1,0
15,229133.94,15325.0,0.0,5083.0,51513.44,0,0
19,215310.3,705.0,0.0,22425.0,0.0,0,1
24,311685.89,10835.0,0.0,6267.0,2719172.89,0,1


In [140]:

#separar la variable X y Y
X = df_copy.drop('isFraud', axis=1)
Y = df_copy['isFraud']
X.info()
Y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2770409 entries, 2 to 6362619
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   amount          float64
 1   oldbalanceOrg   float64
 2   newbalanceOrig  float64
 3   oldbalanceDest  float64
 4   newbalanceDest  float64
 5   type_TRANSFER   int64  
dtypes: float64(5), int64(1)
memory usage: 148.0 MB
<class 'pandas.core.series.Series'>
Index: 2770409 entries, 2 to 6362619
Series name: isFraud
Non-Null Count    Dtype
--------------    -----
2770409 non-null  int64
dtypes: int64(1)
memory usage: 42.3 MB


In [141]:
#separamos en datos de entrenamiento y prueba
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [142]:
# escalar características, con numeros grandes como amount
scaler = StandardScaler()
to_escalate= ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest','newbalanceDest']
X_train.loc[:,to_escalate] = scaler.fit_transform(X_train[to_escalate])
X_test.loc[:,to_escalate] = scaler.transform(X_test[to_escalate])

In [146]:
#entrenar el modelo de regresión logística
model = LogisticRegression(random_state=42, max_iter=100,class_weight='balanced')
model.fit(X_train, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,100


In [147]:
#predicciones
Y_pred_train = model.predict(X_train)
Y_pred_test = model.predict(X_test)


In [148]:
#evaluar el modelo, recall, precision, f1-score com classification report
print("Reporte de clasificación - Conjunto de Entrenamiento")
print(classification_report(Y_train, Y_pred_train))
print("Reporte de clasificación - Conjunto de Prueba")
print(classification_report(Y_test, Y_pred_test))


Reporte de clasificación - Conjunto de Entrenamiento
              precision    recall  f1-score   support

           0       1.00      0.93      0.97   2209760
           1       0.04      0.88      0.07      6567

    accuracy                           0.93   2216327
   macro avg       0.52      0.91      0.52   2216327
weighted avg       1.00      0.93      0.96   2216327

Reporte de clasificación - Conjunto de Prueba
              precision    recall  f1-score   support

           0       1.00      0.93      0.97    552436
           1       0.04      0.90      0.07      1646

    accuracy                           0.93    554082
   macro avg       0.52      0.91      0.52    554082
weighted avg       1.00      0.93      0.96    554082

