In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline

from preprocess import cargar_y_preprocesar_datos
from utils import print_score

plt.style.use('ggplot')
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
X_train, X_val, X_test, y_train, y_val, y_test = cargar_y_preprocesar_datos("./data/credit_card_fraud_dataset.csv")

#### Modelo SVM Base

In [3]:
# Modelo SVM base con class_weight='balanced' para manejar el desbalance
svm_base = SVC(
    kernel='rbf',
    class_weight='balanced',
    probability=True,
    random_state=42
)

svm_base.fit(X_train, y_train)

print_score(svm_base, X_train, y_train, X_val, y_val, train=True)
print_score(svm_base, X_train, y_train, X_val, y_val, train=False)

Train Result:
Accuracy Score: 52.67%
_______________________________________________
CLASSIFICATION REPORT:
                      0           1  accuracy     macro avg  weighted avg
precision      0.992720    0.012998  0.526667      0.502859      0.982923
recall         0.525741    0.618333  0.526667      0.572037      0.526667
f1-score       0.687424    0.025462  0.526667      0.356443      0.680805
support    59400.000000  600.000000  0.526667  60000.000000  60000.000000
_______________________________________________
Confusion Matrix: 
 [[31229 28171]
 [  229   371]]

Test Result:
Accuracy Score: 52.46%
_______________________________________________
CLASSIFICATION REPORT:
                      0           1  accuracy     macro avg  weighted avg
precision      0.989723    0.009693   0.52465      0.499708      0.979923
recall         0.525303    0.460000   0.52465      0.492652      0.524650
f1-score       0.686331    0.018987   0.52465      0.352659      0.679657
support    19800.00

#### SVM con SMOTE

In [4]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"Distribución original en entrenamiento:")
print(y_train.value_counts())
print(f"\nDistribución después de SMOTE:")
print(pd.Series(y_train_smote).value_counts())

svm_smote = SVC(
    kernel='rbf',
    probability=True,
    random_state=42
) 

svm_smote.fit(X_train_smote, y_train_smote)

print_score(svm_smote, X_train_smote, y_train_smote, X_val, y_val, train=True)
print_score(svm_smote, X_train_smote, y_train_smote, X_val, y_val, train=False)

Distribución original en entrenamiento:
IsFraud
0    59400
1      600
Name: count, dtype: int64

Distribución después de SMOTE:
IsFraud
0    59400
1    59400
Name: count, dtype: int64
Train Result:
Accuracy Score: 83.84%
_______________________________________________
CLASSIFICATION REPORT:
                      0             1  accuracy      macro avg   weighted avg
precision      0.755696      1.000000  0.838359       0.877848       0.877848
recall         1.000000      0.676717  0.838359       0.838359       0.838359
f1-score       0.860851      0.807193  0.838359       0.834022       0.834022
support    59400.000000  59400.000000  0.838359  118800.000000  118800.000000
_______________________________________________
Confusion Matrix: 
 [[59400     0]
 [19203 40197]]

Test Result:
Accuracy Score: 99.00%
_______________________________________________
CLASSIFICATION REPORT:
                      0      1  accuracy     macro avg  weighted avg
precision      0.990000    0.0      0.99  

#### Optimización de Hiperparámetros

In [None]:
# espacio de búsqueda de hiperparámetros
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear']
}

# Usar StratifiedKFold para validación cruzada
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

svm_grid = GridSearchCV(
    SVC(class_weight='balanced', probability=True, random_state=42),
    param_grid=param_grid,
    cv=cv,
    scoring='f1', 
    n_jobs=-1,
    verbose=1
)

svm_grid.fit(X_train, y_train)

print("\nMejores parámetros encontrados:")
print(svm_grid.best_params_)
print(f"\nMejor score F1 en validación cruzada: {svm_grid.best_score_:.4f}")

best_svm = svm_grid.best_estimator_

Fitting 3 folds for each of 40 candidates, totalling 120 fits


##### Evaluación del Modelo Optimizado

In [None]:
print_score(best_svm, X_train, y_train, X_val, y_val, train=True)
print_score(best_svm, X_train, y_train, X_val, y_val, train=False)

y_pred_val = best_svm.predict(X_val)
y_proba_val = best_svm.predict_proba(X_val)[:, 1]

print(f"\nMétricas adicionales en validación:")
print(f"ROC-AUC Score: {roc_auc_score(y_val, y_proba_val):.4f}")

precision, recall, _ = precision_recall_curve(y_val, y_proba_val)
pr_auc = auc(recall, precision)

plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(recall, precision, label=f'SVM Optimizado (AUC = {pr_auc:.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Curva Precision-Recall')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
plt.hist(y_proba_val[y_val == 0], bins=50, alpha=0.7, label='No Fraude', density=True)
plt.hist(y_proba_val[y_val == 1], bins=50, alpha=0.7, label='Fraude', density=True)
plt.xlabel('Probabilidad de Fraude')
plt.ylabel('Densidad')
plt.title('Distribución de Probabilidades')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

##### Evaluación Final en Conjunto de Prueba

In [None]:
print_score(best_svm, X_train, y_train, X_test, y_test, train=False)

# Métricas adicionales en conjunto de prueba
y_pred_test = best_svm.predict(X_test)
y_proba_test = best_svm.predict_proba(X_test)[:, 1]

# Calcular métricas específicas
accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)
f1 = f1_score(y_test, y_pred_test)
roc_auc = roc_auc_score(y_test, y_proba_test)

# Curva Precision-Recall para conjunto de prueba
precision_test, recall_test, _ = precision_recall_curve(y_test, y_proba_test)
pr_auc_test = auc(recall_test, precision_test)

metrics_df = pd.DataFrame({
    'Métrica': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'PR-AUC'],
    'Valor': [accuracy, precision, recall, f1, roc_auc, pr_auc_test]
})

print("\nResumen de métricas en conjunto de prueba:")
print(metrics_df.to_string(index=False))

print("\n" + "="*50)
print("INTERPRETACIÓN DE RESULTADOS")
print("="*50)
print("\nPara detección de fraude, las métricas más importantes son:")
print(f"• Recall: {recall:.3f} - Porcentaje de fraudes reales detectados")
print(f"• Precision: {precision:.3f} - Porcentaje de alertas que son fraudes reales")
print(f"• F1-Score: {f1:.3f} - Balance entre precision y recall")
print(f"• ROC-AUC: {roc_auc:.3f} - Capacidad de discriminación general")
print(f"• PR-AUC: {pr_auc_test:.3f} - Rendimiento en clases desbalanceadas")

##### Comparación de Modelos

In [None]:
# Comparar todos los modelos en el conjunto de validación
models = {
    'SVM Base': svm_base,
    'SVM + SMOTE': svm_smote,
    'SVM Optimizado': best_svm
}

comparison_results = []

for name, model in models.items():
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]
    
    metrics = {
        'Modelo': name,
        'Accuracy': accuracy_score(y_val, y_pred),
        'Precision': precision_score(y_val, y_pred),
        'Recall': recall_score(y_val, y_pred),
        'F1-Score': f1_score(y_val, y_pred),
        'ROC-AUC': roc_auc_score(y_val, y_proba)
    }
    comparison_results.append(metrics)

comparison_df = pd.DataFrame(comparison_results)
print("\nComparación de modelos en conjunto de validación:")
print(comparison_df.round(4).to_string(index=False))

# Visualización de la comparación
plt.figure(figsize=(12, 8))

metrics_to_plot = ['Precision', 'Recall', 'F1-Score', 'ROC-AUC']
x = np.arange(len(metrics_to_plot))
width = 0.25

for i, (_, row) in enumerate(comparison_df.iterrows()):
    values = [row[metric] for metric in metrics_to_plot]
    plt.bar(x + i*width, values, width, label=row['Modelo'])

plt.xlabel('Métricas')
plt.ylabel('Valor')
plt.title('Comparación de Modelos SVM')
plt.xticks(x + width, metrics_to_plot)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()