In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, DoubleType

from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Create (or reuse) the Spark session for this notebook.
# The driver host and bind address are both pinned to the loopback interface.
_LOOPBACK = "127.0.0.1"

_session_builder = SparkSession.builder.master("local[*]")
_session_builder = _session_builder.appName("CreditCardFraudDetection_Modeling")
_session_builder = _session_builder.config("spark.driver.host", _LOOPBACK)
_session_builder = _session_builder.config("spark.driver.bindAddress", _LOOPBACK)

spark = _session_builder.getOrCreate()


In [2]:
data_path = "../data/creditcard.csv"

# Read the CSV using its header row and let Spark infer the column types.
df = spark.read.csv(
    data_path,
    header=True,
    inferSchema=True
)

df.printSchema()
df.show(5)

# Cast Class to integer and every other column to double in a single
# projection. Calling withColumn once per column inside a loop adds one
# projection per call and bloats the query plan; one select() produces the
# same columns, in the same order, with the same types.
numeric_cols = [c for c in df.columns if c != "Class"]
df_clean = df.select(
    *[
        col(name).cast(IntegerType() if name == "Class" else DoubleType()).alias(name)
        for name in df.columns
    ]
)

df_clean.printSchema()
# Row count per label value.
df_clean.groupBy("Class").count().show()


root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (nulla

In [3]:
# Every column except the label "Class" is used as a model input.
feature_cols = list(filter(lambda name: name != "Class", df_clean.columns))

# Stage 1: pack the input columns into one vector column, "features_raw".
assembler = VectorAssembler(outputCol="features_raw", inputCols=feature_cols)

# Stage 2: scale "features_raw" by standard deviation only (no mean
# centering) and write the result to "features".
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    outputCol="features",
    inputCol="features_raw",
)

# Seeded random split with weights 0.8 (train) and 0.2 (test).
train_df, test_df = df_clean.randomSplit([0.8, 0.2], seed=42)

# Report the size of each split; each count() triggers a Spark job.
for split_name, split_df in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name}: {split_df.count()} filas")


Train: 228225 filas
Test: 56582 filas


In [4]:
# Split the type-normalised frame (df_clean), not the raw CSV frame (df), so
# that training and evaluation use the columns that were explicitly cast to
# integer/double rather than whatever schema inference happened to produce.
# Same weights and seed as before: 0.8 train / 0.2 test, seed 42.
train_df, test_df = df_clean.randomSplit([0.8, 0.2], seed=42)

print(f"Train: {train_df.count()} filas")
print(f"Test: {test_df.count()} filas")


Train: 228225 filas
Test: 56582 filas


In [5]:
# Logistic regression on the scaled "features" column, predicting "Class".
# Regularisation is switched off (regParam=0.0, elasticNetParam=0.0) and the
# optimiser is capped at 10 iterations.
lr_params = dict(
    featuresCol="features",
    labelCol="Class",
    maxIter=10,
    regParam=0.0,
    elasticNetParam=0.0,
)
lr = LogisticRegression(**lr_params)

# Full pipeline: assemble the vector -> scale it -> logistic regression.
lr_stages = [assembler, scaler, lr]
pipeline_lr = Pipeline(stages=lr_stages)


In [6]:
# Random forest on the scaled "features" column, predicting "Class":
# 100 trees, maximum depth 5, fixed seed.
rf_params = dict(
    featuresCol="features",
    labelCol="Class",
    numTrees=100,
    maxDepth=5,
    seed=42,
)
rf = RandomForestClassifier(**rf_params)

# Full pipeline: assemble the vector -> scale it -> random forest.
rf_stages = [assembler, scaler, rf]
pipeline_rf = Pipeline(stages=rf_stages)


In [7]:
# Fit both pipelines on the training split, in order:
# logistic regression first, then random forest.
lr_model, rf_model = [
    pipeline.fit(train_df)
    for pipeline in (pipeline_lr, pipeline_rf)
]


In [8]:
# Score the held-out test split with each fitted pipeline.
lr_preds = lr_model.transform(test_df)
rf_preds = rf_model.transform(test_df)

# Preview five rows per model: scaled features, true label, class
# probabilities and the predicted label.
preview_columns = ["features", "Class", "probability", "prediction"]
for model_preds in (lr_preds, rf_preds):
    model_preds.select(*preview_columns).show(5)


+--------------------+-----+--------------------+----------+
|            features|Class|         probability|prediction|
+--------------------+-----+--------------------+----------+
|[2.10452018766574...|    0|[0.99952732223518...|       0.0|
|[8.41808075066298...|    0|[0.99968052218950...|       0.0|
|[1.47316413136602...|    0|[0.99963368134858...|       0.0|
|[2.31497220643232...|    0|[0.99911553370910...|       0.0|
|[3.15678028149861...|    0|[0.99993092221894...|       0.0|
+--------------------+-----+--------------------+----------+
only showing top 5 rows
+--------------------+-----+--------------------+----------+
|            features|Class|         probability|prediction|
+--------------------+-----+--------------------+----------+
|[2.10452018766574...|    0|[0.99969931267151...|       0.0|
|[8.41808075066298...|    0|[0.99979458353204...|       0.0|
|[1.47316413136602...|    0|[0.99973234791809...|       0.0|
|[2.31497220643232...|    0|[0.99968782396745...|       0.0|


In [9]:
# Area under the ROC curve, computed from the "rawPrediction" column.
binary_evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol="Class",
)

# The two multiclass evaluators differ only in the metric they report, so
# they share the label/prediction column settings.
multiclass_columns = {"labelCol": "Class", "predictionCol": "prediction"}

multi_evaluator_accuracy = MulticlassClassificationEvaluator(
    metricName="accuracy", **multiclass_columns
)
multi_evaluator_f1 = MulticlassClassificationEvaluator(
    metricName="f1", **multiclass_columns
)


In [10]:
# Evaluate the logistic-regression predictions with each evaluator, in the
# order AUC-ROC, accuracy, F1.
lr_auc, lr_accuracy, lr_f1 = [
    evaluator.evaluate(lr_preds)
    for evaluator in (binary_evaluator, multi_evaluator_accuracy, multi_evaluator_f1)
]

# Emit the report as one print call, one metric per line.
print(
    "=== Logistic Regression ===",
    f"AUC-ROC : {lr_auc:.4f}",
    f"Accuracy: {lr_accuracy:.4f}",
    f"F1      : {lr_f1:.4f}",
    sep="\n",
)


=== Logistic Regression ===
AUC-ROC : 0.9569
Accuracy: 0.9992
F1      : 0.9991


In [11]:
# Evaluate the random-forest predictions with each evaluator, in the order
# AUC-ROC, accuracy, F1.
rf_auc, rf_accuracy, rf_f1 = [
    evaluator.evaluate(rf_preds)
    for evaluator in (binary_evaluator, multi_evaluator_accuracy, multi_evaluator_f1)
]

# Emit the report as one print call, one metric per line.
print(
    "=== Random Forest ===",
    f"AUC-ROC : {rf_auc:.4f}",
    f"Accuracy: {rf_accuracy:.4f}",
    f"F1      : {rf_f1:.4f}",
    sep="\n",
)


=== Random Forest ===
AUC-ROC : 0.9504
Accuracy: 0.9992
F1      : 0.9992


In [12]:
# One (model name, AUC, accuracy, F1) tuple per model, built column-wise.
model_names = ("Logistic Regression", "Random Forest")
auc_scores = (lr_auc, rf_auc)
accuracy_scores = (lr_accuracy, rf_accuracy)
f1_scores = (lr_f1, rf_f1)
results = list(zip(model_names, auc_scores, accuracy_scores, f1_scores))

# One labelled summary line per model, metrics rounded to four decimals.
for row in results:
    name, auc, acc, f1 = row
    print(f"{name:20s}  AUC: {auc:.4f}  Acc: {acc:.4f}  F1: {f1:.4f}")


Logistic Regression   AUC: 0.9569  Acc: 0.9992  F1: 0.9991
Random Forest         AUC: 0.9504  Acc: 0.9992  F1: 0.9992


In [13]:
# One (model name, AUC, accuracy, F1) tuple per model.
lr_row = ("Logistic Regression", lr_auc, lr_accuracy, lr_f1)
rf_row = ("Random Forest", rf_auc, rf_accuracy, rf_f1)
results = [lr_row, rf_row]

# Title and column header, then one fixed-width row per model.
print("=== COMPARATIVA DE MÉTRICAS ===", "Modelo                 AUC       ACC       F1", sep="\n")
for row in results:
    name, auc, acc, f1 = row
    print(f"{name:20s} {auc:.4f}   {acc:.4f}   {f1:.4f}")


=== COMPARATIVA DE MÉTRICAS ===
Modelo                 AUC       ACC       F1
Logistic Regression  0.9569   0.9992   0.9991
Random Forest        0.9504   0.9992   0.9992
