In [3]:
from pyspark.sql import SparkSession

# Configure a Spark session that runs entirely on this machine.
# master("local[*]") forces local mode, and the driver host / bind address
# are both pinned to the loopback interface.
session_builder = SparkSession.builder.master("local[*]")
session_builder = session_builder.appName("CreditCardFraudDetection_EDA")
session_builder = session_builder.config("spark.driver.host", "127.0.0.1")
session_builder = session_builder.config("spark.driver.bindAddress", "127.0.0.1")

# getOrCreate() returns the already-active session if one exists.
spark = session_builder.getOrCreate()

# Bare expression so the notebook renders the session summary.
spark


In [4]:
data_path = "../data/creditcard.csv"

# Load the raw CSV:
#   header      -> use the first row as column names
#   inferSchema -> let Spark guess each column's type from the data
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(data_path)
)

# Quick look at the inferred schema and the first few rows.
df.printSchema()
df.show(5)


root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (nulla

In [5]:
# Dataset dimensions: the number of columns comes from the column list,
# the number of rows is counted by Spark.
num_cols = len(df.columns)
num_rows = df.count()

# Report the shape and the full list of column names.
print(
    f"Número de filas: {num_rows}",
    f"Número de columnas: {num_cols}",
    sep="\n",
)
print("Columnas:", df.columns)


Número de filas: 284807
Número de columnas: 31
Columnas: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']


In [6]:
from pyspark.sql.functions import col

# Class balance of the raw data (number of rows per value of "Class").
# groupBy().count() does not guarantee any row order, so sort by the label
# to make the displayed table reproducible across runs.
df.groupBy("Class").count().orderBy("Class").show()


+-----+------+
|Class| count|
+-----+------+
|    1|   492|
|    0|284315|
+-----+------+



In [7]:
# Summary statistics (count, mean, stddev, min, max) restricted to the
# Time and Amount columns.
df.describe("Time", "Amount").show()


+-------+-----------------+------------------+
|summary|             Time|            Amount|
+-------+-----------------+------------------+
|  count|           284807|            284807|
|   mean|94813.85957508067| 88.34961925093077|
| stddev|47488.14595456596|250.12010924018867|
|    min|              0.0|               0.0|
|    max|         172792.0|          25691.16|
+-------+-----------------+------------------+



In [8]:
from pyspark.sql.functions import isnan, when, count

# Count missing values per column in a single pass.
# when() without otherwise() yields null for rows that do not match, and
# count() skips nulls, so each aggregate counts only the rows whose value is
# null or NaN. The result is one row with one counter per column.
null_counts = df.select(
    *(
        count(
            when(col(name).isNull() | isnan(col(name)), name)
        ).alias(name)
        for name in df.columns
    )
)

null_counts.show()


+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+
|Time| V1| V2| V3| V4| V5| V6| V7| V8| V9|V10|V11|V12|V13|V14|V15|V16|V17|V18|V19|V20|V21|V22|V23|V24|V25|V26|V27|V28|Amount|Class|
+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+
|   0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|     0|    0|
+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+



In [9]:
from pyspark.sql.types import IntegerType, DoubleType

# Normalize column types in ONE projection.
# Calling withColumn() in a loop adds a projection per call, which inflates
# the query plan (the PySpark docs recommend a single select() instead).

# Every column except the label is treated as a numeric feature.
numeric_cols = [c for c in df.columns if c != "Class"]

# Cast the label "Class" to integer and all other columns to double (just in
# case inference picked something else). Iterating df.columns preserves the
# original column order, and alias() preserves each column name.
df_clean = df.select(
    [
        col(name).cast(IntegerType() if name == "Class" else DoubleType()).alias(name)
        for name in df.columns
    ]
)

df_clean.printSchema()
df_clean.show(5)


root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (nulla

In [10]:
# Class balance after the type casts (number of rows per value of "Class").
# groupBy().count() does not guarantee any row order, so sort by the label
# to make the displayed table reproducible across runs.
df_clean.groupBy("Class").count().orderBy("Class").show()


+-----+------+
|Class| count|
+-----+------+
|    1|   492|
|    0|284315|
+-----+------+



In [12]:
# Final sanity check on the cleaned frame: row count plus a small preview.
clean_row_count = df_clean.count()
print("Número de filas en df_clean:", clean_row_count)
df_clean.show(5)

# Reminder for the reader: notebook 02 repeats the same load and clean steps.
print("Preprocesado completado. Usa el mismo código de carga y limpieza en el notebook 02.")



Número de filas en df_clean: 284807
+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|          