In [0]:
!pip install faker

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from faker import Faker
import pandas as pd
import random

# =====================
# Parameters
# =====================
NUM_CLIENTES = 3000
NUM_FACTURAS = 1000000
# NOTE(review): TABLA_CLIENTES is not referenced anywhere in this cell; only the
# joined result is persisted (to TABLA_FACTURACION). Confirm whether the customer
# table was meant to be saved as well.
TABLA_CLIENTES = "clientes"
TABLA_FACTURACION = "facturacion"

# Inclusive range from which customer IDs are drawn.
ID_MIN, ID_MAX = 1000, 9999
# Categories an invoice can belong to (hoisted out of the generation loop).
CATEGORIAS = ["Electrónica", "Moda", "Hogar", "Deportes", "Libros"]

# =====================
# Spark session
# =====================
# Reuse the active session if the platform already provides one, otherwise
# create it, so this cell does not depend on an implicitly injected `spark`.
spark = SparkSession.builder.getOrCreate()

# =====================
# Initialise Faker and seed every random source
# =====================
fake = Faker()
Faker.seed(42)
# Faker.seed only seeds Faker's own generator; the stdlib `random` module used
# below for customer/category/amount selection must be seeded separately for
# the generated data to be reproducible.
random.seed(42)

# =====================
# Build the customers DataFrame
# =====================
if NUM_CLIENTES > ID_MAX - ID_MIN + 1:
    # Fail early with a clear message instead of letting `fake.unique`
    # exhaust its retries when there are not enough distinct IDs.
    raise ValueError(
        f"NUM_CLIENTES={NUM_CLIENTES} exceeds the {ID_MAX - ID_MIN + 1} "
        f"distinct IDs available in [{ID_MIN}, {ID_MAX}]"
    )

# `fake.unique` already guarantees distinct values, so exactly NUM_CLIENTES
# calls are needed.
cliente_ids = {
    fake.unique.random_int(min=ID_MIN, max=ID_MAX) for _ in range(NUM_CLIENTES)
}
# Sorted list: gives a stable, explicit ordering for name assignment and for
# random.choice below.
cliente_id_list = sorted(cliente_ids)

clientes_data = [(cid, fake.name()) for cid in cliente_id_list]

clientes_df = pd.DataFrame(clientes_data, columns=["cliente_id", "cliente_nombre"])

# =====================
# Build the invoicing DataFrame
# =====================
# One tuple per invoice: (customer id, category, invoice date, amount).
# Tuple elements are evaluated left to right, so the order in which the random
# sources are consumed is fixed.
facturacion_data = [
    (
        random.choice(cliente_id_list),
        random.choice(CATEGORIAS),
        fake.date_between(start_date='-3y', end_date='today'),
        round(random.uniform(10, 1000), 2),
    )
    for _ in range(NUM_FACTURAS)
]

facturacion_df = pd.DataFrame(facturacion_data, columns=["cliente_id", "categoria", "fecha_factura", "monto"])

# =====================
# Convert to Spark DataFrames
# =====================
df_clientes = spark.createDataFrame(clientes_df)
df_facturacion = spark.createDataFrame(facturacion_df)

# Attach the customer name to every invoice and persist the denormalised result,
# replacing any previous contents of the table.
df = df_clientes.join(df_facturacion, on="cliente_id", how="inner")
df.write.mode("overwrite").saveAsTable(TABLA_FACTURACION)

In [0]:
from pyspark.sql import SparkSession
from faker import Faker
import pandas as pd
import random

# =====================
# Parameters
# =====================
NUM_REGISTROS = 100000
# Probability that each optional field is replaced by a missing/empty value.
PROB_FALTANTE = 0.15
# Number of already generated rows appended again as duplicates.
NUM_DUPLICADOS = 5

# =====================
# Spark session
# =====================
# Reuse the active session if the platform already provides one, otherwise
# create it, so this cell does not depend on an implicitly injected `spark`.
spark = SparkSession.builder.getOrCreate()

# =====================
# Initialise Faker and seed every random source
# =====================
faker = Faker()
Faker.seed(123)
# Faker.seed only seeds Faker's own generator; the stdlib `random` module
# decides which fields go missing and which rows are duplicated, so it must be
# seeded too for the dataset to be reproducible.
random.seed(123)

# =====================
# Generate records with data-quality defects
# =====================
data = []
for _ in range(NUM_REGISTROS):
    nombre = faker.name()
    # With probability PROB_FALTANTE the e-mail is null.
    email = faker.email() if random.random() > PROB_FALTANTE else None
    # With probability PROB_FALTANTE the phone is an empty string (not null).
    telefono = faker.phone_number() if random.random() > PROB_FALTANTE else ""
    # With probability PROB_FALTANTE the address is null; multi-line addresses
    # are flattened onto a single line.
    direccion = faker.address().replace("\n", ", ") if random.random() > PROB_FALTANTE else None
    data.append((nombre, email, telefono, direccion))

# Append duplicated rows. random.choices samples with replacement and raises
# on an empty sequence, hence the guard for NUM_REGISTROS == 0.
if data:
    data += random.choices(data, k=NUM_DUPLICADOS)

# =====================
# Build the pandas DataFrame, then the Spark DataFrame
# =====================
pdf = pd.DataFrame(data, columns=["nombre", "email", "telefono", "direccion"])
df = spark.createDataFrame(pdf)

# =====================
# Persist the result
# =====================
# Saved as a managed table, replacing any previous contents.
df.write.mode("overwrite").saveAsTable("cliente")