# NOTA
Este script debe ejecutarse en un cluster All-purpose o Job Cluster

In [0]:
!pip install Faker

In [0]:
%pip install mysql-connector-python

In [0]:
# Declare the notebook parameters (text widgets) together with their defaults.
# NOTE(review): db_password travels through a plain text widget and the default
# JDBC URL sets useSSL=false; consider a secret scope / TLS if this is ever
# pointed at a non-demo database.
for _widget, _default in (
    ("jdbc_url", "jdbc:mysql://www.bigdataybi.com/farmafake?useSSL=false&allowPublicKeyRetrieval=true"),
    ("db_user", ""),
    ("db_password", ""),
):
    dbutils.widgets.text(_widget, _default)

# Read the current widget values into module-level variables.
jdbc_url, db_user, db_password = (
    dbutils.widgets.get(_widget) for _widget in ("jdbc_url", "db_user", "db_password")
)

In [0]:
from faker import Faker
import random
import uuid
from pyspark.sql.types import (
    StructType, StructField, IntegerType, StringType,
    DoubleType, TimestampType, ArrayType
)
from pyspark.sql import functions as F

def generar_clientes(num_registros:int, fake:Faker):
    """Generate a Spark DataFrame of synthetic customers.

    Some fields are deliberately dirty so downstream data-quality steps have
    something to clean:

    * ``customer_number``: ~80% a 10-digit cédula-style number, otherwise a
      13-digit RUC-style number (cédula + ``"001"``). No check digit is
      computed.
    * ``email``: ~20% invalid, split evenly between ``None`` and a bare word.
    * ``telephone``: ~30% ``None``, otherwise ``"09"`` plus 8 digits.
    * ``date_birthday``: ~5% ``None``, otherwise an ISO date string for an
      age between 18 and 80.

    Args:
        num_registros: Number of customers to generate; ``customer_id`` runs
            from 1 to ``num_registros``. Zero yields an empty DataFrame.
        fake: Faker instance used for names, emails, words and birth dates.

    Returns:
        A DataFrame with one row per customer.
    """
    def generar_cedula():
        # First two digits: province code (01-24); the remainder is random.
        # The concatenation is 11 characters long, so trim it to 10.
        provincia = random.randint(1, 24)
        cedula = f"{provincia:02d}{random.randint(100000000, 999999999)}"
        return cedula[:10]

    def generar_ruc():
        # RUC-style identifier: a cédula followed by the "001" suffix.
        return generar_cedula() + "001"

    clientes = []
    for i in range(1, num_registros + 1):
        nombre = fake.name()
        genero = random.choice(['masculino', 'femenino'])

        # customer_number: 80% cédula, 20% RUC
        if random.random() < 0.8:
            customer_number = generar_cedula()
        else:
            customer_number = generar_ruc()

        # Email: 20% invalid (half of those null, half a bare word)
        if random.random() < 0.2:
            email = None if random.random() < 0.5 else fake.word()
        else:
            email = fake.email()

        # Telephone: 30% null
        if random.random() < 0.3:
            telefono = None
        else:
            telefono = f'09{random.randint(10000000, 99999999)}'

        # Birth date: 5% null
        if random.random() < 0.05:
            fecha_nacimiento = None
        else:
            fecha_nacimiento = fake.date_of_birth(minimum_age=18, maximum_age=80).isoformat()

        clientes.append({
            'customer_id': i,
            'customer_name': nombre,
            'email': email,
            'telephone': telefono,
            'genero': genero,
            'date_birthday': fecha_nacimiento,
            'customer_number': customer_number
        })

    # Explicit schema instead of inference: inference raises when the list is
    # empty or when a nullable column happens to be None in every row (likely
    # for small num_registros). Columns are listed alphabetically and
    # customer_id is BIGINT to match what inference from dicts produced
    # (PySpark sorts dict keys and maps Python int to LongType), so the
    # resulting table layout should be unchanged.
    schema = (
        "customer_id BIGINT, customer_name STRING, customer_number STRING, "
        "date_birthday STRING, email STRING, genero STRING, telephone STRING"
    )
    return spark.createDataFrame(clientes, schema=schema)


def generar_tienda(fake:Faker):
    """Build the store DataFrame: three fixed locations with random names.

    Args:
        fake: Faker instance used to produce each store's company name.

    Returns:
        A DataFrame with ``store_id`` (1..3), ``store_name``, ``store_lat``
        and ``store_lon``.
    """
    # (latitude, longitude) of each store; ids are assigned in this order.
    ubicaciones = (
        (-0.19083301189289498, -78.4684678293547),
        (-0.9743910742811162, -80.6720813210836),
        (-1.0036132054724964, -80.62787944051743),
    )
    filas = [
        {
            "store_id": indice,
            "store_name": f"{fake.company()}",
            "store_lat": latitud,
            "store_lon": longitud,
        }
        for indice, (latitud, longitud) in enumerate(ubicaciones, start=1)
    ]
    return spark.createDataFrame(filas)


def generar_productos(num_registros:int, fake:Faker):
    """Generate a Spark DataFrame of synthetic pharmacy products.

    Args:
        num_registros: Number of products; ``product_id`` runs from 1 to
            ``num_registros``.
        fake: Faker instance used for the model code and the unique word
            that prefixes each product name.

    Returns:
        A DataFrame with ``product_id``, ``product_name``, ``category``,
        ``brand``, ``model``, ``cost_unit`` and ``discount``.
    """
    categorias = (
        'Analgésicos', 'Antibióticos', 'Antiinflamatorios', 'Antialérgicos',
        'Antigripales', 'Antisépticos', 'Vitaminas', 'Antipiréticos',
    )
    marcas = ('Pfizer', 'Bayer', 'Novartis', 'Roche', 'Sanofi', 'GSK', 'Merck', 'Abbott')

    def _nuevo_producto(identificador):
        # Draw order (category, brand, model, name, cost, discount) is kept
        # fixed so a seeded run yields the same rows.
        categoria = random.choice(categorias)
        marca = random.choice(marcas)
        modelo = fake.bothify(text='MOD-####')
        nombre = f"{fake.unique.word().capitalize()} {categoria}"
        costo = round(random.uniform(2, 100), 2)
        # Discount is a fraction, up to 0.30 (30%).
        rebaja = round(random.uniform(0, 0.3), 2)
        return {
            'product_id': identificador,
            'product_name': nombre,
            'category': categoria,
            'brand': marca,
            'model': modelo,
            'cost_unit': costo,
            'discount': rebaja,
        }

    filas = [_nuevo_producto(n) for n in range(1, num_registros + 1)]
    return spark.createDataFrame(filas)


def generar_factura(num_registros: int, fake: Faker, customer_df, product_df, store_df):
    """Generate synthetic invoices, each carrying a nested list of detail lines.

    Args:
        num_registros: Number of invoices to generate.
        fake: Faker instance used for the document timestamp.
        customer_df: DataFrame providing ``customer_id`` values to pick from.
        product_df: DataFrame providing ``product_id`` / ``cost_unit`` pairs.
        store_df: DataFrame providing ``store_id`` values to pick from.

    Returns:
        A DataFrame with one row per invoice: header columns plus ``details``,
        an array of structs (``detail_id``, ``product_id``, ``quantity``,
        ``unit_price``, ``discount_percent``).
    """
    # Explicit schemas; doc_id and detail_id are UUID strings.
    esquema_linea = StructType([
        StructField('detail_id', StringType(), True),
        StructField('product_id', IntegerType(), True),
        StructField('quantity', IntegerType(), True),
        StructField('unit_price', DoubleType(), True),
        StructField('discount_percent', DoubleType(), True),
    ])
    esquema_factura = StructType([
        StructField('doc_id', StringType(), True),
        StructField('doc_code', StringType(), True),
        StructField('doc_type', StringType(), True),
        StructField('store_id', IntegerType(), True),
        StructField('customer_id', IntegerType(), True),
        StructField('doc_subtotal', DoubleType(), True),
        StructField('doc_total', DoubleType(), True),
        StructField('doc_discount', DoubleType(), True),
        StructField('doc_date', TimestampType(), True),
        StructField('doc_state', StringType(), True),
        StructField('details', ArrayType(esquema_linea), True),
    ])

    # Pull the candidate keys to the driver so they can be sampled locally.
    ids_clientes = [fila['customer_id'] for fila in customer_df.select('customer_id').collect()]
    ids_tiendas = [fila['store_id'] for fila in store_df.select('store_id').collect()]
    catalogo = [
        (fila['product_id'], fila['cost_unit'])
        for fila in product_df.select('product_id', 'cost_unit').collect()
    ]

    filas = []
    for _ in range(num_registros):
        id_documento = str(uuid.uuid4())

        # Document type: 90% invoice ('FA'), 10% 'NC'.
        if random.random() < 0.9:
            tipo = 'FA'
        else:
            tipo = 'NC'
        codigo = f"001-010-{random.randint(1, 999999999):09d}"
        cliente = random.choice(ids_clientes)

        # About 2% of documents deliberately have no store.
        if random.random() < 0.02:
            tienda = None
        else:
            tienda = random.choice(ids_tiendas)
        fecha = fake.date_time_between(start_date='-2y', end_date='now')
        estado = random.choice(['A', 'I'])

        # Between 1 and 5 detail lines per document.
        lineas = []
        bruto = 0.0
        rebaja = 0.0
        for _linea in range(random.randint(1, 5)):
            id_producto, costo = random.choice(catalogo)
            cantidad = random.randint(1, 10)
            # Selling price is the unit cost plus a 5-15% markup.
            precio = round(costo * random.uniform(1.05, 1.15), 2)
            porcentaje = round(random.uniform(0, 12), 2)
            importe = cantidad * precio
            rebaja += importe * (porcentaje / 100)
            bruto += importe
            lineas.append({
                'detail_id': str(uuid.uuid4()),
                'product_id': id_producto,
                'quantity': cantidad,
                'unit_price': precio,
                'discount_percent': porcentaje,
            })

        # Header totals: total = rounded subtotal - rounded discount.
        subtotal_doc = round(bruto, 2)
        descuento_doc = round(rebaja, 2)
        total_doc = round(subtotal_doc - descuento_doc, 2)

        filas.append({
            'doc_id': id_documento,
            'doc_code': codigo,
            'doc_type': tipo,
            'store_id': tienda,
            'customer_id': cliente,
            'doc_subtotal': subtotal_doc,
            'doc_total': total_doc,
            'doc_discount': descuento_doc,
            'doc_date': fecha,
            'doc_state': estado,
            'details': lineas,
        })

    return spark.createDataFrame(filas, schema=esquema_factura)


def load_full(jdbc_url:str, db_user:str, db_password:str, fake:Faker, customer_df, store_df, products_df, num_facturas:int=10):
    """Perform the initial full load of every table over JDBC.

    Writes ``customers``, ``stores``, ``products``, ``invoice_header`` and
    ``invoice_details`` with ``mode='overwrite'``, so any existing contents of
    those tables are replaced.

    Args:
        jdbc_url: JDBC connection URL of the target database.
        db_user: Database user.
        db_password: Database password.
        fake: Faker instance forwarded to ``generar_factura``.
        customer_df: Customers DataFrame (written as-is, also sampled for invoices).
        store_df: Stores DataFrame (written as-is, also sampled for invoices).
        products_df: Products DataFrame (written as-is, also sampled for invoices).
        num_facturas: Number of invoices to generate for the initial load.
            Defaults to 10, the value that was previously hard-coded.
    """
    # One shared credentials dict instead of rebuilding it for every write.
    propiedades = {'user': db_user, 'password': db_password}
    # Load timestamp, shifted from UTC to the America/Guayaquil zone.
    writetime = F.from_utc_timestamp(F.current_timestamp(), 'America/Guayaquil')

    def _sobrescribir(df, tabla):
        # Replace the target table with the contents of df.
        df.write.jdbc(url=jdbc_url, table=tabla, mode='overwrite', properties=propiedades)

    invoice_df = generar_factura(num_facturas, fake, customer_df, products_df, store_df)

    print("[INFO] Writing customers")
    _sobrescribir(customer_df, 'customers')
    print("[INFO] Writing stores")
    _sobrescribir(store_df, 'stores')
    print("[INFO] Writing products")
    _sobrescribir(products_df, 'products')

    print("[INFO] Writing invoices header")
    # Header: every invoice column except the nested details array.
    header_df = invoice_df.select(
        'doc_id', 'doc_code', 'doc_type', 'store_id', 'customer_id',
        'doc_subtotal', 'doc_total', 'doc_discount', 'doc_date', 'doc_state',
        writetime.alias("_writetime")
    )
    _sobrescribir(header_df, 'invoice_header')

    print("[INFO] Writing invoice details")
    # Details: one row per array element, flattened and keyed by doc_id.
    details_df = (
        invoice_df
        .select('doc_id', F.explode('details').alias('details'))
        .select('doc_id', 'details.*')
        .withColumn("_writetime", writetime)
    )
    _sobrescribir(details_df, 'invoice_details')
    

def load_incremental(jdbc_url: str, db_user: str, db_password: str, fake:Faker, customer_df, product_df, store_df):
    """Append between 1 and 5 freshly generated invoices to the database.

    Rows are written with ``mode='append'`` to ``invoice_header`` and
    ``invoice_details``; existing rows are left in place.

    Note the DataFrame argument order here is (customer, product, store).

    Args:
        jdbc_url: JDBC connection URL of the target database.
        db_user: Database user.
        db_password: Database password.
        fake: Faker instance forwarded to ``generar_factura``. If ``None``,
            a new ``Faker('es_ES')`` is created.
        customer_df: DataFrame providing the customer ids to sample.
        product_df: DataFrame providing product ids and unit costs to sample.
        store_df: DataFrame providing the store ids to sample.
    """
    # Honour the caller's Faker instance. It used to be unconditionally
    # replaced, which silently ignored the argument (and any seeding on it).
    if fake is None:
        fake = Faker('es_ES')

    propiedades = {'user': db_user, 'password': db_password}
    # Load timestamp, shifted from UTC to the America/Guayaquil zone.
    writetime = F.from_utc_timestamp(F.current_timestamp(), 'America/Guayaquil')

    # Generate between 1 and 5 new invoices.
    n_nuevas = random.randint(1, 5)
    print(f"[INFO] Generando {n_nuevas} nuevas facturas incrementales...")
    invoice_df = generar_factura(n_nuevas, fake, customer_df, product_df, store_df)

    # Header rows. doc_id is already a string column in the generated
    # DataFrame, so no cast is needed.
    header_df = invoice_df.select(
        'doc_id', 'doc_code', 'doc_type', 'store_id', 'customer_id',
        'doc_subtotal', 'doc_total', 'doc_discount', 'doc_date', 'doc_state',
        writetime.alias("_writetime")
    )

    # Detail rows: one per element of the nested details array.
    details_df = invoice_df.select(
        'doc_id',
        F.explode('details').alias('details')
    ).select(
        'doc_id',
        'details.detail_id',
        'details.product_id',
        'details.quantity',
        'details.unit_price',
        'details.discount_percent',
        writetime.alias("_writetime")
    )

    # Append mode: insert without removing existing rows.
    print("[INFO] Insertando nuevas filas en invoice_header...")
    header_df.write.jdbc(
        url=jdbc_url, table='invoice_header', mode='append',
        properties=propiedades
    )
    print("[INFO] Insertando nuevas filas en invoice_details...")
    details_df.write.jdbc(
        url=jdbc_url, table='invoice_details', mode='append',
        properties=propiedades
    )
    print(f"[SUCCESS] Se insertaron {n_nuevas} facturas nuevas correctamente.")

# Carga de datos
La función load_full hace la primera carga de datos a la base, creando las tablas y estructuras.

In [0]:
# Re-read the connection settings so edits made to the widgets are picked up.
jdbc_url, db_user, db_password = (
    dbutils.widgets.get(nombre) for nombre in ("jdbc_url", "db_user", "db_password")
)

# Build the dimension data with a Spanish-locale Faker, then run the full load.
fake = Faker('es_ES')
customer_df = generar_clientes(num_registros=100, fake=fake)
store_df = generar_tienda(fake=fake)
products_df = generar_productos(num_registros=50, fake=fake)
load_full(
    jdbc_url=jdbc_url,
    db_user=db_user,
    db_password=db_password,
    fake=fake,
    customer_df=customer_df,
    store_df=store_df,
    products_df=products_df,
)

La función load_incremental inserta nuevos registros cada vez que se ejecuta

In [0]:
# Keyword arguments make the (customer, product, store) order explicit; it
# differs from load_full's (customer, store, products).
load_incremental(
    jdbc_url=jdbc_url,
    db_user=db_user,
    db_password=db_password,
    fake=fake,
    customer_df=customer_df,
    product_df=products_df,
    store_df=store_df,
)