In [0]:
# Declare the "volume_path" text widget with an empty default value.
dbutils.widgets.text("volume_path", "")


In [0]:
# Read the target volume path from the notebook widget. Surrounding whitespace
# is dropped so a blank-looking value cannot slip through and be used as a path.
volume_path = dbutils.widgets.get("volume_path").strip()
if not volume_path:
    # ValueError is still an Exception, so existing broad handlers keep working.
    raise ValueError("Please provide a volume path")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType, DateType
from faker import Faker
import random
from datetime import datetime
import uuid

In [0]:

def load_new_data(min_orders: int = 1, max_orders: int = 5) -> None:
    """Generate a random batch of synthetic book orders and append it to the volume.

    Ten customers and fifteen books are fabricated in memory with Faker
    (``es_ES`` locale). They act only as lookup records for the orders
    (customer IDs, book IDs and prices) and are not persisted. The orders
    are appended in Parquet format, as a single partition, under
    ``{volume_path}/orders``.

    Relies on the notebook globals ``spark`` and ``volume_path``.

    Args:
        min_orders: Smallest number of orders to generate (inclusive).
        max_orders: Largest number of orders to generate (inclusive).

    Raises:
        ValueError: If ``max_orders`` is smaller than ``min_orders``
            (raised by ``random.randint``).
    """
    fake = Faker("es_ES")

    # ---- Customers (10 records) -------------------------------------------
    # Only "customer_id" is consumed by the orders below.
    email_domains = ['gmail.com', 'yahoo.com', 'outlook.com']
    customers_data = [
        {
            "customer_id": str(i),
            "email": f"{fake.first_name().lower()}.{fake.last_name().lower()}@{random.choice(email_domains)}",
            "profile": random.choice(["Regular", "Premium", "Gold"]),
            "updated": str(fake.date_between(start_date='-1y', end_date='today')),
        }
        for i in range(1, 11)
    ]

    # ---- Books (15 records) -----------------------------------------------
    # "book_id" and "price" feed the order lines and totals.
    categories = ["Ficción", "Ciencia", "Historia", "Romance", "Fantasía", "Autoayuda"]
    books_data = [
        {
            "book_id": str(i),
            "title": fake.sentence(nb_words=3).replace(".", ""),
            "author": f"{fake.first_name()} {fake.last_name()}",
            "category": random.choice(categories),
            "price": round(random.uniform(10, 100), 2),
        }
        for i in range(1, 16)
    ]

    # ---- Orders -------------------------------------------------------------
    # Iterate exactly `order_count` times. The previous range(1, n) produced
    # n - 1 orders, so a draw of 1 appended an empty batch.
    order_count = random.randint(min_orders, max_orders)
    orders_data = []
    for _ in range(order_count):
        customer = random.choice(customers_data)
        # Each order holds 1-3 distinct books (sample draws without replacement).
        selected_books = random.sample(books_data, random.randint(1, 3))

        orders_data.append({
            "order_id": str(uuid.uuid4()),
            "order_date": str(fake.date_between(start_date='-6M', end_date='today')),
            "customer_id": customer["customer_id"],
            "quantity": len(selected_books),
            "total": round(sum(b["price"] for b in selected_books), 2),
            "books": [b["book_id"] for b in selected_books],
        })

    orders_schema = StructType([
        StructField("order_id", StringType(), False),
        StructField("order_date", StringType(), True),
        StructField("customer_id", StringType(), True),
        StructField("quantity", IntegerType(), True),
        StructField("total", DoubleType(), True),
        StructField("books", ArrayType(StringType()), True)
    ])

    orders = spark.createDataFrame(orders_data, schema=orders_schema)

    # One partition per batch keeps each run's output to a single data file.
    orders.repartition(1).write.mode("append").format("parquet").save(f"{volume_path}/orders")


In [0]:
# Generate one batch of synthetic orders and append it under the configured volume path.
load_new_data()