In [0]:
!pip install Faker

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType
from faker import Faker
import random

# Create (or reuse) the Spark session used by every cell in this notebook
spark = SparkSession.builder.appName("FakerComplexData").getOrCreate()

# Seed both random sources so that "Restart & Run All" regenerates the same
# fake dataset instead of different values on every run.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

fake = Faker("es_ES")

# Build 10 customer records with a nested address, a phone list and purchases
data = []
for i in range(10):
    direccion = {
        "calle": fake.street_name(),
        "ciudad": fake.city(),
        "pais": fake.country()
    }

    # One or two "base" numbers that may later be repeated in the phone list
    base_phones = [fake.phone_number() for _ in range(random.randint(1, 2))]

    # Phone list of 1-5 entries; each entry is a repeated base number,
    # a null, or a brand-new number, so the array contains duplicates and nulls
    telefonos = []
    for _ in range(random.randint(1, 5)):
        choice = random.choice(["repeat", "null", "new"])
        if choice == "repeat":
            telefonos.append(random.choice(base_phones))
        elif choice == "null":
            telefonos.append(None)
        else:
            telefonos.append(fake.phone_number())

    # Purchases: 1-4 items, each with a product word and a price in [10, 500]
    compras = [
        {"producto": fake.word(), "precio": round(random.uniform(10, 500), 2)}
        for _ in range(random.randint(1, 4))
    ]

    data.append({
        "id": i + 1,
        "nombre": fake.name(),
        "edad": random.randint(18, 70),
        "direccion": direccion,
        "telefonos": telefonos,
        "compras": compras
    })

# Explicit DataFrame schema: `direccion` is a struct, `telefonos` an array of
# strings and `compras` an array of structs
schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("nombre", StringType(), True),
    StructField("edad", IntegerType(), True),
    StructField("direccion", StructType([
        StructField("calle", StringType(), True),
        StructField("ciudad", StringType(), True),
        StructField("pais", StringType(), True)
    ])),
    StructField("telefonos", ArrayType(StringType())),
    StructField("compras", ArrayType(
        StructType([
            StructField("producto", StringType(), True),
            StructField("precio", DoubleType(), True)
        ])
    ))
])

# Create the DataFrame
df = spark.createDataFrame(data, schema=schema)

# Expose it to SQL as the temporary view `customer_shop`
df.createOrReplaceTempView("customer_shop")

In [0]:
# Two snapshots of the same five order ids; the rows for ids 4 and 5 differ
# between the old and the new snapshot.
_order_columns = ("_id", "product", "quantity", "price")

old_orders = [
    dict(zip(_order_columns, _row))
    for _row in (
        (1, "Televisor", 1, 345.32),
        (2, "Refrigerador", 1, 533.01),
        (3, "Sofa", 2, 200.78),
        (4, "Silla", 4, 20.44),
        (5, "Mesa", 1, 120.99),
    )
]
new_orders = [
    dict(zip(_order_columns, _row))
    for _row in (
        (1, "Televisor", 1, 345.32),
        (2, "Refrigerador", 1, 533.01),
        (3, "Sofa", 2, 200.78),
        (4, "Silla de playa", 3, 35.44),
        (5, "Sombrilla", 1, 10.99),
    )
]

# Drop any previous table, then save each snapshot as a table of the same name
for _table_name, _rows in (("old_orders", old_orders), ("new_orders", new_orders)):
    spark.sql("DROP TABLE IF EXISTS " + _table_name)
    spark.createDataFrame(_rows).write.mode("overwrite").saveAsTable(_table_name)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from faker import Faker
import random
fake = Faker("es_ES")
productos = ["Laptop", "Smartphone", "Auriculares", "Teclado", "Monitor"]
dias_semana = ["1_LUN", "2_MAR", "3_MIE", "4_JUE", "5_VIE", "6_SAB", "7_DOM"]


def _random_sale():
    """Return one (weekday, product, sale amount) row.

    The product is drawn first, then the weekday, then an amount in
    [50, 1500] rounded to two decimals.
    """
    picked_product = random.choice(productos)
    picked_day = random.choice(dias_semana)
    amount = round(random.uniform(50, 1500), 2)
    return (picked_day, picked_product, amount)


# 100 random sales rows
data = [_random_sale() for _ in range(100)]

# Schema: two nullable string columns and one nullable double column
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("dia_semana", StringType()),
        ("producto", StringType()),
        ("valor_venta", DoubleType()),
    )
])

df_ventas = spark.createDataFrame(data, schema=schema)

# Save as the table `ventas` (not a temporary view), replacing any previous one
spark.sql("DROP TABLE IF EXISTS ventas")
df_ventas.write.mode("overwrite").saveAsTable("ventas")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from faker import Faker
import random

fake = Faker("es_ES")

# Ten fake rows: (region, product, January sales, February sales, March sales)
_product_options = ["Laptop", "Smartphone", "Monitor", "Auriculares", "Teclado"]
data = []
for _ in range(10):
    city_name = fake.city()
    product_name = random.choice(_product_options)
    # Three independent monthly figures in [1000, 5000], two decimals each
    monthly_sales = [round(random.uniform(1000, 5000), 2) for _ in range(3)]
    data.append((city_name, product_name, *monthly_sales))

# Schema: two string columns followed by one double column per month
schema = StructType(
    [StructField(name, StringType(), True) for name in ("region", "producto")]
    + [
        StructField(name, DoubleType(), True)
        for name in ("ventas_enero", "ventas_febrero", "ventas_marzo")
    ]
)

# Build the DataFrame and save it as the table `sales_month`
df = spark.createDataFrame(data, schema=schema)
df.write.mode("overwrite").saveAsTable("sales_month")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from faker import Faker
import random
import re
import unicodedata


def _email_token(text):
    """Return ``text`` as a lowercase ASCII token usable in an email local part.

    Accents are stripped (``"García"`` -> ``"garcia"``) and every run of
    remaining non-alphanumeric characters, such as the space in a compound
    name, becomes a single dot (``"José Luis"`` -> ``"jose.luis"``).
    """
    ascii_text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
    return re.sub(r"[^a-z0-9]+", ".", ascii_text.lower()).strip(".")


# Initialise Faker
fake = Faker("es_ES")
dominios = ["gmail.com", "outlook.com", "yahoo.com", "hotmail.com", "icloud.com", "company.com", "fundacion.org", "universidad.edu", "banco.fin"]
# === Generate fake data ===
data = []
for i in range(1, 21):
    nombre = fake.first_name()
    apellido = fake.last_name()

    # Build the email from the name and a randomly chosen domain. The name
    # parts are normalised first because generated Spanish names may contain
    # accented letters or spaces, which would yield a malformed address.
    dominio = random.choice(dominios)
    correo = f"{_email_token(nombre)}.{_email_token(apellido)}@{dominio}"

    # Address as a struct (nested dict)
    direccion = {
        "calle": fake.street_address(),
        "ciudad": fake.city(),
        "pais": fake.country()
    }

    data.append({
        "customer_id": str(i),
        "nombre": nombre,
        "apellido": apellido,
        "email": correo,
        "telefono": fake.phone_number(),
        "empresa": fake.company(),
        "cargo": random.choice(["Gerente", "Analista", "Director", "Vendedor", "Asistente"]),
        "direccion": direccion,
        "fecha_registro": str(fake.date_between(start_date='-2y', end_date='today'))
    })

# === Define schema ===
# Every column is a string; `direccion` is a struct of three strings and
# `fecha_registro` holds the date rendered with str().
schema = StructType([
    StructField("customer_id", StringType(), False),
    StructField("nombre", StringType(), True),
    StructField("apellido", StringType(), True),
    StructField("email", StringType(), True),
    StructField("telefono", StringType(), True),
    StructField("empresa", StringType(), True),
    StructField("cargo", StringType(), True),
    StructField("direccion", StructType([
        StructField("calle", StringType(), True),
        StructField("ciudad", StringType(), True),
        StructField("pais", StringType(), True)
    ])),
    StructField("fecha_registro", StringType(), True)
])

# === Create DataFrame ===
customers = spark.createDataFrame(data, schema=schema)

# Save as the table `customers` (not a temporary view), replacing any previous one
spark.sql("DROP TABLE IF EXISTS customers")
customers.write.mode("overwrite").saveAsTable("customers")



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType, DateType
from faker import Faker
import random
import re
import unicodedata
from datetime import datetime

fake = Faker("es_ES")


def _email_token(text):
    """Return ``text`` as a lowercase ASCII token usable in an email local part.

    Accents are stripped (``"García"`` -> ``"garcia"``) and every run of
    remaining non-alphanumeric characters, such as the space in a compound
    name, becomes a single dot (``"José Luis"`` -> ``"jose.luis"``).
    """
    ascii_text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
    return re.sub(r"[^a-z0-9]+", ".", ascii_text.lower()).strip(".")


# =======================
# 1. DATAFRAME: CUSTOMERS
# =======================
customers_data = []
for i in range(1, 11):  # 10 customers
    customers_data.append({
        "customer_id": str(i),
        # Name parts are normalised because generated Spanish names may contain
        # accented letters or spaces, which would yield a malformed address.
        "email": f"{_email_token(fake.first_name())}.{_email_token(fake.last_name())}@{random.choice(['gmail.com','yahoo.com','outlook.com'])}",
        "profile": random.choice(["Regular", "Premium", "Gold"]),
        "updated": str(fake.date_between(start_date='-1y', end_date='today'))
    })

# All columns are strings; `updated` holds the date rendered with str()
customers_schema = StructType([
    StructField("customer_id", StringType(), False),
    StructField("email", StringType(), True),
    StructField("profile", StringType(), True),
    StructField("updated", StringType(), True)
])

customers = spark.createDataFrame(customers_data, schema=customers_schema)

# =======================
# 2. DATAFRAME: BOOKS
# =======================
categorias = ["Ficción", "Ciencia", "Historia", "Romance", "Fantasía", "Autoayuda"]

# 15 books: a three-word sentence with dots removed as title, a fake author
# name, a random category and a price in [10, 100] with two decimals
books_data = [
    {
        "book_id": str(book_no),
        "title": fake.sentence(nb_words=3).replace(".", ""),
        "author": f"{fake.first_name()} {fake.last_name()}",
        "category": random.choice(categorias),
        "price": round(random.uniform(10, 100), 2),
    }
    for book_no in range(1, 16)
]

# `book_id` is a required string, the three descriptive columns are nullable
# strings and `price` is a nullable double
books_schema = StructType([
    StructField("book_id", StringType(), False),
    *[StructField(column, StringType(), True) for column in ("title", "author", "category")],
    StructField("price", DoubleType(), True),
])

books = spark.createDataFrame(books_data, schema=books_schema)

# =======================
# 3. DATAFRAME: ORDERS
# =======================
orders_data = []
for order_no in range(1, 21):  # 20 orders
    # Pick the buyer first, then 1-3 distinct books for this order
    buyer = random.choice(customers_data)
    picked_books = random.sample(books_data, random.randint(1, 3))

    orders_data.append({
        "order_id": str(order_no),
        "order_date": str(fake.date_between(start_date='-6M', end_date='today')),
        "customer_id": buyer["customer_id"],
        # One unit per selected book, so quantity equals the number of books
        "quantity": len(picked_books),
        "total": round(sum(book["price"] for book in picked_books), 2),
        "books": [book["book_id"] for book in picked_books],
    })

# `books` is an array of book ids; `order_date` holds the date rendered with str()
orders_schema = StructType([
    StructField(column, dtype, nullable)
    for column, dtype, nullable in (
        ("order_id", StringType(), False),
        ("order_date", StringType(), True),
        ("customer_id", StringType(), True),
        ("quantity", IntegerType(), True),
        ("total", DoubleType(), True),
        ("books", ArrayType(StringType()), True),
    )
])

orders = spark.createDataFrame(orders_data, schema=orders_schema)


# Save the three DataFrames as tables (not temporary views), overwriting any
# existing table of the same name
for _table_name, _frame in (("customer", customers), ("book", books), ("order", orders)):
    _frame.write.mode("overwrite").saveAsTable(_table_name)