In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

spark = SparkSession.builder \
    .appName("PracticeJoins") \
    .master("local[*]") \
    .config("spark.driver.host", "localhost") \
    .getOrCreate()

In [5]:
# Sample data creation
customers_data = [
    (1, "John Doe", "john@email.com", "New York"),
    (2, "Jane Smith", "jane@email.com", "California"),
    (3, "Bob Johnson", "bob@email.com", "Texas"),
    (4, "Alice Brown", "alice@email.com", "Florida")
]

orders_data = [
    (101, 1, "2023-01-15", 250.00, "completed"),
    (102, 2, "2023-01-16", 180.50, "completed"),
    (103, 1, "2023-01-17", 320.75, "pending"),
    (104, 3, "2023-01-18", 95.25, "completed"),
    (105, 2, "2023-01-19", 450.00, "cancelled")
]

order_items_data = [
    (1, 101, 201, 2, 50.00),
    (2, 101, 202, 1, 150.00),
    (3, 102, 201, 3, 50.00),
    (4, 102, 203, 1, 30.50),
    (5, 103, 202, 2, 150.00),
    (6, 103, 204, 1, 20.75),
    (7, 104, 201, 1, 50.00),
    (8, 104, 205, 2, 22.63),
    (9, 105, 202, 3, 150.00)
]

products_data = [
    (201, "Laptop", "Electronics", 1200.00),
    (202, "Headphones", "Electronics", 150.00),
    (203, "Book", "Education", 30.50),
    (204, "Notebook", "Office", 20.75),
    (205, "Pen", "Office", 22.63)
]

categories_data = [
    ("Electronics", "Tech products and gadgets"),
    ("Education", "Books and learning materials"),
    ("Office", "Office supplies and stationery")
]

# Create DataFrames
customers = spark.createDataFrame(customers_data, 
    ["customer_id", "name", "email", "state"])

orders = spark.createDataFrame(orders_data, 
    ["order_id", "customer_id", "order_date", "total_amount", "status"])

order_items = spark.createDataFrame(order_items_data, 
    ["item_id", "order_id", "product_id", "quantity", "unit_price"])

products = spark.createDataFrame(products_data, 
    ["product_id", "product_name", "category", "list_price"])

categories = spark.createDataFrame(categories_data, 
    ["category_name", "description"])

In [6]:
df_j = customers.join(orders, "customer_id", how="inner")
df_j.show(10, truncate=False)

+-----------+-----------+--------------+----------+--------+----------+------------+---------+
|customer_id|name       |email         |state     |order_id|order_date|total_amount|status   |
+-----------+-----------+--------------+----------+--------+----------+------------+---------+
|1          |John Doe   |john@email.com|New York  |101     |2023-01-15|250.0       |completed|
|1          |John Doe   |john@email.com|New York  |103     |2023-01-17|320.75      |pending  |
|2          |Jane Smith |jane@email.com|California|102     |2023-01-16|180.5       |completed|
|2          |Jane Smith |jane@email.com|California|105     |2023-01-19|450.0       |cancelled|
|3          |Bob Johnson|bob@email.com |Texas     |104     |2023-01-18|95.25       |completed|
+-----------+-----------+--------------+----------+--------+----------+------------+---------+



In [8]:
df_j2 = customers.alias("c").join(orders.alias("o"), 
    (col("c.customer_id") == col("o.customer_id")) & (col("o.order_id") != lit(102)),
     how="left")
df_j2.show(10, truncate=False)

+-----------+-----------+---------------+----------+--------+-----------+----------+------------+---------+
|customer_id|name       |email          |state     |order_id|customer_id|order_date|total_amount|status   |
+-----------+-----------+---------------+----------+--------+-----------+----------+------------+---------+
|1          |John Doe   |john@email.com |New York  |103     |1          |2023-01-17|320.75      |pending  |
|1          |John Doe   |john@email.com |New York  |101     |1          |2023-01-15|250.0       |completed|
|2          |Jane Smith |jane@email.com |California|105     |2          |2023-01-19|450.0       |cancelled|
|3          |Bob Johnson|bob@email.com  |Texas     |104     |3          |2023-01-18|95.25       |completed|
|4          |Alice Brown|alice@email.com|Florida   |NULL    |NULL       |NULL      |NULL        |NULL     |
+-----------+-----------+---------------+----------+--------+-----------+----------+------------+---------+



In [10]:
import random

def random_int(min_value, max_value, size):
    return [random.randint(min_value, max_value) for _ in range(size)]

udf_random_int = udf(random_int, ArrayType(IntegerType()))

df_order = orders.withColumn("random_id", udf_random_int(lit(1), lit(100), lit(4)))
df_order.show(truncate=False)


+--------+-----------+----------+------------+---------+----------------+
|order_id|customer_id|order_date|total_amount|status   |random_id       |
+--------+-----------+----------+------------+---------+----------------+
|101     |1          |2023-01-15|250.0       |completed|[37, 29, 89, 61]|
|102     |2          |2023-01-16|180.5       |completed|[12, 76, 37, 24]|
|103     |1          |2023-01-17|320.75      |pending  |[50, 99, 56, 16]|
|104     |3          |2023-01-18|95.25       |completed|[82, 60, 41, 38]|
|105     |2          |2023-01-19|450.0       |cancelled|[96, 9, 27, 99] |
+--------+-----------+----------+------------+---------+----------------+



In [12]:

@udf(IntegerType())
def array_sum(arr):
    val = 0
    for item in arr:
        val += item
    return val

In [13]:
df_order.withColumn("random_sum", array_sum(col("random_id"))).show(10, truncate=False)

+--------+-----------+----------+------------+---------+----------------+----------+
|order_id|customer_id|order_date|total_amount|status   |random_id       |random_sum|
+--------+-----------+----------+------------+---------+----------------+----------+
|101     |1          |2023-01-15|250.0       |completed|[28, 86, 35, 41]|317       |
|102     |2          |2023-01-16|180.5       |completed|[74, 73, 54, 42]|154       |
|103     |1          |2023-01-17|320.75      |pending  |[88, 28, 82, 64]|206       |
|104     |3          |2023-01-18|95.25       |completed|[1, 32, 28, 74] |124       |
|105     |2          |2023-01-19|450.0       |cancelled|[45, 2, 72, 62] |236       |
+--------+-----------+----------+------------+---------+----------------+----------+

