In [None]:
# -----------------------------------------------------------------------------
# 환경 설정: SparkSession 및 테스트 데이터 준비
# -----------------------------------------------------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, lit, when, count, sum, avg, min, max,
    countDistinct, first, last, collect_list, collect_set,
    round as spark_round, expr
)
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pandas as pd
import numpy as np
import os

# Build the SparkSession used by the rest of the notebook.
# getOrCreate() is the final call of the builder chain; the session runs with
# master "local[*]" and "spark.sql.shuffle.partitions" set to 10.
spark = (
    SparkSession.builder
    .appName("PySpark-Aggregation-Join")
    .master("local[*]")
    .config("spark.sql.shuffle.partitions", 10)
    .getOrCreate()
)

# Directory that receives the generated CSV fixtures.
os.makedirs("/tmp/spark_tutorial", exist_ok=True)

# Fixed seed: the random columns below are drawn from one shared stream, so
# the order of the np.random calls determines the generated values.
np.random.seed(42)

# -----------------------------------------------------------------------------
# Test data 1: employee records
# -----------------------------------------------------------------------------
# Draw order matters: department -> salary -> age -> hire_year.
_emp_numbers = range(1, 51)
_emp_department = np.random.choice(
    ["Engineering", "Sales", "Marketing", "HR", "Finance"], size=50
)
_emp_salary = np.random.randint(40000, 120000, size=50)
_emp_age = np.random.randint(25, 55, size=50)
_emp_hire_year = np.random.choice([2020, 2021, 2022, 2023, 2024], size=50)

employees = pd.DataFrame({
    "emp_id": [f"E{i:03d}" for i in _emp_numbers],
    "name": [f"Employee_{i}" for i in _emp_numbers],
    "department": _emp_department,
    "salary": _emp_salary,
    "age": _emp_age,
    "hire_year": _emp_hire_year,
})
employees.to_csv("/tmp/spark_tutorial/employees.csv", index=False)

# -----------------------------------------------------------------------------
# Test data 2: department records (used for joins)
# -----------------------------------------------------------------------------
# "Legal" is listed here but is not among the departments sampled for
# employees above.
_dept_rows = [
    ("Engineering", "Alice", 500000, "Seoul"),
    ("Sales", "Bob", 300000, "Busan"),
    ("Marketing", "Charlie", 200000, "Seoul"),
    ("HR", "Diana", 150000, "Daegu"),
    ("Finance", "Eve", 400000, "Seoul"),
    ("Legal", "Frank", 100000, "Incheon"),
]
departments = pd.DataFrame(
    _dept_rows,
    columns=["dept_name", "dept_head", "budget", "location"],
)
departments.to_csv("/tmp/spark_tutorial/departments.csv", index=False)

# -----------------------------------------------------------------------------
# Test data 3: sales records (daily time series)
# -----------------------------------------------------------------------------
# 100 consecutive days starting 2026-01-01, stored as "YYYY-MM-DD" strings.
_sale_days = pd.date_range("2026-01-01", periods=100, freq="D")
# Draw order matters: product -> region -> amount -> quantity.
_sale_product = np.random.choice(["A", "B", "C"], size=100)
_sale_region = np.random.choice(["Seoul", "Busan", "Daegu"], size=100)
_sale_amount = np.random.randint(100, 1000, size=100)
_sale_quantity = np.random.randint(1, 20, size=100)

sales = pd.DataFrame({
    "date": _sale_days.strftime("%Y-%m-%d"),
    "product": _sale_product,
    "region": _sale_region,
    "amount": _sale_amount,
    "quantity": _sale_quantity,
})
sales.to_csv("/tmp/spark_tutorial/sales.csv", index=False)

# Load the three CSV fixtures into Spark DataFrames.
# Every file is read with the same options: the first row is the header and
# column types are inferred from the data.
_csv_options = {"header": True, "inferSchema": True}
df_emp = spark.read.csv("/tmp/spark_tutorial/employees.csv", **_csv_options)
df_dept = spark.read.csv("/tmp/spark_tutorial/departments.csv", **_csv_options)
df_sales = spark.read.csv("/tmp/spark_tutorial/sales.csv", **_csv_options)

# Report completion along with the row count of each DataFrame.
print("데이터 로드 완료!")
print(f"직원: {df_emp.count()}명, 부서: {df_dept.count()}개, 매출: {df_sales.count()}건")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/21 00:26:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


데이터 로드 완료!
직원: 50명, 부서: 6개, 매출: 100건


26/01/21 00:27:03 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


#### 윈도우 집계 (Window Aggregation)