In [1]:
"hello"

'hello'

In [2]:
# -----------------------------------------------------------------------------
# 환경 설정: SparkSession 생성 및 데이터 로드
# -----------------------------------------------------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, upper, lower, concat, substring
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, DateType
import pandas as pd
import numpy as np
import os

# SparkSession 생성 (이전 교시에서 이미 있으면 재사용)
spark = SparkSession.builder \
    .appName("PySpark-Column-Filter") \
    .master("local[*]") \
    .config("spark.sql.shuffle.partitions", 10) \
    .getOrCreate()

# 테스트 데이터 생성 (이전 교시 데이터 없으면 새로 생성)
np.random.seed(42)

os.makedirs("/tmp/spark_tutorial", exist_ok=True)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/20 08:12:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/20 08:12:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# 샘플 데이터
sample_data = pd.DataFrame({
    "emp_id": [f"E{i:03d}" for i in range(1, 101)],
    "name": [f"Employee_{i}" for i in range(1, 101)],
    "department": np.random.choice(
        ["Engineering", "Sales", "Marketing", "HR", "Finance"], 100
    ),
    "salary": np.random.randint(40000, 120000, 100),
    "age": np.random.randint(25, 55, 100),
    "is_manager": np.random.choice([True, False], 100, p=[0.2, 0.8]),
})

In [4]:
# 결측치 추가
sample_data.loc[5:10, "salary"] = None
sample_data.loc[15:18, "department"] = None

sample_data.to_csv("/tmp/spark_tutorial/employees.csv", index=False)

In [None]:
# DataFrame 로드
df = spark.read.csv("/tmp/spark_tutorial/employees.csv", header=True, inferSchema=True)

print("데이터 로드 완료!")
print(f"행 수: {df.count()}, 컬럼: {df.columns}")
df.show(5)

데이터 로드 완료!
행 수: 100, 컬럼: ['emp_id', 'name', 'department', 'salary', 'age', 'is_manager']
+------+----------+----------+--------+---+----------+
|emp_id|      name|department|  salary|age|is_manager|
+------+----------+----------+--------+---+----------+
|  E001|Employee_1|        HR| 92251.0| 28|     false|
|  E002|Employee_2|   Finance| 62662.0| 43|     false|
|  E003|Employee_3| Marketing| 48392.0| 50|     false|
|  E004|Employee_4|   Finance| 70535.0| 27|     false|
|  E005|Employee_5|   Finance|118603.0| 43|     false|
+------+----------+----------+--------+---+----------+
only showing top 5 rows



26/01/20 08:13:10 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
# -----------------------------------------------------------------------------
# 컬럼 참조 방법 비교
# -----------------------------------------------------------------------------
from pyspark.sql.functions import col

# 방법 1: 문자열 (가장 단순)
# select, groupBy 등에서 컬럼명만 필요할 때
df.select("name", "salary").show(3)

# 방법 2: col() 함수 (가장 권장)
# 연산이나 조건식에서 사용
# col("컬럼명"): Column 객체 반환 → 연산 가능
df.select(col("name"), col("salary") * 1.1).show(3)

# 방법 3: df.컬럼명 (조인 시 유용)
# 여러 DataFrame을 다룰 때 어떤 테이블의 컬럼인지 명확히
df.select(df.name, df.salary).show(3)

+----------+-------+
|      name| salary|
+----------+-------+
|Employee_1|92251.0|
|Employee_2|62662.0|
|Employee_3|48392.0|
+----------+-------+
only showing top 3 rows

+----------+------------------+
|      name|    (salary * 1.1)|
+----------+------------------+
|Employee_1|          101476.1|
|Employee_2| 68928.20000000001|
|Employee_3|53231.200000000004|
+----------+------------------+
only showing top 3 rows

+----------+-------+
|      name| salary|
+----------+-------+
|Employee_1|92251.0|
|Employee_2|62662.0|
|Employee_3|48392.0|
+----------+-------+
only showing top 3 rows



In [7]:
# -----------------------------------------------------------------------------
# col() vs 문자열: 연산 가능 여부
# -----------------------------------------------------------------------------

# 문자열은 연산 불가 (에러 발생)
# df.select("salary" * 1.1)  # TypeError!

# col()은 연산 가능
# col("salary"): salary 컬럼을 Column 객체로 반환
# * 1.1: 모든 값에 1.1 곱하기
df.select(
    col("name"),                    # 이름 그대로
    col("salary"),                  # 원래 급여
    col("salary") * 1.1             # 급여 10% 인상
).show(5)

+----------+--------+------------------+
|      name|  salary|    (salary * 1.1)|
+----------+--------+------------------+
|Employee_1| 92251.0|          101476.1|
|Employee_2| 62662.0| 68928.20000000001|
|Employee_3| 48392.0|53231.200000000004|
|Employee_4| 70535.0|           77588.5|
|Employee_5|118603.0|130463.30000000002|
+----------+--------+------------------+
only showing top 5 rows



In [None]:
#실습 1-1: 컬럼 참조 방법 비교
#name과 age 컬럼을 col() 함수를 사용하여 선택하세요.
df.select(col("name"), col("age")).show(5)

+-----------+---+
|       name|age|
+-----------+---+
| Employee_1| 28|
| Employee_2| 43|
| Employee_3| 50|
| Employee_4| 27|
| Employee_5| 43|
| Employee_6| 44|
| Employee_7| 31|
| Employee_8| 44|
| Employee_9| 33|
|Employee_10| 25|
|Employee_11| 32|
|Employee_12| 31|
|Employee_13| 42|
|Employee_14| 32|
|Employee_15| 25|
|Employee_16| 35|
|Employee_17| 52|
|Employee_18| 49|
|Employee_19| 49|
|Employee_20| 42|
+-----------+---+
only showing top 20 rows



In [11]:
#salary 컬럼에 2를 곱한 결과를 조회하세요.
df.select(col("name"), col("salary"), col("salary") * 2).show(5)

+----------+--------+------------+
|      name|  salary|(salary * 2)|
+----------+--------+------------+
|Employee_1| 92251.0|    184502.0|
|Employee_2| 62662.0|    125324.0|
|Employee_3| 48392.0|     96784.0|
|Employee_4| 70535.0|    141070.0|
|Employee_5|118603.0|    237206.0|
+----------+--------+------------+
only showing top 5 rows



In [None]:
#df.name과 df.department를 사용하여 두 컬럼을 조회하세요.
df.select(df.name, df.department).show(5)

+-----------+----------+
|       name|department|
+-----------+----------+
| Employee_1|        HR|
| Employee_2|   Finance|
| Employee_3| Marketing|
| Employee_4|   Finance|
| Employee_5|   Finance|
| Employee_6|     Sales|
| Employee_7| Marketing|
| Employee_8| Marketing|
| Employee_9| Marketing|
|Employee_10|   Finance|
|Employee_11|        HR|
|Employee_12| Marketing|
|Employee_13|   Finance|
|Employee_14|     Sales|
|Employee_15|        HR|
|Employee_16|      NULL|
|Employee_17|      NULL|
|Employee_18|      NULL|
|Employee_19|      NULL|
|Employee_20|        HR|
+-----------+----------+
only showing top 20 rows



###컬럼 선택 (select)

In [36]:
# 기본 형태
df.select("col1", "col2", "col3")
df.select(col("col1"), col("col2"))

# 모든 컬럼
df.select("*")

# 연산과 함께
df.select("name", (col("salary") * 1.1).alias("new_salary"))

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `col1` cannot be resolved. Did you mean one of the following? [`age`, `name`, `salary`, `emp_id`, `department`].;
'Project ['col1, 'col2, 'col3]
+- Relation [emp_id#17,name#18,department#19,salary#20,age#21,is_manager#22] csv


In [18]:
# -----------------------------------------------------------------------------
# select(): 기본 사용법
# -----------------------------------------------------------------------------

# 단일 컬럼 선택
# 결과: 해당 컬럼만 포함된 새 DataFrame 반환
df.select("name").show(5)

# 여러 컬럼 선택
# 쉼표로 구분하여 나열
df.select("emp_id", "name", "department").show(5)

+----------+
|      name|
+----------+
|Employee_1|
|Employee_2|
|Employee_3|
|Employee_4|
|Employee_5|
+----------+
only showing top 5 rows

+------+----------+----------+
|emp_id|      name|department|
+------+----------+----------+
|  E001|Employee_1|        HR|
|  E002|Employee_2|   Finance|
|  E003|Employee_3| Marketing|
|  E004|Employee_4|   Finance|
|  E005|Employee_5|   Finance|
+------+----------+----------+
only showing top 5 rows



In [19]:
# -----------------------------------------------------------------------------
# select(): 컬럼 연산과 별칭(alias)
# -----------------------------------------------------------------------------

# alias(): 컬럼에 새 이름 부여
# (col("salary") * 1.1).alias("raised_salary"): 계산 결과에 이름 지정
result = df.select(
    col("name"),                                    # 이름 그대로
    col("salary"),                                  # 원래 급여
    (col("salary") * 1.1).alias("raised_salary"),  # 10% 인상 급여 (새 이름)
    (col("salary") / 12).alias("monthly_salary")   # 월급 (새 이름)
)

print("=== 급여 계산 ===")
result.show(5)

=== 급여 계산 ===
+----------+--------+------------------+------------------+
|      name|  salary|     raised_salary|    monthly_salary|
+----------+--------+------------------+------------------+
|Employee_1| 92251.0|          101476.1| 7687.583333333333|
|Employee_2| 62662.0| 68928.20000000001| 5221.833333333333|
|Employee_3| 48392.0|53231.200000000004|4032.6666666666665|
|Employee_4| 70535.0|           77588.5| 5877.916666666667|
|Employee_5|118603.0|130463.30000000002| 9883.583333333334|
+----------+--------+------------------+------------------+
only showing top 5 rows



In [20]:
# -----------------------------------------------------------------------------
# select(): 컬럼 순서 변경, 특정 컬럼 제외
# -----------------------------------------------------------------------------

# 컬럼 순서 변경: select에 원하는 순서로 나열
reordered = df.select("department", "name", "salary", "emp_id")
print("=== 컬럼 순서 변경 ===")
reordered.show(3)

# 특정 컬럼 제외: drop() 사용 (select의 반대)
# drop(): 지정한 컬럼을 제외한 나머지 반환
without_manager = df.drop("is_manager")
print("=== is_manager 컬럼 제외 ===")
without_manager.show(3)

=== 컬럼 순서 변경 ===
+----------+----------+-------+------+
|department|      name| salary|emp_id|
+----------+----------+-------+------+
|        HR|Employee_1|92251.0|  E001|
|   Finance|Employee_2|62662.0|  E002|
| Marketing|Employee_3|48392.0|  E003|
+----------+----------+-------+------+
only showing top 3 rows

=== is_manager 컬럼 제외 ===
+------+----------+----------+-------+---+
|emp_id|      name|department| salary|age|
+------+----------+----------+-------+---+
|  E001|Employee_1|        HR|92251.0| 28|
|  E002|Employee_2|   Finance|62662.0| 43|
|  E003|Employee_3| Marketing|48392.0| 50|
+------+----------+----------+-------+---+
only showing top 3 rows



In [21]:
# -----------------------------------------------------------------------------
# select(): 동적 컬럼 선택 (리스트 활용)
# -----------------------------------------------------------------------------

# 컬럼 목록을 변수로 관리
cols_to_select = ["emp_id", "name", "salary"]

# * 연산자로 리스트 언패킹
# *cols_to_select: ["a", "b"] → "a", "b"로 풀어줌
df.select(*cols_to_select).show(5)

# 조건에 따라 컬럼 선택
numeric_cols = ["salary", "age"]
df.select(*numeric_cols).describe().show()

+------+----------+--------+
|emp_id|      name|  salary|
+------+----------+--------+
|  E001|Employee_1| 92251.0|
|  E002|Employee_2| 62662.0|
|  E003|Employee_3| 48392.0|
|  E004|Employee_4| 70535.0|
|  E005|Employee_5|118603.0|
+------+----------+--------+
only showing top 5 rows

+-------+------------------+-----------------+
|summary|            salary|              age|
+-------+------------------+-----------------+
|  count|                94|              100|
|   mean| 80864.46808510639|            38.83|
| stddev|22171.614668892573|9.084402216786948|
|    min|           40206.0|               25|
|    max|          119309.0|               54|
+-------+------------------+-----------------+



In [23]:
##실습 2-1: 기본 컬럼 선택
##기본 컬럼 선택 emp_id, name, salary 3개 컬럼만 선택하세요.
df.select("emp_id", "name", "salary").show(5)

+------+----------+--------+
|emp_id|      name|  salary|
+------+----------+--------+
|  E001|Employee_1| 92251.0|
|  E002|Employee_2| 62662.0|
|  E003|Employee_3| 48392.0|
|  E004|Employee_4| 70535.0|
|  E005|Employee_5|118603.0|
+------+----------+--------+
only showing top 5 rows



In [25]:
##실습 2-2: 컬럼 연산과 별칭
##salary 컬럼을 1000으로 나눈 값을 "salary_k"라는 이름으로 조회하세요.
df.select(col("name"),
    col("salary"),
    (col("salary") / 1000).alias("raised_salary")).show(5)

+----------+--------+-------------+
|      name|  salary|raised_salary|
+----------+--------+-------------+
|Employee_1| 92251.0|       92.251|
|Employee_2| 62662.0|       62.662|
|Employee_3| 48392.0|       48.392|
|Employee_4| 70535.0|       70.535|
|Employee_5|118603.0|      118.603|
+----------+--------+-------------+
only showing top 5 rows



In [26]:
##실습 2-3: 컬럼 제외
##is_manager 컬럼을 제외한 나머지 컬럼을 조회하세요.
df.drop("is_manager").show(5)

+------+----------+----------+--------+---+
|emp_id|      name|department|  salary|age|
+------+----------+----------+--------+---+
|  E001|Employee_1|        HR| 92251.0| 28|
|  E002|Employee_2|   Finance| 62662.0| 43|
|  E003|Employee_3| Marketing| 48392.0| 50|
|  E004|Employee_4|   Finance| 70535.0| 27|
|  E005|Employee_5|   Finance|118603.0| 43|
+------+----------+----------+--------+---+
only showing top 5 rows



In [28]:
##실습 2-4: 리스트로 컬럼 선택
##컬럼 목록 리스트 ["name", "department"]를 사용하여 해당 컬럼들을 선택하세요.
col_lest = ["name", "department"]
df.select(*col_lest).show(5)

+----------+----------+
|      name|department|
+----------+----------+
|Employee_1|        HR|
|Employee_2|   Finance|
|Employee_3| Marketing|
|Employee_4|   Finance|
|Employee_5|   Finance|
+----------+----------+
only showing top 5 rows




**Spark DataFrame은 불변(Immutable)**

Pandas:
df["new_col"] = df["old_col"] * 2  # 원본 df 변경

Spark:
df.withColumn("new_col", col("old_col") * 2)  # 새 df 반환
df = df.withColumn(...)  # 결과를 다시 할당해야 유지

★ 중요: withColumn() 결과를 변수에 저장하지 않으면 사라짐!

In [29]:
# -----------------------------------------------------------------------------
# withColumn(): 새 컬럼 추가
# -----------------------------------------------------------------------------

# withColumn(컬럼명, 표현식): 새 컬럼 추가 또는 기존 컬럼 덮어쓰기
# - 첫 번째 인자: 새 컬럼 이름 (문자열)
# - 두 번째 인자: 컬럼 값을 계산하는 표현식 (Column 객체)

# 새 컬럼 추가: 연봉을 월급으로 변환
# col("salary") / 12: salary 컬럼의 모든 값을 12로 나눔
df_with_monthly = df.withColumn(
    "monthly_salary",       # 새 컬럼 이름
    col("salary") / 12      # 계산식
)

print("=== 월급 컬럼 추가 ===")
df_with_monthly.select("name", "salary", "monthly_salary").show(5)

=== 월급 컬럼 추가 ===
+----------+--------+------------------+
|      name|  salary|    monthly_salary|
+----------+--------+------------------+
|Employee_1| 92251.0| 7687.583333333333|
|Employee_2| 62662.0| 5221.833333333333|
|Employee_3| 48392.0|4032.6666666666665|
|Employee_4| 70535.0| 5877.916666666667|
|Employee_5|118603.0| 9883.583333333334|
+----------+--------+------------------+
only showing top 5 rows



In [30]:
# -----------------------------------------------------------------------------
# withColumn(): 상수값 추가 (lit 함수)
# -----------------------------------------------------------------------------

# lit(): 상수(리터럴) 값을 Column으로 변환
# - col("name") → 컬럼 참조
# - lit("Korea") → 상수값 "Korea"
#
# 모든 행에 같은 값을 넣을 때 사용

# 상수 컬럼 추가
df_with_country = df.withColumn(
    "country",      # 컬럼명
    lit("Korea")    # 모든 행에 "Korea" 값
)

print("=== 상수 컬럼 추가 ===")
df_with_country.select("name", "country").show(5)

=== 상수 컬럼 추가 ===
+----------+-------+
|      name|country|
+----------+-------+
|Employee_1|  Korea|
|Employee_2|  Korea|
|Employee_3|  Korea|
|Employee_4|  Korea|
|Employee_5|  Korea|
+----------+-------+
only showing top 5 rows



In [31]:
# -----------------------------------------------------------------------------
# withColumn(): 기존 컬럼 수정 (덮어쓰기)
# -----------------------------------------------------------------------------

# 같은 컬럼명을 사용하면 기존 컬럼을 덮어씀
# salary 컬럼을 10% 인상된 값으로 교체
df_raised = df.withColumn(
    "salary",           # 기존 컬럼명 (덮어쓰기)
    col("salary") * 1.1 # 10% 인상
)

print("=== 급여 10% 인상 (원본 비교) ===")
print("원본:")
df.select("name", "salary").show(3)
print("수정 후:")
df_raised.select("name", "salary").show(3)

=== 급여 10% 인상 (원본 비교) ===
원본:
+----------+-------+
|      name| salary|
+----------+-------+
|Employee_1|92251.0|
|Employee_2|62662.0|
|Employee_3|48392.0|
+----------+-------+
only showing top 3 rows

수정 후:
+----------+------------------+
|      name|            salary|
+----------+------------------+
|Employee_1|          101476.1|
|Employee_2| 68928.20000000001|
|Employee_3|53231.200000000004|
+----------+------------------+
only showing top 3 rows



In [32]:
# -----------------------------------------------------------------------------
# withColumn(): 체이닝 (여러 컬럼 한번에)
# -----------------------------------------------------------------------------

# 여러 withColumn을 연속으로 호출 (메서드 체이닝)
# 각 withColumn은 새 DataFrame을 반환하므로 연결 가능
df_enhanced = (
    df
    # 연봉에서 월급 계산
    .withColumn("monthly_salary", col("salary") / 12)
    # 연봉에서 일급 계산 (연 250일 근무 가정)
    .withColumn("daily_salary", col("salary") / 250)
    # 국가 추가
    .withColumn("country", lit("Korea"))
    # 연도 추가
    .withColumn("year", lit(2024))
)

print("=== 여러 컬럼 추가 ===")
df_enhanced.show(5)

=== 여러 컬럼 추가 ===
+------+----------+----------+--------+---+----------+------------------+------------+-------+----+
|emp_id|      name|department|  salary|age|is_manager|    monthly_salary|daily_salary|country|year|
+------+----------+----------+--------+---+----------+------------------+------------+-------+----+
|  E001|Employee_1|        HR| 92251.0| 28|     false| 7687.583333333333|     369.004|  Korea|2024|
|  E002|Employee_2|   Finance| 62662.0| 43|     false| 5221.833333333333|     250.648|  Korea|2024|
|  E003|Employee_3| Marketing| 48392.0| 50|     false|4032.6666666666665|     193.568|  Korea|2024|
|  E004|Employee_4|   Finance| 70535.0| 27|     false| 5877.916666666667|      282.14|  Korea|2024|
|  E005|Employee_5|   Finance|118603.0| 43|     false| 9883.583333333334|     474.412|  Korea|2024|
+------+----------+----------+--------+---+----------+------------------+------------+-------+----+
only showing top 5 rows



In [33]:
##실습 3-1: 새 컬럼 추가
##age에 1을 더한 "next_age" 컬럼을 추가하세요.
new_df = df.withColumn(('next_age'),col("age") + 1)
new_df.show(5)

+------+----------+----------+--------+---+----------+--------+
|emp_id|      name|department|  salary|age|is_manager|next_age|
+------+----------+----------+--------+---+----------+--------+
|  E001|Employee_1|        HR| 92251.0| 28|     false|      29|
|  E002|Employee_2|   Finance| 62662.0| 43|     false|      44|
|  E003|Employee_3| Marketing| 48392.0| 50|     false|      51|
|  E004|Employee_4|   Finance| 70535.0| 27|     false|      28|
|  E005|Employee_5|   Finance|118603.0| 43|     false|      44|
+------+----------+----------+--------+---+----------+--------+
only showing top 5 rows



In [38]:
##실습 3-2: 상수 컬럼 추가
##모든 행에 "2024"라는 값을 가진 "year" 컬럼을 추가하세요.
new_df = df.withColumn("year", lit("2024")).select("name", "year").show(5)


+----------+----+
|      name|year|
+----------+----+
|Employee_1|2024|
|Employee_2|2024|
|Employee_3|2024|
|Employee_4|2024|
|Employee_5|2024|
+----------+----+
only showing top 5 rows



In [40]:
##실습 3-3: 기존 컬럼 수정
##age 컬럼의 값을 모두 10 증가시킨 DataFrame을 만드세요.

df_age = df.withColumn(
    "age",           # 기존 컬럼명 (덮어쓰기)
    col("age") + 10 
)
df_age.select("name", "age").show(5)

+----------+---+
|      name|age|
+----------+---+
|Employee_1| 38|
|Employee_2| 53|
|Employee_3| 60|
|Employee_4| 37|
|Employee_5| 53|
+----------+---+
only showing top 5 rows



In [41]:
##실습 3-4: 여러 컬럼 추가 (체이닝)
##salary의 20%를 "tax", salary에서 tax를 뺀 "net_salary" 컬럼을 추가하세요.
df_enhanced = (
    df
    .withColumn("tax", col("salary") * 0.2)
    .withColumn("net_salary", col("salary") - col("salary") * 0.2)
)

df_enhanced.select("name", "salary", "tax", "net_salary").show(5)

+----------+--------+------------------+----------+
|      name|  salary|               tax|net_salary|
+----------+--------+------------------+----------+
|Employee_1| 92251.0|           18450.2|   73800.8|
|Employee_2| 62662.0|12532.400000000001|   50129.6|
|Employee_3| 48392.0|            9678.4|   38713.6|
|Employee_4| 70535.0|           14107.0|   56428.0|
|Employee_5|118603.0|23720.600000000002|   94882.4|
+----------+--------+------------------+----------+
only showing top 5 rows



In [42]:
##Part 4: 컬럼 이름 변경과 삭제
# -----------------------------------------------------------------------------
# withColumnRenamed(): 컬럼 이름 변경
# -----------------------------------------------------------------------------

# withColumnRenamed(기존이름, 새이름): 단일 컬럼 이름 변경
# 원본 DataFrame은 변경되지 않음 (새 DataFrame 반환)
df_renamed = df.withColumnRenamed("emp_id", "employee_id")

print("=== 컬럼명 변경: emp_id → employee_id ===")
df_renamed.printSchema()

=== 컬럼명 변경: emp_id → employee_id ===
root
 |-- employee_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- is_manager: boolean (nullable = true)



In [43]:
# -----------------------------------------------------------------------------
# 여러 컬럼 이름 변경
# -----------------------------------------------------------------------------

# 방법 1: withColumnRenamed 체이닝
df_multi_renamed = (
    df
    .withColumnRenamed("emp_id", "employee_id")
    .withColumnRenamed("department", "dept")
    .withColumnRenamed("is_manager", "manager_flag")
)

print("=== 여러 컬럼명 변경 ===")
df_multi_renamed.printSchema()

# 방법 2: toDF() - 모든 컬럼명 한번에 변경
# 주의: 컬럼 순서와 개수가 정확히 일치해야 함
# df.toDF("new_col1", "new_col2", ...) - 모든 컬럼에 새 이름 지정

=== 여러 컬럼명 변경 ===
root
 |-- employee_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- manager_flag: boolean (nullable = true)



In [44]:
# -----------------------------------------------------------------------------
# drop(): 컬럼 삭제
# -----------------------------------------------------------------------------

# drop(컬럼명): 지정한 컬럼 제거
# 여러 컬럼 제거: drop("col1", "col2") 또는 drop("col1").drop("col2")

# 단일 컬럼 삭제
df_no_manager = df.drop("is_manager")

print("=== is_manager 컬럼 삭제 ===")
print(f"삭제 전 컬럼: {df.columns}")
print(f"삭제 후 컬럼: {df_no_manager.columns}")

# 여러 컬럼 삭제
df_minimal = df.drop("is_manager", "age")
print(f"여러 컬럼 삭제 후: {df_minimal.columns}")

=== is_manager 컬럼 삭제 ===
삭제 전 컬럼: ['emp_id', 'name', 'department', 'salary', 'age', 'is_manager']
삭제 후 컬럼: ['emp_id', 'name', 'department', 'salary', 'age']
여러 컬럼 삭제 후: ['emp_id', 'name', 'department', 'salary']


In [47]:
##실습 4-1: 컬럼 이름 변경
##salary 컬럼의 이름을 "annual_salary"로 변경하세요.
df_renamed = df.withColumnRenamed("salary","annual_salary")
df_renamed.printSchema()

root
 |-- emp_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- annual_salary: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- is_manager: boolean (nullable = true)



In [49]:
##실습 4-2: 여러 컬럼 이름 변경
##emp_id를 "id"로, name을 "employee_name"으로 변경하세요.

df_multi_renamed = (
    df
    .withColumnRenamed("emp_id", "id")
    .withColumnRenamed("name", "employee_name")
)
df_multi_renamed.printSchema()

root
 |-- id: string (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- is_manager: boolean (nullable = true)



In [50]:
#실습 4-3: 컬럼 삭제
#age와 is_manager 두 컬럼을 삭제하세요.

df_minimal = df.drop("age", "is_manager")
df_minimal.printSchema()

root
 |-- emp_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: double (nullable = true)



In [51]:
#Part 5: 필터링 (filter / where)
# -----------------------------------------------------------------------------
# filter(): 기본 필터링
# -----------------------------------------------------------------------------

# 조건: col("컬럼") 연산자 값
# 결과: 조건을 만족하는 행만 포함된 새 DataFrame

# 급여가 80000 이상인 직원
high_salary = df.filter(col("salary") >= 80000)
print(f"=== 고연봉자 (80000 이상): {high_salary.count()}명 ===")
high_salary.show(5)

# 특정 부서 직원
engineers = df.filter(col("department") == "Engineering")
print(f"=== Engineering 부서: {engineers.count()}명 ===")
engineers.show(5)

=== 고연봉자 (80000 이상): 51명 ===
+------+-----------+----------+--------+---+----------+
|emp_id|       name|department|  salary|age|is_manager|
+------+-----------+----------+--------+---+----------+
|  E001| Employee_1|        HR| 92251.0| 28|     false|
|  E005| Employee_5|   Finance|118603.0| 43|     false|
|  E013|Employee_13|   Finance|110592.0| 42|     false|
|  E015|Employee_15|        HR|119309.0| 25|      true|
|  E017|Employee_17|      NULL| 92992.0| 52|     false|
+------+-----------+----------+--------+---+----------+
only showing top 5 rows

=== Engineering 부서: 17명 ===
+------+-----------+-----------+--------+---+----------+
|emp_id|       name| department|  salary|age|is_manager|
+------+-----------+-----------+--------+---+----------+
|  E024|Employee_24|Engineering| 58141.0| 27|     false|
|  E025|Employee_25|Engineering|111910.0| 31|     false|
|  E034|Employee_34|Engineering|104044.0| 26|     false|
|  E039|Employee_39|Engineering| 65939.0| 29|     false|
|  E042|Employe

In [52]:
# -----------------------------------------------------------------------------
# filter(): AND / OR 조건 (괄호 필수!)
# -----------------------------------------------------------------------------

# AND 조건: & 연산자 (각 조건을 괄호로 감싸야 함!)
# 이유: Python 연산자 우선순위 때문
# 틀린 예: col("age") > 30 & col("salary") > 70000  # 에러!
# 맞는 예: (col("age") > 30) & (col("salary") > 70000)

# AND: 30세 이상이면서 급여 70000 이상
senior_high = df.filter(
    (col("age") >= 30) & (col("salary") >= 70000)
)
print(f"=== 30세 이상 AND 고연봉: {senior_high.count()}명 ===")
senior_high.show(5)

# OR: Engineering이거나 Sales 부서
eng_or_sales = df.filter(
    (col("department") == "Engineering") | (col("department") == "Sales")
)
print(f"=== Engineering OR Sales: {eng_or_sales.count()}명 ===")

=== 30세 이상 AND 고연봉: 44명 ===
+------+-----------+----------+--------+---+----------+
|emp_id|       name|department|  salary|age|is_manager|
+------+-----------+----------+--------+---+----------+
|  E005| Employee_5|   Finance|118603.0| 43|     false|
|  E013|Employee_13|   Finance|110592.0| 42|     false|
|  E017|Employee_17|      NULL| 92992.0| 52|     false|
|  E021|Employee_21|     Sales| 90636.0| 47|     false|
|  E022|Employee_22|   Finance| 90015.0| 54|     false|
+------+-----------+----------+--------+---+----------+
only showing top 5 rows

=== Engineering OR Sales: 37명 ===


In [53]:
# -----------------------------------------------------------------------------
# filter(): SQL 스타일 문자열 조건
# -----------------------------------------------------------------------------

# 문자열로 SQL WHERE 절처럼 작성 가능
# 더 직관적일 수 있음 (SQL에 익숙하면)

# SQL 스타일 필터링
df.filter("age >= 30 AND salary >= 70000").show(5)

# SQL 스타일: BETWEEN
df.filter("salary BETWEEN 50000 AND 80000").show(5)

# SQL 스타일: IN
df.filter("department IN ('Engineering', 'Sales', 'Marketing')").show(5)

# SQL 스타일: LIKE (문자열 패턴)
df.filter("name LIKE 'Employee_1%'").show(5)

+------+-----------+----------+--------+---+----------+
|emp_id|       name|department|  salary|age|is_manager|
+------+-----------+----------+--------+---+----------+
|  E005| Employee_5|   Finance|118603.0| 43|     false|
|  E013|Employee_13|   Finance|110592.0| 42|     false|
|  E017|Employee_17|      NULL| 92992.0| 52|     false|
|  E021|Employee_21|     Sales| 90636.0| 47|     false|
|  E022|Employee_22|   Finance| 90015.0| 54|     false|
+------+-----------+----------+--------+---+----------+
only showing top 5 rows

+------+-----------+----------+-------+---+----------+
|emp_id|       name|department| salary|age|is_manager|
+------+-----------+----------+-------+---+----------+
|  E002| Employee_2|   Finance|62662.0| 43|     false|
|  E004| Employee_4|   Finance|70535.0| 27|     false|
|  E012|Employee_12| Marketing|64538.0| 31|     false|
|  E016|Employee_16|      NULL|67266.0| 35|     false|
|  E020|Employee_20|        HR|63419.0| 42|     false|
+------+-----------+----------+

In [54]:
# -----------------------------------------------------------------------------
# filter(): isin(), isNull(), isNotNull()
# -----------------------------------------------------------------------------

# isin(): 여러 값 중 하나인지 확인
# SQL의 IN 절과 동일
target_depts = ["Engineering", "Sales"]
df.filter(col("department").isin(target_depts)).show(5)

# isin()에 리스트 직접 전달
df.filter(col("department").isin("HR", "Finance")).show(5)

# isNull(): NULL인 행만
df.filter(col("salary").isNull()).show()

# isNotNull(): NULL이 아닌 행만
df.filter(col("salary").isNotNull()).count()

+------+-----------+-----------+--------+---+----------+
|emp_id|       name| department|  salary|age|is_manager|
+------+-----------+-----------+--------+---+----------+
|  E006| Employee_6|      Sales|    NULL| 44|     false|
|  E014|Employee_14|      Sales| 48110.0| 32|     false|
|  E021|Employee_21|      Sales| 90636.0| 47|     false|
|  E024|Employee_24|Engineering| 58141.0| 27|     false|
|  E025|Employee_25|Engineering|111910.0| 31|     false|
+------+-----------+-----------+--------+---+----------+
only showing top 5 rows

+------+-----------+----------+--------+---+----------+
|emp_id|       name|department|  salary|age|is_manager|
+------+-----------+----------+--------+---+----------+
|  E001| Employee_1|        HR| 92251.0| 28|     false|
|  E002| Employee_2|   Finance| 62662.0| 43|     false|
|  E004| Employee_4|   Finance| 70535.0| 27|     false|
|  E005| Employee_5|   Finance|118603.0| 43|     false|
|  E010|Employee_10|   Finance|    NULL| 25|     false|
+------+------

94

In [55]:
# -----------------------------------------------------------------------------
# filter(): 문자열 조건 메서드
# -----------------------------------------------------------------------------

# startswith(): 특정 문자로 시작
df.filter(col("name").startswith("Employee_1")).show(5)

# endswith(): 특정 문자로 끝
df.filter(col("emp_id").endswith("5")).show(5)

# contains(): 특정 문자 포함
df.filter(col("department").contains("ing")).show(5)  # Engineering, Marketing

+------+-----------+----------+--------+---+----------+
|emp_id|       name|department|  salary|age|is_manager|
+------+-----------+----------+--------+---+----------+
|  E001| Employee_1|        HR| 92251.0| 28|     false|
|  E010|Employee_10|   Finance|    NULL| 25|     false|
|  E011|Employee_11|        HR|    NULL| 32|     false|
|  E012|Employee_12| Marketing| 64538.0| 31|     false|
|  E013|Employee_13|   Finance|110592.0| 42|     false|
+------+-----------+----------+--------+---+----------+
only showing top 5 rows

+------+-----------+-----------+--------+---+----------+
|emp_id|       name| department|  salary|age|is_manager|
+------+-----------+-----------+--------+---+----------+
|  E005| Employee_5|    Finance|118603.0| 43|     false|
|  E015|Employee_15|         HR|119309.0| 25|      true|
|  E025|Employee_25|Engineering|111910.0| 31|     false|
|  E035|Employee_35|  Marketing| 82557.0| 25|      true|
|  E045|Employee_45|      Sales|115766.0| 27|     false|
+------+-------

In [56]:
# -----------------------------------------------------------------------------
# where(): filter()와 동일 (SQL 친화적 이름)
# -----------------------------------------------------------------------------

# where()는 filter()의 별칭 (alias)
# SQL에 익숙한 사람을 위한 이름

# filter()와 완전히 동일한 동작
df.where(col("age") >= 30).show(5)
df.where("salary > 60000").show(5)

+------+----------+----------+--------+---+----------+
|emp_id|      name|department|  salary|age|is_manager|
+------+----------+----------+--------+---+----------+
|  E002|Employee_2|   Finance| 62662.0| 43|     false|
|  E003|Employee_3| Marketing| 48392.0| 50|     false|
|  E005|Employee_5|   Finance|118603.0| 43|     false|
|  E006|Employee_6|     Sales|    NULL| 44|     false|
|  E007|Employee_7| Marketing|    NULL| 31|     false|
+------+----------+----------+--------+---+----------+
only showing top 5 rows

+------+-----------+----------+--------+---+----------+
|emp_id|       name|department|  salary|age|is_manager|
+------+-----------+----------+--------+---+----------+
|  E001| Employee_1|        HR| 92251.0| 28|     false|
|  E002| Employee_2|   Finance| 62662.0| 43|     false|
|  E004| Employee_4|   Finance| 70535.0| 27|     false|
|  E005| Employee_5|   Finance|118603.0| 43|     false|
|  E012|Employee_12| Marketing| 64538.0| 31|     false|
+------+-----------+----------+-

In [57]:
##실습 5-1: 기본 필터링
##salary가 70000 이상인 직원만 조회하세요
df.filter(col("salary") >= 70000).select("name", "salary").show()

+-----------+--------+
|       name|  salary|
+-----------+--------+
| Employee_1| 92251.0|
| Employee_4| 70535.0|
| Employee_5|118603.0|
|Employee_13|110592.0|
|Employee_15|119309.0|
|Employee_17| 92992.0|
|Employee_21| 90636.0|
|Employee_22| 90015.0|
|Employee_23| 94268.0|
|Employee_25|111910.0|
|Employee_26| 96044.0|
|Employee_27|107214.0|
|Employee_28| 73827.0|
|Employee_29| 95820.0|
|Employee_30|102623.0|
|Employee_31|115450.0|
|Employee_33| 83585.0|
|Employee_34|104044.0|
|Employee_35| 82557.0|
|Employee_36| 89080.0|
+-----------+--------+
only showing top 20 rows



In [None]:
#department가 "Engineering"이면서 age가 35 이상인 직원을 조회하세요.
senior_high = df.filter(
    (col("age") >= 35) & (col("department") == "Engineering")
)
senior_high.show(5)
#.select("name", "department", "age").show()

+------+-----------+-----------+-------+---+----------+
|emp_id|       name| department| salary|age|is_manager|
+------+-----------+-----------+-------+---+----------+
|  E042|Employee_42|Engineering|61834.0| 47|     false|
|  E046|Employee_46|Engineering|55707.0| 43|     false|
|  E056|Employee_56|Engineering|98053.0| 44|      true|
|  E071|Employee_71|Engineering|92733.0| 53|     false|
|  E085|Employee_85|Engineering|92662.0| 38|     false|
+------+-----------+-----------+-------+---+----------+
only showing top 5 rows



In [None]:
#실습 5-3: OR 조건
#department가 "Sales" 또는 "Marketing"인 직원을 조회하세요.
condition_df = df.filter(
    (col("department") == "Sales") | (col("department") == "Marketing")
)

condition_df.show(5)
# 방법 2: isin
df.filter(col("department").isin("Sales", "Marketing")).show(5)

+------+----------+----------+-------+---+----------+
|emp_id|      name|department| salary|age|is_manager|
+------+----------+----------+-------+---+----------+
|  E003|Employee_3| Marketing|48392.0| 50|     false|
|  E006|Employee_6|     Sales|   NULL| 44|     false|
|  E007|Employee_7| Marketing|   NULL| 31|     false|
|  E008|Employee_8| Marketing|   NULL| 44|     false|
|  E009|Employee_9| Marketing|   NULL| 33|     false|
+------+----------+----------+-------+---+----------+
only showing top 5 rows

+------+----------+----------+-------+---+----------+
|emp_id|      name|department| salary|age|is_manager|
+------+----------+----------+-------+---+----------+
|  E003|Employee_3| Marketing|48392.0| 50|     false|
|  E006|Employee_6|     Sales|   NULL| 44|     false|
|  E007|Employee_7| Marketing|   NULL| 31|     false|
|  E008|Employee_8| Marketing|   NULL| 44|     false|
|  E009|Employee_9| Marketing|   NULL| 33|     false|
+------+----------+----------+-------+---+----------+
onl

In [68]:
#실습 5-4: NULL 필터링
#salary가 NULL인 행을 조회하세요.

df.filter(col("salary").isNull()).show()

+------+-----------+----------+------+---+----------+
|emp_id|       name|department|salary|age|is_manager|
+------+-----------+----------+------+---+----------+
|  E006| Employee_6|     Sales|  NULL| 44|     false|
|  E007| Employee_7| Marketing|  NULL| 31|     false|
|  E008| Employee_8| Marketing|  NULL| 44|     false|
|  E009| Employee_9| Marketing|  NULL| 33|     false|
|  E010|Employee_10|   Finance|  NULL| 25|     false|
|  E011|Employee_11|        HR|  NULL| 32|     false|
+------+-----------+----------+------+---+----------+



#### 실습 5-5: 문자열 조건
##### name이 "Employee_1"로 시작하는 직원을 조회하세요.

In [7]:
df.filter(col("name").startswith("Employee_1")).select("name").show()

+------------+
|        name|
+------------+
|  Employee_1|
| Employee_10|
| Employee_11|
| Employee_12|
| Employee_13|
| Employee_14|
| Employee_15|
| Employee_16|
| Employee_17|
| Employee_18|
| Employee_19|
|Employee_100|
+------------+



#### Part 6: 조건부 값 설정 (when / otherwise)
##### when/otherwise = SQL의 CASE WHEN

In [8]:
# -----------------------------------------------------------------------------
# when(): 단일 조건
# -----------------------------------------------------------------------------

# when(조건, 참일때값): 조건이 참이면 지정값, 거짓이면 null
# otherwise(값): 모든 when 조건이 거짓일 때 값

# 성인 여부 판단
df_adult = df.withColumn(
    "is_adult",
    # when(조건, 참일때값).otherwise(거짓일때값)
    when(col("age") >= 18, "Yes").otherwise("No")
)

df_adult.select("name", "age", "is_adult").show(5)

+----------+---+--------+
|      name|age|is_adult|
+----------+---+--------+
|Employee_1| 28|     Yes|
|Employee_2| 43|     Yes|
|Employee_3| 50|     Yes|
|Employee_4| 27|     Yes|
|Employee_5| 43|     Yes|
+----------+---+--------+
only showing top 5 rows



In [10]:
# -----------------------------------------------------------------------------
# when(): 다중 조건 (if-elif-else)
# -----------------------------------------------------------------------------

# 여러 when을 체이닝: 첫 번째로 참인 조건의 값 반환
# otherwise: 모든 조건이 거짓일 때 값 (생략 시 null)

# 연령대 분류
df_age_group = df.withColumn(
    "age_group",
    #첫번째 참인  조건에서  멈춤
    when(col("age") >= 50,"50대 이상") # 50 이상이면 "50대 이상"
    .when(col("age") >= 40,"40대") # 40~49면 "40대"
    .when(col("age") >= 30,"30대") # 30~39면 "30대"
    .otherwise("20대")   # 나머지는 "20대"
)

print("===연령대 분류=====")
df_age_group.select("name","age","age_group").show(10)
# 연령대별 인원 수
df_age_group.groupBy("age_group").count().show()

===연령대 분류=====
+-----------+---+---------+
|       name|age|age_group|
+-----------+---+---------+
| Employee_1| 28|     20대|
| Employee_2| 43|     40대|
| Employee_3| 50|50대 이상|
| Employee_4| 27|     20대|
| Employee_5| 43|     40대|
| Employee_6| 44|     40대|
| Employee_7| 31|     30대|
| Employee_8| 44|     40대|
| Employee_9| 33|     30대|
|Employee_10| 25|     20대|
+-----------+---+---------+
only showing top 10 rows

+---------+-----+
|age_group|count|
+---------+-----+
|     20대|   23|
|50대 이상|   13|
|     40대|   43|
|     30대|   21|
+---------+-----+



In [12]:
# -----------------------------------------------------------------------------
# when(): 급여 등급 분류 (실무 예제)
# -----------------------------------------------------------------------------

#급여 등급 분류
df_salary_grade = df.withColumn(
    "salary_grade",
    when(col("salary") >= 100000, "S")
    .when(col("salary") >= 100000, "A")
    .when(col("salary") >= 100000, "B")
    .when(col("salary") >= 100000, "C")
    .otherwise("D")
)
print("=== 급여 등급 ===")
df_salary_grade.select("name", "salary", "salary_grade").show(10)
# 등급별 인원 분포
df_salary_grade.groupBy("salary_grade").count().orderBy("salary_grade").show()

=== 급여 등급 ===
+-----------+--------+------------+
|       name|  salary|salary_grade|
+-----------+--------+------------+
| Employee_1| 92251.0|           D|
| Employee_2| 62662.0|           D|
| Employee_3| 48392.0|           D|
| Employee_4| 70535.0|           D|
| Employee_5|118603.0|           S|
| Employee_6|    NULL|           D|
| Employee_7|    NULL|           D|
| Employee_8|    NULL|           D|
| Employee_9|    NULL|           D|
|Employee_10|    NULL|           D|
+-----------+--------+------------+
only showing top 10 rows

+------------+-----+
|salary_grade|count|
+------------+-----+
|           D|   77|
|           S|   23|
+------------+-----+



#### when(): NULL 처리와 결합

In [13]:
# NULL을 특정 값으로 대체하면서 조건 분기
df_null_handled = df.withColumn(
    "salary_status",
    when(col("salary").isNull(), "미입력")           # NULL이면 "미입력"
    .when(col("salary") >= 80000, "고연봉")          # 8만 이상
    .when(col("salary") >= 50000, "중연봉")          # 5만~8만
    .otherwise("저연봉")                              # 5만 미만
)

print("=== NULL 처리 포함 급여 상태 ===")
df_null_handled.select("name", "salary", "salary_status").show(15)

=== NULL 처리 포함 급여 상태 ===
+-----------+--------+-------------+
|       name|  salary|salary_status|
+-----------+--------+-------------+
| Employee_1| 92251.0|       고연봉|
| Employee_2| 62662.0|       중연봉|
| Employee_3| 48392.0|       저연봉|
| Employee_4| 70535.0|       중연봉|
| Employee_5|118603.0|       고연봉|
| Employee_6|    NULL|       미입력|
| Employee_7|    NULL|       미입력|
| Employee_8|    NULL|       미입력|
| Employee_9|    NULL|       미입력|
|Employee_10|    NULL|       미입력|
|Employee_11|    NULL|       미입력|
|Employee_12| 64538.0|       중연봉|
|Employee_13|110592.0|       고연봉|
|Employee_14| 48110.0|       저연봉|
|Employee_15|119309.0|       고연봉|
+-----------+--------+-------------+
only showing top 15 rows



#### 실습 6-1: 단일 조건
##### is_manager가 True이면 "Manager", 아니면 "Staff"인 "role" 컬럼을 추가하세요.

In [17]:
df_adult = df.withColumn(
    "role",
    # when(조건, 참일때값).otherwise(거짓일때값)
    when(col("is_manager") == "True", "Manager")
    .otherwise("Staff")
).select("name", "is_manager", "role").show(5)

+----------+----------+-----+
|      name|is_manager| role|
+----------+----------+-----+
|Employee_1|     false|Staff|
|Employee_2|     false|Staff|
|Employee_3|     false|Staff|
|Employee_4|     false|Staff|
|Employee_5|     false|Staff|
+----------+----------+-----+
only showing top 5 rows



#### 실습 6-2: 다중 조건
##### age 기준으로 "age_band" 컬럼을 추가하세요:

In [20]:
df_age_grade = df.withColumn(
    "age_band",
    when(col("age") >= 40, "40+")
    .when(col("age") >= 30, "30s")
    .otherwise("20s")
)
df_age_grade.select(col("name"),col("age"),col("age_band")).show(5)

+----------+---+--------+
|      name|age|age_band|
+----------+---+--------+
|Employee_1| 28|     20s|
|Employee_2| 43|     40+|
|Employee_3| 50|     40+|
|Employee_4| 27|     20s|
|Employee_5| 43|     40+|
+----------+---+--------+
only showing top 5 rows



#### 실습 6-3: 급여 구간 분류
##### salary 기준으로 "salary_level" 컬럼을 추가하세요

In [21]:
df.withColumn(
    "salary_level",
    when(col("salary") >= 90000, "High")
    .when(col("salary") >= 60000, "Medium")
    .otherwise("Low")
).select("name", "salary", "salary_level").show(10)

+-----------+--------+------------+
|       name|  salary|salary_level|
+-----------+--------+------------+
| Employee_1| 92251.0|        High|
| Employee_2| 62662.0|      Medium|
| Employee_3| 48392.0|         Low|
| Employee_4| 70535.0|      Medium|
| Employee_5|118603.0|        High|
| Employee_6|    NULL|         Low|
| Employee_7|    NULL|         Low|
| Employee_8|    NULL|         Low|
| Employee_9|    NULL|         Low|
|Employee_10|    NULL|         Low|
+-----------+--------+------------+
only showing top 10 rows



#### Part 7: 정렬, 중복 제거, 제한

#### orderBy(): 정렬

In [None]:
# -----------------------------------------------------------------------------
# orderBy(): 정렬
# -----------------------------------------------------------------------------

# orderBy(): 지정 컬럼 기준 정렬
# - 기본: 오름차순 (ASC)
# - 내림차순: col("컬럼").desc()

# 급여 기준 오름차순 (기본)
df.orderBy("salary").select("name", "salary").show(5)

# 급여 기준 내림차순
df.orderBy(col("salary").desc()).select("name", "salary").show(5)

# 여러 컬럼 정렬: 부서 오름차순 → 급여 내림차순
df.orderBy(
    col("department").asc(),    # 부서 오름차순
    col("salary").desc()        # 급여 내림차순
).select("department", "name", "salary").show(10)

#### distinct(): 중복 제거

In [23]:
# -----------------------------------------------------------------------------
# distinct(): 중복 제거
# -----------------------------------------------------------------------------

# distinct(): 전체 행이 동일한 중복 제거
# 반환: 고유한 행만 포함된 DataFrame

# 부서 목록 (고유값)
departments = df.select("department").distinct()
print("=== 부서 목록 ===")
departments.show()

# dropDuplicates(): 특정 컬럼 기준 중복 제거
# 해당 컬럼 값이 같은 행 중 첫 번째만 유지
df.dropDuplicates(["department"]).select("department", "name").show()

=== 부서 목록 ===
+-----------+
| department|
+-----------+
|         HR|
|  Marketing|
|      Sales|
|Engineering|
|    Finance|
|       NULL|
+-----------+

+-----------+-----------+
| department|       name|
+-----------+-----------+
|       NULL|Employee_16|
|Engineering|Employee_24|
|    Finance| Employee_2|
|         HR| Employee_1|
|  Marketing| Employee_3|
|      Sales| Employee_6|
+-----------+-----------+



#### limit(): 상위 N개

In [24]:
# -----------------------------------------------------------------------------
# limit(): 상위 N개
# -----------------------------------------------------------------------------

# limit(n): 상위 n개 행만 반환
# 주의: 정렬 없이 limit만 쓰면 순서가 보장되지 않음

# 상위 5개
df.limit(5).show()

# 급여 상위 5명 (정렬 후 limit)
df.orderBy(col("salary").desc()).limit(5).select("name", "salary").show()

+------+----------+----------+--------+---+----------+
|emp_id|      name|department|  salary|age|is_manager|
+------+----------+----------+--------+---+----------+
|  E001|Employee_1|        HR| 92251.0| 28|     false|
|  E002|Employee_2|   Finance| 62662.0| 43|     false|
|  E003|Employee_3| Marketing| 48392.0| 50|     false|
|  E004|Employee_4|   Finance| 70535.0| 27|     false|
|  E005|Employee_5|   Finance|118603.0| 43|     false|
+------+----------+----------+--------+---+----------+

+-----------+--------+
|       name|  salary|
+-----------+--------+
|Employee_15|119309.0|
| Employee_5|118603.0|
|Employee_45|115766.0|
|Employee_31|115450.0|
|Employee_63|113530.0|
+-----------+--------+



In [26]:
# -----------------------------------------------------------------------------
# 실무 패턴: Top N 뽑기
# -----------------------------------------------------------------------------

# 부서별 최고 연봉자 1명씩 (간단 버전)
# 실제로는 Window 함수 사용이 더 정확함 (4교시에서 다룸)

# 정렬 후 부서별 첫 번째 행만
top_by_dept = (
    df.orderBy(col("salary").desc())
    .dropDuplicates(["department"])
    .select("department", "name", "salary")
    .orderBy("department")
)
print("=== 부서별 최고 연봉자 (간단 버전) ===")
top_by_dept.show()

=== 부서별 최고 연봉자 (간단 버전) ===
+-----------+-----------+--------+
| department|       name|  salary|
+-----------+-----------+--------+
|       NULL|Employee_17| 92992.0|
|Engineering|Employee_25|111910.0|
|    Finance| Employee_5|118603.0|
|         HR|Employee_15|119309.0|
|  Marketing|Employee_31|115450.0|
|      Sales|Employee_45|115766.0|
+-----------+-----------+--------+



####  실습 7-1: 정렬
##### salary 기준 내림차순으로 정렬하여 상위 10명을 조회하세요.

In [28]:
df.orderBy(col("salary").desc()).limit(10).show()

+------+-----------+-----------+--------+---+----------+
|emp_id|       name| department|  salary|age|is_manager|
+------+-----------+-----------+--------+---+----------+
|  E015|Employee_15|         HR|119309.0| 25|      true|
|  E005| Employee_5|    Finance|118603.0| 43|     false|
|  E045|Employee_45|      Sales|115766.0| 27|     false|
|  E031|Employee_31|  Marketing|115450.0| 44|     false|
|  E063|Employee_63|    Finance|113530.0| 44|      true|
|  E025|Employee_25|Engineering|111910.0| 31|     false|
|  E013|Employee_13|    Finance|110592.0| 42|     false|
|  E084|Employee_84|    Finance|110467.0| 25|      true|
|  E038|Employee_38|    Finance|109163.0| 36|     false|
|  E065|Employee_65|      Sales|108840.0| 27|     false|
+------+-----------+-----------+--------+---+----------+



#### 실습 7-2: 다중 정렬
##### department 오름차순, age 내림차순으로 정렬하세요.

In [31]:
df.orderBy(
    col("department").asc(),    #  오름차순
    col("age").desc()        # 내림차순
).select("name","department", "age").show(10)

+------------+-----------+---+
|        name| department|age|
+------------+-----------+---+
| Employee_17|       NULL| 52|
| Employee_18|       NULL| 49|
| Employee_19|       NULL| 49|
| Employee_16|       NULL| 35|
| Employee_71|Engineering| 53|
| Employee_42|Engineering| 47|
|Employee_100|Engineering| 47|
| Employee_86|Engineering| 45|
| Employee_56|Engineering| 44|
| Employee_88|Engineering| 44|
+------------+-----------+---+
only showing top 10 rows



#### 실습 7-3: 중복 제거
##### 고유한 department 목록을 조회하세요.

In [32]:
departments = df.select("department").distinct()
print("=== 부서 목록 ===")
departments.show()

=== 부서 목록 ===
+-----------+
| department|
+-----------+
|         HR|
|  Marketing|
|      Sales|
|Engineering|
|    Finance|
|       NULL|
+-----------+



#### 실습 7-4: 연봉 상위 3명
##### 급여가 가장 높은 직원 3명의 이름과 급여를 조회하세요.

In [33]:
df.orderBy(col("salary").desc()).limit(3).select("name", "salary").show()

+-----------+--------+
|       name|  salary|
+-----------+--------+
|Employee_15|119309.0|
| Employee_5|118603.0|
|Employee_45|115766.0|
+-----------+--------+



#### 과제: 인사팀 월간 리포트 생성

##### Step 1: 컬럼 선택 및 이름 정리

In [51]:
result = df.select(
    col("emp_id").alias("사원번호"),                                    
    col("name").alias("이름"),                                  
    col("department").alias("부서"),  
    col("salary").alias("연봉"),
    col("age").alias("나이")  
)
result.show(5)

+--------+----------+---------+--------+----+
|사원번호|      이름|     부서|    연봉|나이|
+--------+----------+---------+--------+----+
|    E001|Employee_1|       HR| 92251.0|  28|
|    E002|Employee_2|  Finance| 62662.0|  43|
|    E003|Employee_3|Marketing| 48392.0|  50|
|    E004|Employee_4|  Finance| 70535.0|  27|
|    E005|Employee_5|  Finance|118603.0|  43|
+--------+----------+---------+--------+----+
only showing top 5 rows



#### Step 2: 파생 컬럼 추가

In [52]:
new_result = (result.withColumn(
    "급여등급",
    when(col("연봉") >= 100000, "S")
    .when(col("연봉") >= 80000, "A")
    .when(col("연봉") >= 60000, "B")
    .otherwise("C")
)
.withColumn("세후연봉", (col("연봉") * 0.8).cast("int"))
.withColumn("월급", (col("연봉") / 12).cast("int"))
)
new_result.show()

+--------+-----------+---------+--------+----+--------+--------+----+
|사원번호|       이름|     부서|    연봉|나이|급여등급|세후연봉|월급|
+--------+-----------+---------+--------+----+--------+--------+----+
|    E001| Employee_1|       HR| 92251.0|  28|       A|   73800|7687|
|    E002| Employee_2|  Finance| 62662.0|  43|       B|   50129|5221|
|    E003| Employee_3|Marketing| 48392.0|  50|       C|   38713|4032|
|    E004| Employee_4|  Finance| 70535.0|  27|       B|   56428|5877|
|    E005| Employee_5|  Finance|118603.0|  43|       S|   94882|9883|
|    E006| Employee_6|    Sales|    NULL|  44|       C|    NULL|NULL|
|    E007| Employee_7|Marketing|    NULL|  31|       C|    NULL|NULL|
|    E008| Employee_8|Marketing|    NULL|  44|       C|    NULL|NULL|
|    E009| Employee_9|Marketing|    NULL|  33|       C|    NULL|NULL|
|    E010|Employee_10|  Finance|    NULL|  25|       C|    NULL|NULL|
|    E011|Employee_11|       HR|    NULL|  32|       C|    NULL|NULL|
|    E012|Employee_12|Marketing| 64538.0| 

#### Step 3: 고성과자 필터링

In [53]:
result_fliter = new_result.filter((col("급여등급").isin("S", "A")) &(col("나이") >= 35 ).isNotNull())

result_fliter.show()
print(f"고성과자 수: {result_fliter.count()}명")

+--------+-----------+-----------+--------+----+--------+--------+----+
|사원번호|       이름|       부서|    연봉|나이|급여등급|세후연봉|월급|
+--------+-----------+-----------+--------+----+--------+--------+----+
|    E001| Employee_1|         HR| 92251.0|  28|       A|   73800|7687|
|    E005| Employee_5|    Finance|118603.0|  43|       S|   94882|9883|
|    E013|Employee_13|    Finance|110592.0|  42|       S|   88473|9216|
|    E015|Employee_15|         HR|119309.0|  25|       S|   95447|9942|
|    E017|Employee_17|       NULL| 92992.0|  52|       A|   74393|7749|
|    E021|Employee_21|      Sales| 90636.0|  47|       A|   72508|7553|
|    E022|Employee_22|    Finance| 90015.0|  54|       A|   72012|7501|
|    E023|Employee_23|         HR| 94268.0|  34|       A|   75414|7855|
|    E025|Employee_25|Engineering|111910.0|  31|       S|   89528|9325|
|    E026|Employee_26|  Marketing| 96044.0|  52|       A|   76835|8003|
|    E027|Employee_27|  Marketing|107214.0|  40|       S|   85771|8934|
|    E029|Empl

   #### Step 4: 정렬 및 최종 정리

In [54]:
final_result = (result_fliter.orderBy(
    col("연봉").desc(),    # 부서 오름차순
    col("이름").asc()        
).limit(20)
.drop("나이")
 )
final_result.show()

+--------+-----------+-----------+--------+--------+--------+----+
|사원번호|       이름|       부서|    연봉|급여등급|세후연봉|월급|
+--------+-----------+-----------+--------+--------+--------+----+
|    E015|Employee_15|         HR|119309.0|       S|   95447|9942|
|    E005| Employee_5|    Finance|118603.0|       S|   94882|9883|
|    E045|Employee_45|      Sales|115766.0|       S|   92612|9647|
|    E031|Employee_31|  Marketing|115450.0|       S|   92360|9620|
|    E063|Employee_63|    Finance|113530.0|       S|   90824|9460|
|    E025|Employee_25|Engineering|111910.0|       S|   89528|9325|
|    E013|Employee_13|    Finance|110592.0|       S|   88473|9216|
|    E084|Employee_84|    Finance|110467.0|       S|   88373|9205|
|    E038|Employee_38|    Finance|109163.0|       S|   87330|9096|
|    E065|Employee_65|      Sales|108840.0|       S|   87072|9070|
|    E089|Employee_89|         HR|107863.0|       S|   86290|8988|
|    E027|Employee_27|  Marketing|107214.0|       S|   85771|8934|
|    E074|Emplo

#### 최종 과제: 전체 파이프라인 완성

In [55]:
df_report = (
    df
    # Step 1: 컬럼 선택 및 이름 정리
    .select(
        col("emp_id").alias("사원번호"),
        col("name").alias("이름"),
        col("department").alias("부서"),
        col("salary").alias("연봉"),
        col("age").alias("나이")
    )
    # Step 2: 파생 컬럼 추가
    .withColumn(
        "급여등급",
        when(col("연봉") >= 100000, "S")
        .when(col("연봉") >= 80000, "A")
        .when(col("연봉") >= 60000, "B")
        .otherwise("C")
    )
    .withColumn("세후연봉", (col("연봉") * 0.8).cast("int"))
    .withColumn("월급", (col("연봉") / 12).cast("int"))
    # Step 3: 필터링
    .filter(
        (col("급여등급").isin("S", "A")) &
        (col("나이") >= 35) &
        (col("부서").isNotNull())
    )
    # Step 4: 정렬 및 정리
    .orderBy(col("연봉").desc(), col("이름").asc())
    .limit(20)
    .drop("나이")
)

print("=== 최종 인사 리포트 ===")
df_report.show()
print(f"리포트 대상 인원: {df_report.count()}명")

=== 최종 인사 리포트 ===
+--------+------------+-----------+--------+--------+--------+----+
|사원번호|        이름|       부서|    연봉|급여등급|세후연봉|월급|
+--------+------------+-----------+--------+--------+--------+----+
|    E005|  Employee_5|    Finance|118603.0|       S|   94882|9883|
|    E031| Employee_31|  Marketing|115450.0|       S|   92360|9620|
|    E063| Employee_63|    Finance|113530.0|       S|   90824|9460|
|    E013| Employee_13|    Finance|110592.0|       S|   88473|9216|
|    E038| Employee_38|    Finance|109163.0|       S|   87330|9096|
|    E089| Employee_89|         HR|107863.0|       S|   86290|8988|
|    E027| Employee_27|  Marketing|107214.0|       S|   85771|8934|
|    E074| Employee_74|      Sales|107172.0|       S|   85737|8931|
|    E051| Employee_51|         HR|106842.0|       S|   85473|8903|
|    E072| Employee_72|    Finance|105318.0|       S|   84254|8776|
|    E030| Employee_30|         HR|102623.0|       S|   82098|8551|
|    E070| Employee_70|         HR|102003.0|      