In [13]:
# -----------------------------------------------------------------------------
# 환경 설정: PySpark 및 데이터 로드
# -----------------------------------------------------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, lit, when, count, sum, avg, min, max,
    round as spark_round, expr,
    upper, lower, trim, concat, concat_ws, substring, length,
    to_date, year, month, dayofmonth, datediff, current_date,
    row_number, rank, dense_rank, lag, lead,
    coalesce, countDistinct, first, last,mean
)
from pyspark.sql.window import Window
import seaborn as sns
import pandas as pd

# Create (or reuse) the SparkSession: local master, 10 shuffle partitions.
spark = (
    SparkSession.builder
    .appName("PySpark-DataFrame-Exercise")
    .master("local[*]")
    .config("spark.sql.shuffle.partitions", 10)
    .getOrCreate()
)

# -----------------------------------------------------------------------------
# Load real data: datasets bundled with Seaborn
# -----------------------------------------------------------------------------

# Tips dataset: restaurant tipping records
tips_pdf = sns.load_dataset("tips")
df_tips = spark.createDataFrame(tips_pdf)

# Titanic dataset: passenger survival records
# NOTE(review): the preview printed below shows a literal NaN in `deck`, so
# pandas missing values do not appear to arrive as Spark nulls - confirm before
# relying on isNull()/coalesce() for this DataFrame.
titanic_pdf = sns.load_dataset("titanic")
df_titanic = spark.createDataFrame(titanic_pdf)

print("=" * 60)
print("데이터 로드 완료!")
print("=" * 60)

# For each dataset: heading, row count, column names, then a five-row preview.
for heading, sdf in (
    ("\n[Tips 데이터]", df_tips),
    ("\n[Titanic 데이터]", df_titanic),
):
    print(heading)
    print(f"- 행 수: {sdf.count()}")
    print(f"- 컬럼: {sdf.columns}")
    sdf.show(5)

데이터 로드 완료!

[Tips 데이터]
- 행 수: 244
- 컬럼: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']
+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows


[Titanic 데이터]
- 행 수: 891
- 컬럼: ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']
+--------+------+------+----+-----+-----+-------+--------+-----+-----+----------+----+-----------+-----+-----+
|survived|pclass|   sex| age|sibsp|parch|   fare|embarked|class|  who|adult_male|deck|embark_town|alive|alone|
+--------+------+------+----+-----+-----+

#### Part 1: 데이터 탐색

In [5]:
# Print each Tips column with its Spark type and nullability.
df_tips.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)



In [6]:
# Preview the first five rows of the Tips DataFrame.
df_tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



#### 문제 1-2: 기본 통계 확인

In [7]:
# Short alias: the exercise cells below reach the Tips DataFrame through `df`.
df = df_tips

In [24]:
from pyspark.sql.functions import (stddev, col)

In [22]:
# Name of the column summarized by the aggregation that follows.
col2 = "tip"

In [23]:
# One-row summary of the column named by `col2`: row count, mean, standard
# deviation, minimum and maximum.
# `min` / `max` here are the pyspark.sql.functions versions imported at the top
# of the notebook (they shadow the Python builtins of the same name).
df_stats = df.groupBy().agg(
    count("*").alias("팁갯수"),
    mean(col(col2)).alias("평균금액"),
    stddev(col(col2)).alias("팁표준편차"),
    min(col(col2)).alias("최소팁"),
    max(col(col2)).alias("최대팁"),
)

df_stats.show()

+------+------------------+------------------+------+------+
|팁갯수|          평균금액|        팁표준편차|최소팁|최대팁|
+------+------------------+------------------+------+------+
|   244|2.9982786885245893|1.3836381890011817|   1.0|  10.0|
+------+------------------+------------------+------+------+



In [20]:
# Built-in summary table: count, mean, stddev, min and max for every column.
df.describe().show()

+-------+------------------+------------------+------+------+----+------+------------------+
|summary|        total_bill|               tip|   sex|smoker| day|  time|              size|
+-------+------------------+------------------+------+------+----+------+------------------+
|  count|               244|               244|   244|   244| 244|   244|               244|
|   mean|19.785942622950817|2.9982786885245893|  NULL|  NULL|NULL|  NULL| 2.569672131147541|
| stddev| 8.902411954856857|1.3836381890011817|  NULL|  NULL|NULL|  NULL|0.9510998047322345|
|    min|              3.07|               1.0|Female|    No| Fri|Dinner|                 1|
|    max|             50.81|              10.0|  Male|   Yes|Thur| Lunch|                 6|
+-------+------------------+------------------+------+------+----+------+------------------+



#### 문제 2-1: 컬럼 선택

In [26]:
# Project only the bill amount, the tip and the weekday, then print the
# default preview.
df.select(["total_bill", "tip", "day"]).show()

+----------+----+---+
|total_bill| tip|day|
+----------+----+---+
|     16.99|1.01|Sun|
|     10.34|1.66|Sun|
|     21.01| 3.5|Sun|
|     23.68|3.31|Sun|
|     24.59|3.61|Sun|
|     25.29|4.71|Sun|
|      8.77| 2.0|Sun|
|     26.88|3.12|Sun|
|     15.04|1.96|Sun|
|     14.78|3.23|Sun|
|     10.27|1.71|Sun|
|     35.26| 5.0|Sun|
|     15.42|1.57|Sun|
|     18.43| 3.0|Sun|
|     14.83|3.02|Sun|
|     21.58|3.92|Sun|
|     10.33|1.67|Sun|
|     16.29|3.71|Sun|
|     16.97| 3.5|Sun|
|     20.65|3.35|Sat|
+----------+----+---+
only showing top 20 rows



#### 문제 2-2: 새 컬럼 추가 (팁 비율)

In [28]:
# Add `tip_rate` = tip / total_bill, rounded to two decimal places.
# DataFrame.show() only prints and returns None, so it must not be chained onto
# the assignment: doing so would bind `df_with_monthly` to None instead of the
# DataFrame. The variable name is kept as-is so other cells that reference it
# keep working.
df_with_monthly = df.withColumn(
    "tip_rate",                                         # new column name
    spark_round(col("tip") / col("total_bill"), 2),     # tip as a share of the bill
)
df_with_monthly.show(4)

+----------+----+------+------+---+------+----+--------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_rate|
+----------+----+------+------+---+------+----+--------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|    0.06|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|    0.16|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|    0.17|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|    0.14|
+----------+----+------+------+---+------+----+--------+
only showing top 4 rows



#### 문제 2-3: 컬럼 이름 변경

In [30]:
# Rename two columns by chaining withColumnRenamed (one call per column).
df_multi_renamed = (
    df
    .withColumnRenamed("total_bill", "bill_amount")
    .withColumnRenamed("tip", "tip_amount")
)
# Both names refer to the same renamed DataFrame.
df_renamed = df_multi_renamed

df_renamed.show()

+-----------+----------+------+------+---+------+----+
|bill_amount|tip_amount|   sex|smoker|day|  time|size|
+-----------+----------+------+------+---+------+----+
|      16.99|      1.01|Female|    No|Sun|Dinner|   2|
|      10.34|      1.66|  Male|    No|Sun|Dinner|   3|
|      21.01|       3.5|  Male|    No|Sun|Dinner|   3|
|      23.68|      3.31|  Male|    No|Sun|Dinner|   2|
|      24.59|      3.61|Female|    No|Sun|Dinner|   4|
|      25.29|      4.71|  Male|    No|Sun|Dinner|   4|
|       8.77|       2.0|  Male|    No|Sun|Dinner|   2|
|      26.88|      3.12|  Male|    No|Sun|Dinner|   4|
|      15.04|      1.96|  Male|    No|Sun|Dinner|   2|
|      14.78|      3.23|  Male|    No|Sun|Dinner|   2|
|      10.27|      1.71|  Male|    No|Sun|Dinner|   2|
|      35.26|       5.0|Female|    No|Sun|Dinner|   4|
|      15.42|      1.57|  Male|    No|Sun|Dinner|   2|
|      18.43|       3.0|  Male|    No|Sun|Dinner|   4|
|      14.83|      3.02|Female|    No|Sun|Dinner|   2|
|      21.

#### 문제 2-4: 상수 컬럼 추가

In [33]:
# Append a constant column: lit() turns the Python string into a Column, and
# selecting "*" first keeps every existing column ahead of the new one.
df_currency = df.select("*", lit("USD").alias("currency"))
df_currency.show(n=4)

+----------+----+------+------+---+------+----+--------+
|total_bill| tip|   sex|smoker|day|  time|size|currency|
+----------+----+------+------+---+------+----+--------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|     USD|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|     USD|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|     USD|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|     USD|
+----------+----+------+------+---+------+----+--------+
only showing top 4 rows



#### 문제 3-1: 단순 필터링

In [34]:
# Keep only the bills of 20 or more (where() is an alias of filter()).
df_total_bill = df.where(df["total_bill"] >= 20)
df_total_bill.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



#### 문제 3-2: AND 조건 필터링

In [35]:
# AND filter: the bill is at least 20 and the tip is at least 3.
# Each comparison needs its own parentheses because `&` binds tighter than `>=`.
df_total_bill_th = df.where(
    (df["total_bill"] >= 20)
    & (df["tip"] >= 3)
)
df_total_bill_th.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



#### 문제 3-3: OR 조건 필터링

In [36]:
# Quick three-row look at the data before writing the OR filter.
df.show(3)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
+----------+----+------+------+---+------+----+
only showing top 3 rows



In [None]:
# OR filter: keep the rows recorded on Saturday or on Sunday.
filtered_day = df.where(
    (df["day"] == "Sat")
    | (df["day"] == "Sun")
)
filtered_day.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     20.65|3.35|  Male|    No|Sat|Dinner|   3|
|     17.92|4.08|  Male|    No|Sat|Dinner|   2|
|     20.29|2.75|Female|    No|Sat|Dinner|   2|
|     15.77|2.23|Female|    No|Sat|Dinner|   2|
|     39.42|7.58|  Male|    No|Sat|Dinner|   4|
|     19.82|3.18|  Male|    No|Sat|Dinner|   2|
|     17.81|2.34|  Male|    No|Sat|Dinner|   4|
|     13.37| 2.0|  Male|    No|Sat|Dinner|   2|
|     12.69| 2.0|  Male|    No|Sat|Dinner|   2|
|      21.7| 4.3|  Male|    No|Sat|Dinner|   2|
|     19.65| 3.0|Female|    No|Sat|Dinner|   2|
|      9.55|1.45|  Male|    No|Sat|Dinner|   2|
|     18.35| 2.5|  Male|    No|Sat|Dinner|   4|
|     15.06| 3.0|Female|    No|Sat|Dinner|   2|
|     20.69|2.45|Female|    No|Sat|Dinner|   4|
|     17.78|3.27|  Male|    No|Sat|Dinner|   2|
|     24.06| 3.6|  Male|    No|Sat|Dinner|   3|
|     16.31| 2.0|  Male|    No|Sat|Dinne

#### 문제 3-4: 문자열 필터링

In [44]:
# String filter: passengers whose embarkation town begins with "South".
start_south = df_titanic.where(
    df_titanic["embark_town"].startswith("South")
)
start_south.show(n=5)

+--------+------+------+----+-----+-----+-------+--------+-----+-----+----------+----+-----------+-----+-----+
|survived|pclass|   sex| age|sibsp|parch|   fare|embarked|class|  who|adult_male|deck|embark_town|alive|alone|
+--------+------+------+----+-----+-----+-------+--------+-----+-----+----------+----+-----------+-----+-----+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|Third|  man|      true| NaN|Southampton|   no|false|
|       1|     3|female|26.0|    0|    0|  7.925|       S|Third|woman|     false| NaN|Southampton|  yes| true|
|       1|     1|female|35.0|    1|    0|   53.1|       S|First|woman|     false|   C|Southampton|  yes|false|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|Third|  man|      true| NaN|Southampton|   no| true|
|       0|     1|  male|54.0|    0|    0|51.8625|       S|First|  man|      true|   E|Southampton|   no| true|
+--------+------+------+----+-----+-----+-------+--------+-----+-----+----------+----+-----------+-----+-----+
o

#### 문제 4-1: 단순 조건 분기

In [None]:
# Two-way branch: label a row "Good" when the tip is at least 5, otherwise
# "Normal". The predicate is defined once and reused for the preview filter.
is_good_tip = col("tip") >= 5

df_tip_quality = df.withColumn(
    "tip_quality",
    when(is_good_tip, "Good").otherwise("Normal"),
)

# Preview rows matching the same predicate to check the "Good" label.
df_tip_quality.filter(is_good_tip).show(5)

+----------+----+------+------+---+------+----+-----------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_quality|
+----------+----+------+------+---+------+----+-----------+
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|       Good|
|     39.42|7.58|  Male|    No|Sat|Dinner|   4|       Good|
|     31.27| 5.0|  Male|    No|Sat|Dinner|   3|       Good|
|      30.4| 5.6|  Male|    No|Sun|Dinner|   4|       Good|
|     22.23| 5.0|  Male|    No|Sun|Dinner|   2|       Good|
+----------+----+------+------+---+------+----+-----------+
only showing top 5 rows



#### 문제 4-2: 다중 조건 분기

In [48]:
# Three-way branch on the bill amount: "High" from 30, "Medium" from 20,
# "Low" for everything else. The >= 30 test is listed first, so a bill of 30 or
# more is labelled "High" and never reaches the "Medium" test.
bill = col("total_bill")
bill_category = (
    when(bill >= 30, "High")
    .when(bill >= 20, "Medium")
    .otherwise("Low")
)

df_bill_category = df.withColumn("bill_category", bill_category)
df_bill_category.show(n=5)

+----------+----+------+------+---+------+----+-------------+
|total_bill| tip|   sex|smoker|day|  time|size|bill_category|
+----------+----+------+------+---+------+----+-------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|          Low|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|          Low|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|       Medium|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|       Medium|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|       Medium|
+----------+----+------+------+---+------+----+-------------+
only showing top 5 rows



#### 문제 4-3: 복합 조건 분기

In [51]:
# Priority label, tested in this order:
#   "Highest" - female passengers travelling in first class
#   "High"    - any other female passenger, or anyone with age below 12
#   "Normal"  - everyone else
# NOTE(review): a row whose age is missing never satisfies `age < 12`; confirm
# that sending those passengers to the remaining branches is intended.
is_first_class = col("pclass") == 1
is_female = col("sex") == "female"
is_child = col("age") < 12

priority = (
    when(is_first_class & is_female, "Highest")
    .when(is_female, "High")
    .when(is_child, "High")
    .otherwise("Normal")
)

df_priority = df_titanic.withColumn("priority", priority)
df_priority.show(n=4)

# Survival rate per priority group. `sum` is the Spark aggregate imported at the
# top of the notebook, not the Python builtin.
survival_by_priority = df_priority.groupBy("priority").agg(
    count("*").alias("total"),
    sum("survived").alias("survived"),
    spark_round(avg("survived"), 2).alias("survival_rate"),
)
survival_by_priority.show()

+--------+------+------+----+-----+-----+-------+--------+-----+-----+----------+----+-----------+-----+-----+--------+
|survived|pclass|   sex| age|sibsp|parch|   fare|embarked|class|  who|adult_male|deck|embark_town|alive|alone|priority|
+--------+------+------+----+-----+-----+-------+--------+-----+-----+----------+----+-----------+-----+-----+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|Third|  man|      true| NaN|Southampton|   no|false|  Normal|
|       1|     1|female|38.0|    1|    0|71.2833|       C|First|woman|     false|   C|  Cherbourg|  yes|false| Highest|
|       1|     3|female|26.0|    0|    0|  7.925|       S|Third|woman|     false| NaN|Southampton|  yes| true|    High|
|       1|     1|female|35.0|    1|    0|   53.1|       S|First|woman|     false|   C|Southampton|  yes|false| Highest|
+--------+------+------+----+-----+-----+-------+--------+-----+-----+----------+----+-----------+-----+-----+--------+
only showing top 4 rows

+--------+-----

#### 문제 5-1: 정렬