# Spark를 활용한 Parquet 파일 살펴보기

## 1. Parquet 파일 불러오기

In [7]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import pyarrow as pa
import pyarrow.parquet as pq

from pyspark.sql import SparkSession
from pyspark.sql.functions import first, col, count, sum, round, countDistinct, lit, coalesce, row_number
from pyspark.sql.dataframe import DataFrame as SparkDataFrame
from pyspark.sql import functions as F
from pyspark.sql.window import Window

import warnings
warnings.filterwarnings(action='ignore')
pd.set_option("display.max_columns", None)

In [2]:
# SparkSession 생성
spark = SparkSession.builder \
    .appName("ParquetReader") \
    .getOrCreate()

In [3]:
output_folder = os.path.join("data", "parquet_data")

parquet_df = spark.read.parquet(os.path.join(output_folder, "total_merged.parquet"))
parquet_df.show(5)

+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+--------------+--------------+-----------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|event_time_ymd|event_time_hms|source_file|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+--------------+--------------+-----------+
|2019-11-17 17:43:00|      view|   2501799|2053013564003713919|appliances.kitche...|elenberg|  46.31|563237118|4368d099-6d19-47c...|    2019-11-17|      17:43:00|   2019_nov|
|2019-11-17 17:43:00|      view|   6400335|2053013554121933129|computers.compone...|   intel| 435.28|551129779|4db2c365-ee85-443...|    2019-11-17|      17:43:00|   2019_nov|
|2019-11-17 17:43:00|      view|   3701538|2053013565983425517|appliances.enviro...|  irobot|1878.81|539845715|bf7d95c0-69e1-

## 2. 데이터 확인

In [4]:
null_counts = parquet_df.agg(
    *[sum(col(c).isNull().cast("int")).alias(c) for c in parquet_df.columns]
)
null_counts.show()

+----------+----------+----------+-----------+-------------+--------+-----+-------+------------+--------------+--------------+-----------+
|event_time|event_type|product_id|category_id|category_code|   brand|price|user_id|user_session|event_time_ymd|event_time_hms|source_file|
+----------+----------+----------+-----------+-------------+--------+-----+-------+------------+--------------+--------------+-----------+
|         0|         0|         0|          0|     65171763|55670767|    0|      0|         226|             0|             0|          0|
+----------+----------+----------+-----------+-------------+--------+-----+-------+------------+--------------+--------------+-----------+



In [4]:
# null_counts = parquet_df.select(
#     [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in parquet_df.columns]
# )
# null_counts.show()

In [15]:
# 행의 개수
num_rows = parquet_df.count()

# 열의 개수: parquet_df.columns 리스트의 길이 계산
num_columns = len(parquet_df.columns)

print("DataFrame shape:", (num_rows, num_columns))

DataFrame shape: (411709736, 12)


In [24]:
(65171763 / 411709736) * 100

15.829541373779902

In [6]:
parquet_df.filter(parquet_df.brand == "lucente").show()

+-------------------+----------+----------+-------------------+-------------+-------+------+---------+--------------------+--------------+--------------+-----------+
|         event_time|event_type|product_id|        category_id|category_code|  brand| price|  user_id|        user_session|event_time_ymd|event_time_hms|source_file|
+-------------------+----------+----------+-------------------+-------------+-------+------+---------+--------------------+--------------+--------------+-----------+
|2019-11-17 17:43:00|      view|  26400266|2053013563651392361|         NULL|lucente|119.18|572211322|8e6c63f8-7f34-48b...|    2019-11-17|      17:43:00|   2019_nov|
|2019-11-17 17:43:00|      view|  26203739|2053013563693335403|         NULL|lucente|158.56|527083517|e6bf2cdb-778f-44a...|    2019-11-17|      17:43:00|   2019_nov|
|2019-11-17 17:43:01|      cart|  26500142|2053013563550729061|         NULL|lucente|234.76|514929163|f77c9416-abd5-47a...|    2019-11-17|      17:43:01|   2019_nov|
|201

In [None]:
parquet_df.filter((parquet_df.brand == "lucente") & (parquet_df.category_code.isNull())).show()

+-------------------+----------+----------+-------------------+-------------+-------+------+---------+--------------------+--------------+--------------+-----------+
|         event_time|event_type|product_id|        category_id|category_code|  brand| price|  user_id|        user_session|event_time_ymd|event_time_hms|source_file|
+-------------------+----------+----------+-------------------+-------------+-------+------+---------+--------------------+--------------+--------------+-----------+
|2019-11-17 17:43:00|      view|  26400266|2053013563651392361|         NULL|lucente|119.18|572211322|8e6c63f8-7f34-48b...|    2019-11-17|      17:43:00|   2019_nov|
|2019-11-17 17:43:00|      view|  26203739|2053013563693335403|         NULL|lucente|158.56|527083517|e6bf2cdb-778f-44a...|    2019-11-17|      17:43:00|   2019_nov|
|2019-11-17 17:43:01|      cart|  26500142|2053013563550729061|         NULL|lucente|234.76|514929163|f77c9416-abd5-47a...|    2019-11-17|      17:43:01|   2019_nov|
|201

In [11]:
parquet_df.filter((parquet_df.brand == "lucente") & (parquet_df.category_code.isNotNull())).show()

+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+--------------+--------------+-----------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand| price|  user_id|        user_session|event_time_ymd|event_time_hms|source_file|
+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+--------------+--------------+-----------+
|2019-12-18 18:58:38|      view|  26400647|2053013553056579841|computers.periphe...|lucente|309.66|588751816|fd905b3f-e928-479...|    2019-12-18|      18:58:38|   2019_dec|
|2019-12-18 18:58:39|      view|  26400266|2053013553056579841|computers.periphe...|lucente|115.83|518170648|1eb255a5-375e-4c5...|    2019-12-18|      18:58:39|   2019_dec|
|2019-12-18 18:58:39|      view|  26500142|2053013553140465927|           kids.toys|lucente|231.15|518868979|86b9a7d4-835b-4f5...|    2

In [18]:
parquet_df.filter((parquet_df.category_id == 2053013563550729061) & (parquet_df.category_code.isNotNull())).show()

+-------------------+----------+----------+-------------------+-------------+------------+------+---------+--------------------+--------------+--------------+-----------+
|         event_time|event_type|product_id|        category_id|category_code|       brand| price|  user_id|        user_session|event_time_ymd|event_time_hms|source_file|
+-------------------+----------+----------+-------------------+-------------+------------+------+---------+--------------------+--------------+--------------+-----------+
|2019-12-18 19:00:21|      view|  26500624|2053013563550729061|    sport.ski|        NULL|424.32|551283194|c0610e1a-8e32-451...|    2019-12-18|      19:00:21|   2019_dec|
|2019-12-18 19:01:25|      view|  26500661|2053013563550729061|    sport.ski|     lucente|103.22|525126283|8ad92c9b-2540-41f...|    2019-12-18|      19:01:25|   2019_dec|
|2019-12-18 19:01:31|      view|  26500113|2053013563550729061|    sport.ski|     lucente| 56.63|525126283|8ad92c9b-2540-41f...|    2019-12-18|  

In [19]:
parquet_df.filter((parquet_df.category_id == 2053013563550729061) & (parquet_df.category_code.isNull())).show()

+-------------------+----------+----------+-------------------+-------------+-------+------+---------+--------------------+--------------+--------------+-----------+
|         event_time|event_type|product_id|        category_id|category_code|  brand| price|  user_id|        user_session|event_time_ymd|event_time_hms|source_file|
+-------------------+----------+----------+-------------------+-------------+-------+------+---------+--------------------+--------------+--------------+-----------+
|2019-11-17 17:43:01|      cart|  26500142|2053013563550729061|         NULL|lucente|234.76|514929163|f77c9416-abd5-47a...|    2019-11-17|      17:43:01|   2019_nov|
|2019-11-17 17:43:04|      view|  26500148|2053013563550729061|         NULL|lucente| 293.7|533966254|3de1ea8c-22f1-4fa...|    2019-11-17|      17:43:04|   2019_nov|
|2019-11-17 17:43:11|      view|  26500136|2053013563550729061|         NULL|lucente| 91.38|527796278|2df96f32-05b4-4e6...|    2019-11-17|      17:43:11|   2019_nov|
|201

In [22]:
# 특정 category_id 데이터 필터링 후 브랜드별 개수 계산
filtered_counts = (
    parquet_df
    .filter(col("category_id") == 2053013563550729061)  # 특정 category_id 필터링
    .groupBy("brand")  # brand별 그룹화
    .agg(count("*").alias("count"))  # 개수 계산
)

# 전체 개수 계산 (각 브랜드 개수의 총합)
total_count = filtered_counts.agg(sum("count").alias("total")).collect()[0]["total"]

# 각 브랜드별 비율 추가
filtered_counts = (
    filtered_counts
    .withColumn("percent", round((col("count") / total_count) * 100, 2))  # 비율 계산 및 반올림
    .orderBy(col("count").desc())  # 개수 기준 내림차순 정렬
)

# 결과 출력
filtered_counts.show()

+------------------+------+-------+
|             brand| count|percent|
+------------------+------+-------+
|           lucente|284181|  62.16|
|              NULL| 68704|  15.03|
|              vega| 31259|   6.84|
|              jade| 24155|   5.28|
|           sokolov| 13185|   2.88|
|        trollbeads|  9597|    2.1|
|             alkor|  6004|   1.31|
|      robertobravo|  5130|   1.12|
|          dinastia|  3572|   0.78|
|             teosa|  3502|   0.77|
|            adamas|  2681|   0.59|
|          merelani|  1528|   0.33|
|             elite|  1293|   0.28|
|             riche|   906|    0.2|
|        aquamarine|   258|   0.06|
|          babyline|   250|   0.05|
|likatoprofessional|   229|   0.05|
|        xjewellery|   154|   0.03|
|               qvs|   124|   0.03|
|     lucentesilver|   114|   0.02|
+------------------+------+-------+
only showing top 20 rows



In [7]:
# 특정 category_id 데이터 필터링 후 브랜드별 개수 계산
filtered_counts = (
    parquet_df
    .filter(col("category_id") == 2053013563550729061)  # 특정 category_id 필터링
    .groupBy("category_code")  # brand별 그룹화
    .agg(count("*").alias("count"))  # 개수 계산
)

# 전체 개수 계산 (각 브랜드 개수의 총합)
total_count = filtered_counts.agg(sum("count").alias("total")).collect()[0]["total"]

# 각 브랜드별 비율 추가
filtered_counts = (
    filtered_counts
    .withColumn("percent", round((col("count") / total_count) * 100, 2))  # 비율 계산 및 반올림
    .orderBy(col("count").desc())  # 개수 기준 내림차순 정렬
)

# 결과 출력
filtered_counts.show()

+-------------+------+-------+
|category_code| count|percent|
+-------------+------+-------+
|         NULL|393632|   86.1|
|    sport.ski| 63530|   13.9|
+-------------+------+-------+



## 3. 결측치 처리

### 3-1. category_code
- `category_id`는 존재하지만, `category_code`는 결측치인 경우
- `category_id`는 하나의 `category_code`에 부여된다
    - 동일한 `category_id`인데 `category_code`에 값이 들어가 있는 경우, 해당 값으로 결측치 대체

In [None]:
# category_id 별로 category_code를 count
# Null 값을 포함하여 counting 한다 (coalesce() 사용)
# e.g. category_code = NULL, sport.ski → 2개
category_code_counts = (
    parquet_df
    .groupBy("category_id")
    .agg(countDistinct(coalesce(col("category_code"), lit("NULL"))).alias("category_code_count"))
    .filter(col("category_code_count") > 2)  # category_code가 3개 이상인 경우만 필터링
)

category_code_counts.show()

# category_code가 3개 이상인 경우는 없다
# category_code는 최대 2개 존재 (원래 값, NULL)

+-----------+-------------------+
|category_id|category_code_count|
+-----------+-------------------+
+-----------+-------------------+



In [5]:
# 1. category_id별로 NULL이 아닌 첫 번째 category_code 값을 가져오기
filled_category_codes = (
    parquet_df
    .filter(col("category_code").isNotNull())  # NULL이 아닌 값만 사용
    .groupBy("category_id")
    .agg(first("category_code").alias("filled_category_code"))  # 첫 번째 category_code 값 추출
)

# 2️. 기존 데이터와 조인 후, NULL 값 채우기
parquet_filled_cc = (
    parquet_df
    .join(filled_category_codes, on="category_id", how="left")  # 동일한 category_id를 기준으로 병합
    .withColumn("category_code", coalesce(col("category_code"), col("filled_category_code")))  # NULL이면 대체
    .drop("filled_category_code")  # 불필요한 컬럼 제거
)

# 3️. 결과 확인
parquet_filled_cc.show()

+-------------------+-------------------+----------+----------+--------------------+---------+-------+---------+--------------------+--------------+--------------+-----------+
|        category_id|         event_time|event_type|product_id|       category_code|    brand|  price|  user_id|        user_session|event_time_ymd|event_time_hms|source_file|
+-------------------+-------------------+----------+----------+--------------------+---------+-------+---------+--------------------+--------------+--------------+-----------+
|2053013563944993659|2019-11-17 17:43:00|      view|   4600560|appliances.kitche...|     beko|  412.4|522329355|dce61941-af79-4fd...|    2019-11-17|      17:43:00|   2019_nov|
|2053013566209917945|2019-11-17 17:43:00|      view|  28401077|     accessories.bag|  respect|   39.9|512757661|4c6f8f63-a612-4c5...|    2019-11-17|      17:43:00|   2019_nov|
|2053013555631882655|2019-11-17 17:43:00|      view|   1004659|electronics.smart...|  samsung| 762.18|512965259|2981c9f9

In [5]:
null_counts_cc_filled = parquet_filled_cc.select(
    [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in parquet_filled.columns]
)
null_counts_cc_filled.show()

+-----------+----------+----------+----------+-------------+--------+-----+-------+------------+--------------+--------------+-----------+
|category_id|event_time|event_type|product_id|category_code|   brand|price|user_id|user_session|event_time_ymd|event_time_hms|source_file|
+-----------+----------+----------+----------+-------------+--------+-----+-------+------------+--------------+--------------+-----------+
|          0|         0|         0|         0|     42378518|55670767|    0|      0|         226|             0|             0|          0|
+-----------+----------+----------+----------+-------------+--------+-----+-------+------------+--------------+--------------+-----------+



### 3-2. Brand
- `product_id`는 존재하지만, `brand`는 결측치인 경우
- 동일한 `product_id`를 사용하지만, `brand`가 여러개인 경우가 있다
    - Null 값을 포함하여, brand가 3개이거나 4개인 `product_id`도 존재
- `product_id`별로 가장 많이 관찰되는 `brand`로 해당 `product_id`의 brand 결측치를 대체

In [5]:
# product_id 별로 brand를 count
# Null 값을 포함하여 counting 한다 (coalesce() 사용)
# e.g. brand = NULL, sport.ski → 2개
brand_counts = (
    parquet_df
    .groupBy("product_id")
    .agg(countDistinct(coalesce(col("brand"), lit("NULL"))).alias("brand_count"))
    .filter(col("brand_count") > 2)  # brand가 3개 이상인 경우만 필터링
    .orderBy(col("brand_count").desc())
)

brand_counts.show()

+----------+-----------+
|product_id|brand_count|
+----------+-----------+
|   7204010|          4|
|   5301778|          4|
| 100165803|          3|
|  27701269|          3|
|  26800040|          3|
|  17303216|          3|
| 100181762|          3|
| 100069005|          3|
| 100021095|          3|
|  13300909|          3|
|  26500351|          3|
|  17301178|          3|
| 100148888|          3|
|  34800369|          3|
| 100170517|          3|
|   4700562|          3|
|  10701054|          3|
|   6902632|          3|
|  51000033|          3|
|   1801294|          3|
+----------+-----------+
only showing top 20 rows



In [22]:
# 특정 product_id 데이터 필터링 후 브랜드별 개수 계산
filtered_counts_product = (
    parquet_df
    .filter(col("product_id") == 1000978)  # 특정 product_id 필터링
    .groupBy("brand")  # brand별 그룹화
    .agg(count("*").alias("count"))  # 개수 계산
)

# 전체 개수 계산 (각 브랜드 개수의 총합)
total_count = filtered_counts_product.agg(sum("count").alias("total")).collect()[0]["total"]

# 각 브랜드별 비율 추가
filtered_counts_product = (
    filtered_counts_product
    .withColumn("percent", round((col("count") / total_count) * 100, 2))  # 비율 계산 및 반올림
    .orderBy(col("count").desc())  # 개수 기준 내림차순 정렬
)

# 결과 출력
filtered_counts_product.show()

+-------+-----+-------+
|  brand|count|percent|
+-------+-----+-------+
|samsung|22484|  99.69|
|   NULL|   71|   0.31|
+-------+-----+-------+



In [6]:
# 1️. product_id별 brand 개수 세기
brand_counts = (
    parquet_filled_cc
    .groupBy("product_id", "brand")  # product_id, brand 기준으로 그룹화
    .agg(count("*").alias("brand_count"))  # 각 브랜드가 등장한 횟수 계산
    .orderBy(col("product_id"), col("brand_count").desc())  # 같은 product_id 내에서 count 기준 정렬
)

# 2️. product_id별 최빈 brand 선택
window_spec = Window.partitionBy("product_id").orderBy(col("brand_count").desc())

most_frequent_brands = (
    brand_counts
    .withColumn("rank", row_number().over(window_spec))  # 가장 많이 등장한 브랜드 찾기
    .filter(col("rank") == 1)  # 최빈 브랜드만 선택
    .drop("brand_count", "rank")  # 불필요한 컬럼 제거
    .withColumnRenamed("brand", "most_frequent_brand")  # 컬럼 이름 변경하여 충돌 방지
)

# 3️. 기존 데이터와 조인 후, NULL 값 채우기
parquet_filled_cc_br = (
    parquet_filled_cc
    .join(most_frequent_brands, on="product_id", how="left")  # product_id 기준으로 병합
    .withColumn("brand", coalesce(col("brand"), col("most_frequent_brand")))  # NULL 값 대체
    .drop("most_frequent_brand")  # 불필요한 컬럼 제거
)

# 4️⃣ 결과 확인
parquet_filled_cc_br.show()


+----------+-------------------+-------------------+----------+--------------------+--------+-------+---------+--------------------+--------------+--------------+-----------+
|product_id|        category_id|         event_time|event_type|       category_code|   brand|  price|  user_id|        user_session|event_time_ymd|event_time_hms|source_file|
+----------+-------------------+-------------------+----------+--------------------+--------+-------+---------+--------------------+--------------+--------------+-----------+
|   1005021|2053013555631882655|2019-11-17 17:43:00|      view|electronics.smart...|    oppo| 386.08|512887550|3c3af822-9816-434...|    2019-11-17|      17:43:00|   2019_nov|
|   5700981|2053013553970938175|2019-11-17 17:43:00|      view|auto.accessories....|  alpine| 875.18|558414772|14c5b3c8-3c0a-4bd...|    2019-11-17|      17:43:00|   2019_nov|
|   1004249|2053013555631882655|2019-11-17 17:43:00|  purchase|electronics.smart...|   apple| 765.79|562839858|98c3adb8-a028-

In [7]:
null_counts = parquet_filled_cc_br.agg(
    *[sum(col(c).isNull().cast("int")).alias(c) for c in parquet_filled_cc_br.columns]
)
null_counts.show()

+----------+-----------+----------+----------+-------------+--------+-----+-------+------------+--------------+--------------+-----------+
|product_id|category_id|event_time|event_type|category_code|   brand|price|user_id|user_session|event_time_ymd|event_time_hms|source_file|
+----------+-----------+----------+----------+-------------+--------+-----+-------+------------+--------------+--------------+-----------+
|         0|          0|         0|         0|     42378518|53236401|    0|      0|         226|             0|             0|          0|
+----------+-----------+----------+----------+-------------+--------+-----+-------+------------+--------------+--------------+-----------+



In [8]:
# 전체 데이터에서 category_code의 결측치 비율
(42378518 / 411709736) * 100

10.293299937896053

In [8]:
# 전체 데이터에서 brand의 결측치 비율
(53236401 / 411709736) * 100

12.930566451311707