# Spark를 활용한 ecommerce 유저 행동 데이터 parquet 파일 변환 및 저장
- [kaggle 데이터셋 링크](https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store)

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import pyarrow as pa
import pyarrow.parquet as pq

from pyspark.sql import SparkSession
# from pyspark.sql.functions import to_timestamp, date_format, col, unix_timestamp
from pyspark.sql.dataframe import DataFrame as SparkDataFrame
from pyspark.sql import functions as F
# from pyspark.sql.functions import lit

import warnings
warnings.filterwarnings(action='ignore')
pd.set_option("display.max_columns", None)

## 1. 데이터 불러오기

In [2]:
# List the entries of the raw-data folder; the bare name on the last line
# makes the notebook display the list as the cell output.
raw_files = os.listdir("data/raw_data")
raw_files

['2019-Dec.csv',
 '2019-Nov.csv',
 '2019-Oct.csv',
 '2020-Apr.csv',
 '2020-Feb.csv',
 '2020-Jan.csv',
 '2020-Mar.csv']

### 1-1. CSV 파일을 DataFrame dictionary로 불러오기

In [3]:
def load_csv_files_to_dict(spark, folder_path):
    """
    Read every CSV file in a folder into a Spark DataFrame.

    Parameters
    ----------
    spark : SparkSession
        Session used to read the files.
    folder_path : str
        Directory that contains the raw CSV files.

    Returns
    -------
    dict
        Maps a normalized name derived from the file name
        (e.g. "2019-Oct.csv" -> "2019_oct") to the DataFrame read from it.
        Entries are inserted in sorted file-name order.
    """
    # Keep only CSV files so stray directory entries (hidden files, checkpoint
    # folders, ...) are never handed to the CSV reader. Sorting makes the key
    # order deterministic, since os.listdir() returns entries in arbitrary order.
    csv_files = sorted(
        entry for entry in os.listdir(folder_path)
        if entry.lower().endswith(".csv")
    )

    csv_dict = {}

    for file_name in tqdm(csv_files, desc="Processing CSV files"):
        file_path = os.path.join(folder_path, file_name)

        # header: first line holds the column names.
        # inferSchema: let Spark guess column types (costs an extra pass over
        # the data according to the Spark CSV reader documentation).
        df = (
            spark.read
            .option("header", True)
            .option("inferSchema", True)
            .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
            .csv(file_path)
        )

        # Drop only the trailing extension, then normalize the name:
        # "2019-Oct.csv" -> "2019_oct".
        stem, _ = os.path.splitext(file_name)
        csv_dict[stem.replace("-", "_").lower()] = df

    return csv_dict

In [4]:
# Create (or reuse) the SparkSession
spark = SparkSession.builder \
    .appName("MultiCSVReader") \
    .getOrCreate()

# Show the session time zone before it is changed
print("Current TimeZone:", spark.conf.get("spark.sql.session.timeZone"))

# Set the SparkSession time zone to UTC (done before the CSV files are loaded)
spark.conf.set("spark.sql.session.timeZone", "UTC")

# Call the loader to build the dictionary of DataFrames
csv_dict = load_csv_files_to_dict(spark, "data/raw_data")

# Print the dictionary keys (names derived from the file names)
print("\nCSV 파일 Dictionary keys:", list(csv_dict.keys()))

Current TimeZone: Asia/Seoul


Processing CSV files: 100%|██████████| 7/7 [02:32<00:00, 21.85s/it]


CSV 파일 Dictionary keys: ['2019_dec', '2019_nov', '2019_oct', '2020_apr', '2020_feb', '2020_jan', '2020_mar']





In [5]:
# Confirm that the session time zone changed from Asia/Seoul to UTC
print("Current TimeZone:", spark.conf.get("spark.sql.session.timeZone"))

Current TimeZone: UTC


### 1-2. 불러온 데이터셋 확인

In [6]:
csv_dict["2019_oct"].printSchema()

root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [7]:
csv_dict["2019_oct"].show(3)

+--------------------+----------+----------+-------------------+--------------------+--------+-----+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|   brand|price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+--------+-----+---------+--------------------+
|2019-10-01 00:00:...|      view|  44600062|2103807459595387724|                NULL|shiseido|35.79|541312140|72d76fde-8bb3-4e0...|
|2019-10-01 00:00:...|      view|   3900821|2053013552326770905|appliances.enviro...|    aqua| 33.2|554748717|9333dfbd-b87a-470...|
|2019-10-01 00:00:...|      view|  17200506|2053013559792632471|furniture.living_...|    NULL|543.1|519107250|566511c2-e2e3-422...|
+--------------------+----------+----------+-------------------+--------------------+--------+-----+---------+--------------------+
only showing top 3 rows



In [8]:
csv_dict["2019_nov"].show(3)

+--------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 00:00:...|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:...|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:...|      view|  17302664|2053013553853497655|                NULL| creed| 28.31|561587266|755422e7-9040-477...|
+--------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
only showing top 3 rows



In [9]:
csv_dict["2019_dec"].show(3)

+--------------------+----------+----------+-------------------+--------------------+-----+-------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|brand|  price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+-----+-------+---------+--------------------+
|2019-12-01 00:00:...|      view|   1005105|2232732093077520756|construction.tool...|apple|1302.48|556695836|ca5eefc5-11f9-450...|
|2019-12-01 00:00:...|      view|  22700068|2232732091643068746|                NULL|force| 102.96|577702456|de33debe-c7bf-44e...|
|2019-12-01 00:00:...|      view|   2402273|2232732100769874463|appliances.person...|bosch| 313.52|539453785|5ee185a7-0689-4a3...|
+--------------------+----------+----------+-------------------+--------------------+-----+-------+---------+--------------------+
only showing top 3 rows



In [11]:
# Count NULL values per column of the 2019-10 data.
# F.when(...) has no otherwise(), so it yields NULL for rows where the column
# is not NULL; F.count() ignores NULLs, leaving only the NULL rows counted.
null_counts_oct = csv_dict["2019_oct"].select(
    [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in csv_dict["2019_oct"].columns]
)
null_counts_oct.show()

+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|  brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|         0|         0|         0|          0|     13515609|6113008|    0|      0|           2|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+



In [21]:
# Number of rows
# NOTE(review): the recorded output reports 11 columns although the inferred
# CSV schema has 9; presumably this cell was executed after
# separate_event_time() had added its two columns to csv_dict in place --
# confirm by re-running the notebook in order.
num_rows_oct = csv_dict["2019_oct"].count()

# Number of columns: length of the csv_dict["2019_oct"].columns list
num_columns_oct = len(csv_dict["2019_oct"].columns)

print("DataFrame shape:", (num_rows_oct, num_columns_oct))


DataFrame shape: (42448764, 11)


In [12]:
# Count NULL values per column of the 2020-04 data.
# F.when(...) has no otherwise(), so it yields NULL for rows where the column
# is not NULL; F.count() ignores NULLs, leaving only the NULL rows counted.
null_counts_apr = csv_dict["2020_apr"].select(
    [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in csv_dict["2020_apr"].columns]
)
null_counts_apr.show()

+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|  brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|         0|         0|         0|          0|      6755873|8985057|    0|      0|         109|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+



In [14]:
# Number of rows
num_rows_apr = csv_dict["2020_apr"].count()

# Number of columns: length of the csv_dict["2020_apr"].columns list
num_columns_apr = len(csv_dict["2020_apr"].columns)

print("DataFrame shape:", (num_rows_apr, num_columns_apr))


DataFrame shape: (66589268, 9)


## 2. 파생 컬럼 생성
- `event_time` 컬럼의 형태: `2019-12-01 09:00:00`
- 이 컬럼을 날짜와 시간 컬럼으로 각각 분리

In [6]:
def separate_event_time(csv_dict: dict, event_time_col: str = "event_time") -> dict:
    """
    Convert the event-time column of every Spark DataFrame in ``csv_dict`` to
    a timestamp and derive two string columns from it:

    - ``event_time_ymd``: date part formatted as yyyy-MM-dd
    - ``event_time_hms``: time part formatted as HH:mm:ss

    The dictionary is updated in place and the same dictionary is returned.
    """
    # (new column name, date_format pattern) pairs, applied in this order
    derived_columns = (
        ("event_time_ymd", "yyyy-MM-dd"),
        ("event_time_hms", "HH:mm:ss"),
    )

    for name in tqdm(csv_dict.keys(), desc="Seperating Columns"):
        # Replace the original column with its timestamp-typed version first,
        # so the derived columns below are formatted from the timestamp.
        converted = csv_dict[name].withColumn(
            event_time_col, F.to_timestamp(F.col(event_time_col))
        )

        for new_column, pattern in derived_columns:
            converted = converted.withColumn(
                new_column, F.date_format(F.col(event_time_col), pattern)
            )

        # Store the transformed DataFrame back under the same key
        csv_dict[name] = converted

    return csv_dict

In [7]:
# separate_event_time() updates csv_dict in place and returns that same dict,
# so csv_dict_seperated and csv_dict refer to the same object afterwards.
csv_dict_seperated = separate_event_time(csv_dict)

Seperating Columns: 100%|██████████| 7/7 [00:00<00:00, 38.11it/s]


In [8]:
csv_dict_seperated.keys()

dict_keys(['2019_dec', '2019_nov', '2019_oct', '2020_apr', '2020_feb', '2020_jan', '2020_mar'])

In [9]:
csv_dict_seperated["2019_dec"].printSchema()


root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- event_time_ymd: string (nullable = true)
 |-- event_time_hms: string (nullable = true)



In [14]:
csv_dict_seperated["2019_dec"].show(3)
# csv_dict_seperated["2019_dec"].select("event_time").show(10, truncate=False)


+-------------------+----------+----------+-------------------+--------------------+-----+-------+---------+--------------------+--------------+--------------+
|         event_time|event_type|product_id|        category_id|       category_code|brand|  price|  user_id|        user_session|event_time_ymd|event_time_hms|
+-------------------+----------+----------+-------------------+--------------------+-----+-------+---------+--------------------+--------------+--------------+
|2019-12-01 00:00:00|      view|   1005105|2232732093077520756|construction.tool...|apple|1302.48|556695836|ca5eefc5-11f9-450...|    2019-12-01|      00:00:00|
|2019-12-01 00:00:00|      view|  22700068|2232732091643068746|                NULL|force| 102.96|577702456|de33debe-c7bf-44e...|    2019-12-01|      00:00:00|
|2019-12-01 00:00:01|      view|   2402273|2232732100769874463|appliances.person...|bosch| 313.52|539453785|5ee185a7-0689-4a3...|    2019-12-01|      00:00:01|
+-------------------+----------+--------

In [52]:
csv_dict_seperated["2019_dec"].orderBy("event_time").show(3)

+-------------------+----------+----------+-------------------+--------------------+-----+-------+---------+--------------------+--------------+--------------+
|         event_time|event_type|product_id|        category_id|       category_code|brand|  price|  user_id|        user_session|event_time_ymd|event_time_hms|
+-------------------+----------+----------+-------------------+--------------------+-----+-------+---------+--------------------+--------------+--------------+
|2019-12-01 00:00:00|      view|   1005105|2232732093077520756|construction.tool...|apple|1302.48|556695836|ca5eefc5-11f9-450...|    2019-12-01|      00:00:00|
|2019-12-01 00:00:00|      view|  22700068|2232732091643068746|                NULL|force| 102.96|577702456|de33debe-c7bf-44e...|    2019-12-01|      00:00:00|
|2019-12-01 00:00:01|      view|   2402273|2232732100769874463|appliances.person...|bosch| 313.52|539453785|5ee185a7-0689-4a3...|    2019-12-01|      00:00:01|
+-------------------+----------+--------

In [53]:
csv_dict_seperated["2020_jan"].show(3)

+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+--------------+--------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand|  price|  user_id|        user_session|event_time_ymd|event_time_hms|
+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+--------------+--------------+
|2020-01-01 00:00:00|      view|   1005073|2232732093077520756|construction.tool...|samsung|1130.02|519698804|69b5d72f-fd6e-4fe...|    2020-01-01|      00:00:00|
|2020-01-01 00:00:01|      view|   1005192|2232732093077520756|construction.tool...|  meizu| 205.67|527767423|7f596032-ccbf-464...|    2020-01-01|      00:00:01|
|2020-01-01 00:00:01|      view| 100063693|2053013552427434207|       apparel.shirt| turtle| 136.43|519046195|d1e2f343-84bb-49b...|    2020-01-01|      00:00:01|
+-------------------+-------

In [54]:
csv_dict_seperated["2020_apr"].show(3)

+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+--------------+--------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand|  price|  user_id|        user_session|event_time_ymd|event_time_hms|
+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+--------------+--------------+
|2020-04-01 00:00:00|      view|   1201465|2232732101407408685|apparel.shoes.sli...|samsung| 230.38|568984877|e2456cef-2d4f-42b...|    2020-04-01|      00:00:00|
|2020-04-01 00:00:01|      view|   1307156|2053013554658804075|electronics.audio...|  apple|1352.67|514955500|38f43134-de83-471...|    2020-04-01|      00:00:01|
|2020-04-01 00:00:01|      view|   1480477|2053013563835941749|appliances.kitche...|  apple|1184.05|633645770|16aba270-b3c2-4b2...|    2020-04-01|      00:00:01|
+-------------------+-------

## 3. 데이터셋 parquet 저장

### 3-1. 변환된 parquet 파일 저장 및 확인

In [15]:
def save_csv_dict_to_parquet(csv_dict: dict, output_folder: str) -> None:
    """
    Write every Spark DataFrame in ``csv_dict`` to its own Parquet dataset.

    Each DataFrame is written to ``<output_folder>/<key>_parquet`` in
    overwrite mode.

    Parameters
    ----------
    csv_dict : dict
        Maps a name to the Spark DataFrame to be saved.
    output_folder : str
        Directory under which the Parquet datasets are created; it is
        created first if it does not exist yet.
    """
    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # fail if the folder appeared between the check and the creation.
    os.makedirs(output_folder, exist_ok=True)

    for key, df in tqdm(csv_dict.items(), desc="Saving Parquet Files"):
        output_path = os.path.join(output_folder, f"{key}_parquet")

        # Overwrite mode: a previous run's output at this path is replaced
        df.write.mode("overwrite").parquet(output_path)

        print(f"{key} 데이터프레임이 {output_path}로 저장되었습니다.")

In [16]:
# Save each monthly DataFrame as its own Parquet dataset under data/parquet_data
output_folder = os.path.join("data", "parquet_data")
save_csv_dict_to_parquet(csv_dict_seperated, output_folder)

Saving Parquet Files:  14%|█▍        | 1/7 [01:47<10:42, 107.03s/it]

2019_dec 데이터프레임이 data\parquet_data\2019_dec_parquet로 저장되었습니다.


Saving Parquet Files:  29%|██▊       | 2/7 [03:32<08:49, 105.93s/it]

2019_nov 데이터프레임이 data\parquet_data\2019_nov_parquet로 저장되었습니다.


Saving Parquet Files:  43%|████▎     | 3/7 [04:37<05:48, 87.16s/it] 

2019_oct 데이터프레임이 data\parquet_data\2019_oct_parquet로 저장되었습니다.


Saving Parquet Files:  57%|█████▋    | 4/7 [06:19<04:39, 93.22s/it]

2020_apr 데이터프레임이 data\parquet_data\2020_apr_parquet로 저장되었습니다.


Saving Parquet Files:  71%|███████▏  | 5/7 [07:47<03:02, 91.36s/it]

2020_feb 데이터프레임이 data\parquet_data\2020_feb_parquet로 저장되었습니다.


Saving Parquet Files:  86%|████████▌ | 6/7 [09:16<01:30, 90.53s/it]

2020_jan 데이터프레임이 data\parquet_data\2020_jan_parquet로 저장되었습니다.


Saving Parquet Files: 100%|██████████| 7/7 [10:45<00:00, 92.15s/it]

2020_mar 데이터프레임이 data\parquet_data\2020_mar_parquet로 저장되었습니다.





In [17]:
# Read the saved 2019-10 Parquet dataset back and preview the first 5 rows
parquet_2019_oct = spark.read.parquet(os.path.join(output_folder, "2019_oct_parquet"))
parquet_2019_oct.show(5)

+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+--------------+--------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|event_time_ymd|event_time_hms|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+--------------+--------------+
|2019-10-13 06:25:46|      view|   1002544|2053013555631882655|electronics.smart...|   apple| 460.51|518958788|e7e27c5c-1e78-481...|    2019-10-13|      06:25:46|
|2019-10-13 06:25:46|      view|   3700301|2053013565983425517|appliances.enviro...|   vitek| 120.93|557977070|7afc206c-7259-4be...|    2019-10-13|      06:25:46|
|2019-10-13 06:25:46|      view|  49100004|2127425375913902544|                NULL|    NULL|  45.05|514456508|9d6837a5-40df-49d...|    2019-10-13|      06:25:46|
|2019-10-13 06:25:46| 

In [22]:
# Shape of the in-memory 2019-10 DataFrame, using the values computed earlier
print("원본 2019_oct DataFrame shape:", (num_rows_oct, num_columns_oct))


# Number of rows
num_rows_parquet_2019_oct = parquet_2019_oct.count()

# Number of columns: length of the parquet_2019_oct.columns list
num_columns_parquet_2019_oct = len(parquet_2019_oct.columns)

print("Parquet으로 저장된 2019_oct DataFrame shape:", (num_rows_parquet_2019_oct, num_columns_parquet_2019_oct))

원본 2019_oct DataFrame shape: (42448764, 11)
Parquet으로 저장된 2019_oct DataFrame shape: (42448764, 11)


### 3-2. parquet 파일 하나로 합치기

In [None]:
def save_merged_csv_dict_to_parquet(csv_dict: dict, output_folder: str, spark: SparkSession) -> None:
    """
    Union all Spark DataFrames in ``csv_dict`` into one DataFrame, save it as
    a Parquet dataset and report whether the merge kept every row.

    Each DataFrame gets a ``source_file`` column holding its dictionary key
    before the union, so the origin of every row stays identifiable. The
    result is written to ``<output_folder>/total_merged_parquet`` in
    overwrite mode.

    Parameters
    ----------
    csv_dict : dict
        Maps a name to the Spark DataFrame to merge.
    output_folder : str
        Directory under which the merged Parquet dataset is written.
    spark : SparkSession
        Session used to read the written dataset back for verification.

    Raises
    ------
    ValueError
        If ``csv_dict`` is empty, since there is nothing to merge.
    """
    if not csv_dict:
        raise ValueError("csv_dict is empty: there are no DataFrames to merge.")

    df_list = []
    total_rows = 0   # sum of the row counts of the individual DataFrames
    max_columns = 0  # largest column count among the individual DataFrames

    print("\n===== 개별 DataFrame 정보 =====")

    for key, df in tqdm(csv_dict.items(), desc="Appending DataFrames"):
        row_count = df.count()
        col_count = len(df.columns)  # counted before source_file is added
        total_rows += row_count
        max_columns = max(max_columns, col_count)

        print(f"{key}: {row_count} rows, {col_count} columns")

        # Tag every row with the key of the DataFrame it came from
        df_list.append(df.withColumn("source_file", F.lit(key)))

    # Union by column name; columns missing on one side are filled with NULL
    merged_df = df_list[0]
    for df in tqdm(df_list[1:], desc="Merging DataFrames"):
        merged_df = merged_df.unionByName(df, allowMissingColumns=True)

    # Includes the added source_file column
    merged_columns = len(merged_df.columns)

    output_path = os.path.join(output_folder, "total_merged_parquet")
    merged_df.write.mode("overwrite").parquet(output_path)

    # Count the rows of the dataset that was actually written. This checks the
    # saved output itself and avoids calling count() on merged_df, which would
    # evaluate the whole union a second time.
    merged_rows = spark.read.parquet(output_path).count()

    print("\n============== Merge 결과 ==============")
    print(f"개별 DataFrame 총 행 수 합계: {total_rows}, 병합된 DataFrame 행 수: {merged_rows}")
    print(f"개별 DataFrame 최대 컬럼 수: {max_columns}, 병합된 DataFrame 컬럼 수: {merged_columns}\n")

    if total_rows == merged_rows:
        print("병합이 정상적으로 이루어졌습니다!")
    else:
        print("병합된 행 수가 개별 행 수 총합과 일치하지 않습니다. 데이터 손실 가능성 존재!")

    print(f"\n모든 데이터프레임이 병합되어 {output_path}에 저장되었습니다.\n")

In [19]:
# Merge all monthly DataFrames and save them as one Parquet dataset
output_folder = os.path.join("data", "parquet_data")
save_merged_csv_dict_to_parquet(csv_dict_seperated, output_folder, spark)


===== 개별 DataFrame 정보 =====


Appending DataFrames:  14%|█▍        | 1/7 [00:06<00:41,  6.97s/it]

2019_dec: 67542878 rows, 11 columns


Appending DataFrames:  29%|██▊       | 2/7 [00:13<00:33,  6.65s/it]

2019_nov: 67501979 rows, 11 columns


Appending DataFrames:  43%|████▎     | 3/7 [00:17<00:21,  5.37s/it]

2019_oct: 42448764 rows, 11 columns


Appending DataFrames:  57%|█████▋    | 4/7 [00:23<00:17,  5.69s/it]

2020_apr: 66589268 rows, 11 columns


Appending DataFrames:  71%|███████▏  | 5/7 [00:28<00:10,  5.50s/it]

2020_feb: 55318565 rows, 11 columns


Appending DataFrames:  86%|████████▌ | 6/7 [00:33<00:05,  5.39s/it]

2020_jan: 55967041 rows, 11 columns


Appending DataFrames: 100%|██████████| 7/7 [00:39<00:00,  5.61s/it]


2020_mar: 56341241 rows, 11 columns


Merging DataFrames: 100%|██████████| 6/6 [00:00<00:00, 120.00it/s]



개별 DataFrame 총 행 수 합계: 411709736, 병합된 DataFrame 행 수: 411709736
개별 DataFrame 최대 컬럼 수: 11, 병합된 DataFrame 컬럼 수: 12

병합이 정상적으로 이루어졌습니다!

모든 데이터프레임이 병합되어 data\parquet_data\total_merged_parquet에 저장되었습니다.



In [20]:
# Read the merged Parquet dataset back and preview the first 5 rows
parquet_df = spark.read.parquet(os.path.join(output_folder, "total_merged_parquet"))
parquet_df.show(5)

+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+--------------+--------------+-----------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|event_time_ymd|event_time_hms|source_file|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+--------------+--------------+-----------+
|2019-11-17 08:43:00|      view|   2501799|2053013564003713919|appliances.kitche...|elenberg|  46.31|563237118|4368d099-6d19-47c...|    2019-11-17|      08:43:00|   2019_nov|
|2019-11-17 08:43:00|      view|   6400335|2053013554121933129|computers.compone...|   intel| 435.28|551129779|4db2c365-ee85-443...|    2019-11-17|      08:43:00|   2019_nov|
|2019-11-17 08:43:00|      view|   3701538|2053013565983425517|appliances.enviro...|  irobot|1878.81|539845715|bf7d95c0-69e1-

In [23]:
parquet_df.orderBy("event_time").show(5)

+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+--------------+--------------+-----------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|event_time_ymd|event_time_hms|source_file|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+--------------+--------------+-----------+
|2019-10-01 00:00:00|      view|  44600062|2103807459595387724|                NULL|shiseido|  35.79|541312140|72d76fde-8bb3-4e0...|    2019-10-01|      00:00:00|   2019_oct|
|2019-10-01 00:00:00|      view|   3900821|2053013552326770905|appliances.enviro...|    aqua|   33.2|554748717|9333dfbd-b87a-470...|    2019-10-01|      00:00:00|   2019_oct|
|2019-10-01 00:00:01|      view|  17200506|2053013559792632471|furniture.living_...|    NULL|  543.1|519107250|566511c2-e2e3-

In [26]:
# Stop the SparkSession
spark.stop()