# ecommerce 유저 행동 데이터
- [kaggle 데이터셋 링크](https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store)

In [1]:
import numpy as np
import pandas as pd
import os
import shutil
import copy
from tqdm import tqdm
import re

import pyarrow as pa
import pyarrow.parquet as pq

import warnings
# Ignore every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Lift the column limit so DataFrame displays show all columns.
pd.set_option("display.max_columns", None)

## 1. 데이터 불러오기

In [2]:
# Relative locations of the raw October / November 2019 CSV files.
oct_2019_path, nov_2019_path = (
    os.path.join("data", "raw_data", csv_name)
    for csv_name in ("2019-Oct.csv", "2019-Nov.csv")
)

In [3]:
# Load each monthly CSV completely into memory; no parsing options are passed,
# so pandas infers the column dtypes.
oct_2019_df = pd.read_csv(oct_2019_path)
nov_2019_df = pd.read_csv(nov_2019_path)

In [4]:
# Summarize the October frame: every column with its non-null count and dtype.
oct_2019_df.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42448764 entries, 0 to 42448763
Data columns (total 9 columns):
 #   Column         Non-Null Count     Dtype  
---  ------         --------------     -----  
 0   event_time     42448764 non-null  object 
 1   event_type     42448764 non-null  object 
 2   product_id     42448764 non-null  int64  
 3   category_id    42448764 non-null  int64  
 4   category_code  28933155 non-null  object 
 5   brand          36331684 non-null  object 
 6   price          42448764 non-null  float64
 7   user_id        42448764 non-null  int64  
 8   user_session   42448762 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 2.8+ GB


In [5]:
# Summarize the November frame: every column with its non-null count and dtype.
nov_2019_df.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67501979 entries, 0 to 67501978
Data columns (total 9 columns):
 #   Column         Non-Null Count     Dtype  
---  ------         --------------     -----  
 0   event_time     67501979 non-null  object 
 1   event_type     67501979 non-null  object 
 2   product_id     67501979 non-null  int64  
 3   category_id    67501979 non-null  int64  
 4   category_code  45603808 non-null  object 
 5   brand          58277901 non-null  object 
 6   price          67501979 non-null  float64
 7   user_id        67501979 non-null  int64  
 8   user_session   67501969 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 4.5+ GB


In [6]:
# Preview the first rows of the October data.
oct_2019_df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [7]:
# Preview the first rows of the November data.
nov_2019_df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-11-01 00:00:00 UTC,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
1,2019-11-01 00:00:00 UTC,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2,2019-11-01 00:00:01 UTC,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387
3,2019-11-01 00:00:01 UTC,view,3601530,2053013563810775923,appliances.kitchen.washer,lg,712.87,518085591,3bfb58cd-7892-48cc-8020-2f17e6de6e7f
4,2019-11-01 00:00:01 UTC,view,1004775,2053013555631882655,electronics.smartphone,xiaomi,183.27,558856683,313628f1-68b8-460d-84f6-cec7a8796ef2


## 2. 파생 컬럼 생성

In [9]:
# Split event_time into separate date and time columns
def seperate_event_time(df, event_time_col="event_time"):
    """Return a copy of ``df`` with parsed timestamps and date/time columns.

    The input frame is not modified. In the returned copy, ``event_time_col``
    is replaced by its ``pd.to_datetime`` conversion and two columns are
    added: ``event_time_ymd`` (the ``.dt.date`` part) and ``event_time_hms``
    (the ``.dt.time`` part).

    Args:
        df: DataFrame containing the timestamp column.
        event_time_col: Name of the column holding the event timestamps.

    Returns:
        A new DataFrame with the converted column and the two derived columns.
    """
    result = df.copy()

    # Parse the raw values into pandas datetimes on the copy only.
    result[event_time_col] = pd.to_datetime(result[event_time_col])

    # Each derived column is one attribute of the parsed column's .dt accessor.
    for derived_col, dt_attr in (("event_time_ymd", "date"), ("event_time_hms", "time")):
        result[derived_col] = getattr(result[event_time_col].dt, dt_attr)

    return result

In [10]:
# Build a new October frame with parsed timestamps plus the date/time columns.
oct_2019_time_df = seperate_event_time(oct_2019_df)

In [11]:
# Build a new November frame with parsed timestamps plus the date/time columns.
nov_2019_time_df = seperate_event_time(nov_2019_df)

In [12]:
# Check the first three October rows, including the derived columns.
oct_2019_time_df.head(3)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_time_ymd,event_time_hms
0,2019-10-01 00:00:00+00:00,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,2019-10-01,00:00:00
1,2019-10-01 00:00:00+00:00,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,2019-10-01,00:00:00
2,2019-10-01 00:00:01+00:00,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,2019-10-01,00:00:01


In [13]:
# Check the first three November rows, including the derived columns.
nov_2019_time_df.head(3)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_time_ymd,event_time_hms
0,2019-11-01 00:00:00+00:00,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33,2019-11-01,00:00:00
1,2019-11-01 00:00:00+00:00,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283,2019-11-01,00:00:00
2,2019-11-01 00:00:01+00:00,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387,2019-11-01,00:00:01


## 3. 데이터셋 parquet 저장

In [14]:
# Dataset name -> DataFrame; the name becomes the output file stem.
data_dict = {
    "oct_2019": oct_2019_time_df,
    "nov_2019": nov_2019_time_df,
}

In [15]:
def make_parquets(data_dict=None, file_type="parquet"):
    """Write every dataset in ``data_dict`` to the ``data/parquet_data`` folder.

    Each entry is saved as ``<key>.<file_type>`` with the pyarrow engine. A
    progress message is printed per dataset, followed by the number of
    entries found in the output folder.

    Args:
        data_dict: Mapping of dataset name to an object providing
            ``to_parquet`` (e.g. a pandas DataFrame). When omitted, the
            module-level ``data_dict`` is looked up at call time. The mapping
            is deliberately not used as the literal default: a default is
            evaluated once, when the function is defined, so it would keep
            the referenced DataFrames alive for as long as the function
            exists and would go stale if ``data_dict`` were rebound later.
        file_type: Extension used in the output file names. It only affects
            the name; the content is always written via ``to_parquet``.

    Raises:
        ValueError: If ``data_dict`` is omitted and no module-level
            ``data_dict`` is defined.
    """
    if data_dict is None:
        # Resolve the fallback when called, not when defined.
        data_dict = globals().get("data_dict")
        if data_dict is None:
            raise ValueError(
                "data_dict was not given and no module-level data_dict is defined"
            )

    # Default output location (parquet_data)
    parquet_path = os.path.join("data", "parquet_data")
    os.makedirs(parquet_path, exist_ok=True)

    for key, frame in data_dict.items():
        print(f"{key} 데이터셋 저장 시작 \n저장경로 {parquet_path}\n")
        frame.to_parquet(os.path.join(parquet_path, f"{key}.{file_type}"), engine="pyarrow")

    # Counts everything in the folder, including files from earlier runs.
    print(f"================= {parquet_path} 폴더 내 데이터 갯수: {len(os.listdir(parquet_path))} ==================")

In [16]:
# Write the October and November frames to data/parquet_data as parquet files.
make_parquets(data_dict)

oct_2019 데이터셋 저장 시작 
저장경로 data\parquet_data

nov_2019 데이터셋 저장 시작 
저장경로 data\parquet_data



## 4. parquet 파일 세팅

In [25]:
# Relative locations of the parquet files for each month.
oct_2019_parquet_path, nov_2019_parquet_path = (
    os.path.join("data", "parquet_data", parquet_name)
    for parquet_name in ("oct_2019.parquet", "nov_2019.parquet")
)

In [27]:
# Read the whole October parquet file into a pandas DataFrame in one call.
# NOTE(review): the output recorded for this cell is
# "ArrowMemoryError: malloc of size 1073741824 failed", so this full-file read
# did not complete in the recorded session. TODO: consider freeing the earlier
# DataFrames first, or reading only the needed columns / reading in batches.
oct_2019_parquet_df = pd.read_parquet(oct_2019_parquet_path, engine="pyarrow")

ArrowMemoryError: malloc of size 1073741824 failed

In [None]:
# Read the whole November parquet file into a pandas DataFrame in one call.
# NOTE(review): this cell has no execution count, so it was presumably never
# run; it is a full-file read of the larger month's data, so memory headroom
# should be confirmed before running it.
nov_2019_parquet_df = pd.read_parquet(nov_2019_parquet_path, engine="pyarrow")