## 카드 소비 데이터

In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import dask.dataframe as dd
import vaex

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as spst

In [3]:
data = '../data/five_city_data/merged_gyeonggi_day.csv'

In [4]:
# Dask로 5개 주요시 데이터 읽기
card_df = dd.read_csv(data,encoding='utf-8')


In [5]:
card_df.head()

Unnamed: 0,ta_ymd,cty_rgn_no,admi_cty_no,card_tpbuz_cd,card_tpbuz_nm_1,card_tpbuz_nm_2,hour,sex,age,day,amt,cnt
0,20230101,41210,41210510,D05,소매/유통,선물/완구,3,M,5,7,57576,2
1,20230101,41210,41210510,D11,소매/유통,종합소매점,3,F,7,7,8523,2
2,20230101,41210,41210510,D11,소매/유통,종합소매점,7,F,7,7,8023,2
3,20230101,41210,41210510,F02,생활서비스,미용서비스,3,M,4,7,28532,2
4,20230101,41210,41210510,F02,생활서비스,미용서비스,4,F,7,7,74074,2


In [6]:
card_df.isna().sum()

Dask Series Structure:
npartitions=1
admi_cty_no    int64
ta_ymd           ...
dtype: int64
Dask Name: dataframe-sum-agg, 4 graph layers

In [7]:
card_df.count().compute(scheduler='processes', num_workers=2)

ta_ymd             115322615
cty_rgn_no         115322615
admi_cty_no        115322615
card_tpbuz_cd      115322615
card_tpbuz_nm_1    115322615
card_tpbuz_nm_2    115322615
hour               115322615
sex                115322615
age                115322615
day                115322615
amt                115322615
cnt                115322615
dtype: int64

In [8]:
#card_df.sort_values(by=['ta_ymd', 'card_tpbuz_nm_1','card_tpbuz_nm_2'])

In [9]:
#card_df.groupby(['ta_ymd', 'card_tpbuz_nm_1','card_tpbuz_nm_2', 'sex'])

In [10]:
card_df['cty_rgn_no'].nunique().compute()

11

### 코드를 위한 데이터 분리 및 결합

In [11]:
code_mapping = card_df[["card_tpbuz_cd",	"card_tpbuz_nm_1",	"card_tpbuz_nm_2"	]].drop_duplicates()

In [12]:
#해당 일, 특정행정동의 업종별, 연령별, 나이별 매출액 평균과 매출 건수
# grouped_df = card_df.groupby(["ta_ymd", "admi_cty_no", "card_tpbuz_cd", "sex", "age"]).agg({"amt": "mean", "cnt": "sum"}).reset_index()

In [13]:
# grouped_df.sort_values(by=["ta_ymd", "card_tpbuz_cd","admi_cty_no"])

In [14]:
# 해당 일의 해당동의 업종 별 매출 평균
time_df = card_df.groupby(["ta_ymd", "admi_cty_no", "card_tpbuz_cd"]).agg({"amt": "mean"}).reset_index()
# time_df의 해당 일의 해당 업종의 평균
time_df2 = time_df.groupby(['ta_ymd',	'card_tpbuz_cd']).agg({"amt": "mean"}).reset_index()

In [15]:
time_df2.head()

Unnamed: 0,ta_ymd,card_tpbuz_cd,amt
0,20230101,D05,107508.458419
1,20230101,D11,872746.393065
2,20230101,F02,202663.628225
3,20230101,Q01,191551.970288
4,20230101,Q15,383799.991835


In [16]:
#grouped_df['ta_ymd'] = pd.to_datetime(grouped_df['ta_ymd'], format='%Y%m%d')
# 다음날 매출을 라벨로 추가
#grouped_df['next_day_sales'] = grouped_df.groupby(['admi_cty_no', 'card_tpbuz_cd', 'sex', 'age'])['amt'].shift(-1)
# 데이터 확인
# grouped_df[(grouped_df['ta_ymd'] == '2023-01-01') & 
#                              (grouped_df['admi_cty_no'] == 41210510) & 
#                              (grouped_df['card_tpbuz_cd'] == 'F02')]

In [17]:
# 40104
len(time_df2)

40104

### 날짜 타입 변환

In [18]:
time_df2 = time_df2.compute()

In [19]:
# time_df2['ta_ymd'] = time_df2['ta_ymd'].astype(str)
time_df2['ta_ymd'] = pd.to_datetime(time_df2['ta_ymd'], format='%Y%m%d')

## 모델 학습 -> AutoML 활용

In [20]:
# autogluon 사용
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [21]:
# train, test 분리
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(time_df2, test_size=0.2, random_state=42)

In [22]:
train_data = TimeSeriesDataFrame.from_data_frame(train_df, id_column = "card_tpbuz_cd", timestamp_column="ta_ymd")

In [23]:
train_data

Unnamed: 0_level_0,Unnamed: 1_level_0,amt
item_id,timestamp,Unnamed: 2_level_1
F13,2023-11-22,248676.084841
D18,2023-02-07,401040.608796
Q16,2023-05-31,59478.441461
S01,2023-08-14,180842.652182
Y05,2024-01-02,629632.694940
...,...,...
R05,2023-03-19,557640.607402
F10,2023-05-19,439777.803723
S02,2024-04-08,282288.052439
F13,2023-01-12,358973.195764


In [24]:
train_data = TimeSeriesDataFrame.from_data_frame(time_df2, id_column = "card_tpbuz_cd", timestamp_column="ta_ymd")

In [25]:
train_data

Unnamed: 0_level_0,Unnamed: 1_level_0,amt
item_id,timestamp,Unnamed: 2_level_1
D05,2023-01-01,107508.458419
D11,2023-01-01,872746.393065
F02,2023-01-01,202663.628225
Q01,2023-01-01,191551.970288
Q15,2023-01-01,383799.991835
...,...,...
T03,2024-04-30,43657.357143
T03,2024-04-15,789651.000000
T03,2024-04-16,139113.000000
U03,2024-04-23,339402.166667


In [26]:
predictor = TimeSeriesPredictor(label="amt", prediction_length=28, eval_metric="RMSE", freq="D")

In [None]:
predictor.fit(train_data, random_seed = 42,time_limit=3600)

Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to 'AutogluonModels\ag-20240711_082035'
AutoGluon Version:  1.1.1
Python Version:     3.8.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          12
GPU Count:          0
Memory Avail:       19.65 GB / 31.69 GB (62.0%)
Disk Space Avail:   592.41 GB / 930.86 GB (63.6%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': RMSE,
 'freq': 'D',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 28,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 42,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'amt',
 'time_limit': 3600,
 'verbosity': 2}

train_data with frequency 'None' has been resampled to frequency 'D'.
Provided train_data has 40337 rows (NaN fraction=0.6%), 83 time series. Median time series length is 486 (