# **시계열 데이터 처리**


## **1.환경준비**

### **(1) 라이브러리 불러오기**

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### **(2) 데이터 준비**

* 원본데이터 불러오기

In [53]:
# Load the two raw CSV tables (sales records and product master, per the file names).
sales = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/DA4BAM/dataset/master/ts_sales_simple.csv"
)
products = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/DA4BAM/dataset/master/ts_product_master.csv"
)

* 다룰 데이터 만들기
    * 아래 코드를 이해하지 못해도 상관 없습니다.
    * 그냥 실행해주세요.

In [54]:
# Per-row sales amount, expressed in thousands of dollars and rounded
# to whole thousands.
temp = sales.merge(products)
temp['Amt'] = (temp['Qty'].mul(temp['Price']) / 1000).round()

# Daily totals: one table per (Date, Category) and one per Date alone.
data1 = temp.groupby(['Date', 'Category'])['Amt'].sum().reset_index()
data2 = temp.groupby('Date')['Amt'].sum().reset_index()

# Reshape the per-category totals so that every Category becomes a column.
data11 = data1.pivot(index='Date', columns='Category', values='Amt').reset_index()

# Put the overall daily total next to the per-category columns.
data = data2.merge(data11)
data

Unnamed: 0,Date,Amt,Drink,Food,Grocery,Household Goods
0,2013-01-01,20.0,7.0,4.0,6.0,3.0
1,2013-01-02,3938.0,604.0,549.0,1663.0,1122.0
2,2013-01-03,2885.0,444.0,376.0,1222.0,843.0
3,2013-01-04,2907.0,490.0,386.0,1252.0,779.0
4,2013-01-05,3831.0,704.0,505.0,1560.0,1062.0
5,2013-01-06,4066.0,689.0,544.0,1713.0,1120.0
6,2013-01-07,2700.0,431.0,349.0,1170.0,750.0
7,2013-01-08,2533.0,396.0,313.0,1144.0,680.0
8,2013-01-09,2462.0,405.0,336.0,1074.0,647.0
9,2013-01-10,2110.0,367.0,282.0,878.0,583.0


* 우리가 다룰 데이터

In [55]:
data.head()

Unnamed: 0,Date,Amt,Drink,Food,Grocery,Household Goods
0,2013-01-01,20.0,7.0,4.0,6.0,3.0
1,2013-01-02,3938.0,604.0,549.0,1663.0,1122.0
2,2013-01-03,2885.0,444.0,376.0,1222.0,843.0
3,2013-01-04,2907.0,490.0,386.0,1252.0,779.0
4,2013-01-05,3831.0,704.0,505.0,1560.0,1062.0


## **2.날짜 요소 추출**

### **(1) 날짜 타입으로 변환**
* pd.to_datetime(날짜데이터, format = '입력되는 날짜 형식')
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html

In [56]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             31 non-null     object 
 1   Amt              31 non-null     float64
 2   Drink            31 non-null     float64
 3   Food             31 non-null     float64
 4   Grocery          31 non-null     float64
 5   Household Goods  31 non-null     float64
dtypes: float64(5), object(1)
memory usage: 1.6+ KB


In [57]:
# Convert the Date column from object dtype to datetime64.
# No format is passed, so pandas works the layout out from the values.
data['Date'] = pd.to_datetime(data['Date'])

In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Date             31 non-null     datetime64[ns]
 1   Amt              31 non-null     float64       
 2   Drink            31 non-null     float64       
 3   Food             31 non-null     float64       
 4   Grocery          31 non-null     float64       
 5   Household Goods  31 non-null     float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 1.6 KB


* format = ''
    * pd.to_datetime(date, format = '%d/%m/%Y') # format = '%d/%m/%Y' 입력되는 날짜가 이런 형태야~~ 라고 알려주는 옵션
    * https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior

In [59]:
# No format is supplied here, so pandas has to guess the field order of
# strings such as '03-01-2023'. Pass format=... to pd.to_datetime to state
# the layout explicitly.
date = pd.to_datetime(pd.Series(['03-01-2023', '03-02-2023', '03-03-2023']))
date

0   2023-03-01
1   2023-03-02
2   2023-03-03
dtype: datetime64[ns]

In [60]:
# If the strings are really day-month-year, say so explicitly.
date = pd.to_datetime(
    pd.Series(['03-01-2023', '03-02-2023', '03-03-2023']),
    format='%d-%m-%Y',  # declares the layout of the incoming strings as dd-mm-yyyy
)
date

0   2023-01-03
1   2023-02-03
2   2023-03-03
dtype: datetime64[ns]

### **(2) 날짜 요소 추출**

In [61]:
# 연도
date.dt.year

0    2023
1    2023
2    2023
dtype: int32

In [62]:
# 월
date.dt.month

0    1
1    2
2    3
dtype: int32

In [63]:
# 일
date.dt.day

0    3
1    3
2    3
dtype: int32

In [64]:
# 요일 (0이 월요일)
date.dt.weekday

0    1
1    4
2    4
dtype: int32

In [65]:
# 요일 이름
date.dt.day_name()

0    Tuesday
1     Friday
2     Friday
dtype: object

<img src='https://raw.githubusercontent.com/jangrae/img/master/practice_01.png' width=120 align="left"/>

[문1] data의 Date는 이미 날짜 타입으로 변환되어 있습니다.  
* 다음의 항목을 열로 추가하시오.
    * 요일(이름)
    * 주차

In [66]:
# Add the weekday name and the ISO calendar week number derived from Date.
data['WeekDay'] = data['Date'].dt.day_name()
data['Week'] = data['Date'].dt.isocalendar()['week']
data.tail(5)

Unnamed: 0,Date,Amt,Drink,Food,Grocery,Household Goods,WeekDay,Week
26,2013-01-27,3356.0,569.0,479.0,1449.0,859.0,Sunday,4
27,2013-01-28,2239.0,354.0,306.0,982.0,597.0,Monday,5
28,2013-01-29,2109.0,349.0,281.0,918.0,561.0,Tuesday,5
29,2013-01-30,2267.0,383.0,322.0,946.0,616.0,Wednesday,5
30,2013-01-31,2179.0,381.0,279.0,947.0,572.0,Thursday,5


## **3.시간에 따른 흐름 추가하기**

### **(1) shift**
* 시계열 데이터에서 시간의 흐름 전후로 정보를 이동시킬 때 사용
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.shift.html

In [67]:
# Work on a separate two-column frame so that new columns do not touch `data`.
temp = data[['Date', 'Amt']].copy()
temp

Unnamed: 0,Date,Amt
0,2013-01-01,20.0
1,2013-01-02,3938.0
2,2013-01-03,2885.0
3,2013-01-04,2907.0
4,2013-01-05,3831.0
5,2013-01-06,4066.0
6,2013-01-07,2700.0
7,2013-01-08,2533.0
8,2013-01-09,2462.0
9,2013-01-10,2110.0


In [68]:
# Previous day's sales: values move down by one row.
temp['Amt_lag'] = temp['Amt'].shift(periods=1)

# Sales from two days earlier: values move down by two rows.
temp['Amt_lag2'] = temp['Amt'].shift(periods=2)

# Next day's sales: a negative shift pulls later rows up.
temp['Amt_lag_1'] = temp['Amt'].shift(periods=-1)

temp.head(5)

Unnamed: 0,Date,Amt,Amt_lag,Amt_lag2,Amt_lag_1
0,2013-01-01,20.0,,,3938.0
1,2013-01-02,3938.0,20.0,,2885.0
2,2013-01-03,2885.0,3938.0,20.0,2907.0
3,2013-01-04,2907.0,2885.0,3938.0,3831.0
4,2013-01-05,3831.0,2907.0,2885.0,4066.0


### **(2) rolling + 집계함수**
* 시간의 흐름에 따라 일정 기간 동안 평균을 이동하면서 구하기
* .rolling : https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html
* rolling(n) : 
    * n : 윈도우 크기 (필수 인자이며 기본값이 없음)
    * min_periods : 값을 계산하는 데 필요한 최소 데이터 수 (정수 윈도우의 경우 기본값은 n)

In [105]:
# 7-day moving average of the sales amount.
# With the default min_periods (the window size) the first six rows are NaN,
# because fewer than seven observations are available there.
temp['Amt_MA7_1'] = temp['Amt'].rolling(7).mean()
# min_periods=1 averages whatever rows are available, so there is no leading NaN.
temp['Amt_MA7_2'] = temp['Amt'].rolling(7, min_periods = 1).mean()
temp.head(10)

Unnamed: 0,Date,Amt,Amt_lag,Amt_lag2,Amt_lag_1,Amt_MA7_1,Amt_MA7_2,Amt_D1,Amt_D2
0,2013-01-01,20.0,,,3938.0,,20.0,,
1,2013-01-02,3938.0,20.0,,2885.0,,1979.0,3918.0,
2,2013-01-03,2885.0,3938.0,20.0,2907.0,,2281.0,-1053.0,2865.0
3,2013-01-04,2907.0,2885.0,3938.0,3831.0,,2437.5,22.0,-1031.0
4,2013-01-05,3831.0,2907.0,2885.0,4066.0,,2716.2,924.0,946.0
5,2013-01-06,4066.0,3831.0,2907.0,2700.0,,2941.166667,235.0,1159.0
6,2013-01-07,2700.0,4066.0,3831.0,2533.0,2906.714286,2906.714286,-1366.0,-1131.0
7,2013-01-08,2533.0,2700.0,4066.0,2462.0,3265.714286,3265.714286,-167.0,-1533.0
8,2013-01-09,2462.0,2533.0,2700.0,2110.0,3054.857143,3054.857143,-71.0,-238.0
9,2013-01-10,2110.0,2462.0,2533.0,2405.0,2944.142857,2944.142857,-352.0,-423.0


### **(3) diff**
* 특정 시점 데이터와 이전 시점 데이터의 차이 구하기
* .diff : https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.diff.html

In [73]:
# Difference between each day's sales amount and an earlier day's.
temp['Amt_D1'] = temp['Amt'].diff() # versus the previous row (periods defaults to 1)
temp['Amt_D2'] = temp['Amt'].diff(2) # versus the row two positions earlier
temp.head(10)

Unnamed: 0,Date,Amt,Amt_lag,Amt_lag2,Amt_lag_1,Amt_MA7_1,Amt_MA7_2,Amt_D1,Amt_D2
0,2013-01-01,20.0,,,3938.0,,20.0,,
1,2013-01-02,3938.0,20.0,,2885.0,,1979.0,3918.0,
2,2013-01-03,2885.0,3938.0,20.0,2907.0,,2281.0,-1053.0,2865.0
3,2013-01-04,2907.0,2885.0,3938.0,3831.0,,2437.5,22.0,-1031.0
4,2013-01-05,3831.0,2907.0,2885.0,4066.0,,2716.2,924.0,946.0
5,2013-01-06,4066.0,3831.0,2907.0,2700.0,,2941.166667,235.0,1159.0
6,2013-01-07,2700.0,4066.0,3831.0,2533.0,2906.714286,2906.714286,-1366.0,-1131.0
7,2013-01-08,2533.0,2700.0,4066.0,2462.0,3265.714286,3265.714286,-167.0,-1533.0
8,2013-01-09,2462.0,2533.0,2700.0,2110.0,3054.857143,3054.857143,-71.0,-238.0
9,2013-01-10,2110.0,2462.0,2533.0,2405.0,2944.142857,2944.142857,-352.0,-423.0


<img src='https://raw.githubusercontent.com/jangrae/img/master/practice_01.png' width=120 align="left"/>

[문1] data의 Grocery가 매출이 가장 높습니다. 이에 대해서 다음의 열을 추가해 봅시다.

* 전날 매출액
* 7일 전(전주 동 요일) 매출액
* 3일 이동평균 매출액
* 전날대비 매출액 증감여부 (증가 1, 감소 -1, 동일 0)


In [74]:
data.head()

Unnamed: 0,Date,Amt,Drink,Food,Grocery,Household Goods,WeekDay,Week
0,2013-01-01,20.0,7.0,4.0,6.0,3.0,Tuesday,1
1,2013-01-02,3938.0,604.0,549.0,1663.0,1122.0,Wednesday,1
2,2013-01-03,2885.0,444.0,376.0,1222.0,843.0,Thursday,1
3,2013-01-04,2907.0,490.0,386.0,1252.0,779.0,Friday,1
4,2013-01-05,3831.0,704.0,505.0,1560.0,1062.0,Saturday,1


In [91]:
# Grocery sales of the previous day.
data['Gro_lag'] = data['Grocery'].shift(periods=1)
data.head(5)

Unnamed: 0,Date,Amt,Drink,Food,Grocery,Household Goods,WeekDay,Week,before,before_7,3_moving_avg,up_or_down,befor_1e,Gro_lag
0,2013-01-01,20.0,7.0,4.0,6.0,3.0,Tuesday,1,,,,,,
1,2013-01-02,3938.0,604.0,549.0,1663.0,1122.0,Wednesday,1,20.0,,,3918.0,6.0,6.0
2,2013-01-03,2885.0,444.0,376.0,1222.0,843.0,Thursday,1,3938.0,,2281.0,-1053.0,1663.0,1663.0
3,2013-01-04,2907.0,490.0,386.0,1252.0,779.0,Friday,1,2885.0,,3243.333333,22.0,1222.0,1222.0
4,2013-01-05,3831.0,704.0,505.0,1560.0,1062.0,Saturday,1,2907.0,,3207.666667,924.0,1252.0,1252.0


In [93]:
# Grocery sales seven rows earlier, i.e. the same weekday of the previous
# week given one row per day.
data['Gro_lag7'] = data['Grocery'].shift(periods=7)
data.head(n=10)

Unnamed: 0,Date,Amt,Drink,Food,Grocery,Household Goods,WeekDay,Week,before,before_7,3_moving_avg,up_or_down,befor_1e,Gro_lag,Gro_lag7
0,2013-01-01,20.0,7.0,4.0,6.0,3.0,Tuesday,1,,,,,,,
1,2013-01-02,3938.0,604.0,549.0,1663.0,1122.0,Wednesday,1,20.0,,,3918.0,6.0,6.0,
2,2013-01-03,2885.0,444.0,376.0,1222.0,843.0,Thursday,1,3938.0,,2281.0,-1053.0,1663.0,1663.0,
3,2013-01-04,2907.0,490.0,386.0,1252.0,779.0,Friday,1,2885.0,,3243.333333,22.0,1222.0,1222.0,
4,2013-01-05,3831.0,704.0,505.0,1560.0,1062.0,Saturday,1,2907.0,,3207.666667,924.0,1252.0,1252.0,
5,2013-01-06,4066.0,689.0,544.0,1713.0,1120.0,Sunday,1,3831.0,,3601.333333,235.0,1560.0,1560.0,
6,2013-01-07,2700.0,431.0,349.0,1170.0,750.0,Monday,2,4066.0,,3532.333333,-1366.0,1713.0,1713.0,
7,2013-01-08,2533.0,396.0,313.0,1144.0,680.0,Tuesday,2,2700.0,20.0,3099.666667,-167.0,1170.0,1170.0,6.0
8,2013-01-09,2462.0,405.0,336.0,1074.0,647.0,Wednesday,2,2533.0,3938.0,2565.0,-71.0,1144.0,1144.0,1663.0
9,2013-01-10,2110.0,367.0,282.0,878.0,583.0,Thursday,2,2462.0,2885.0,2368.333333,-352.0,1074.0,1074.0,1222.0


In [97]:
# 3-day moving average of Grocery sales; min_periods=1 lets the first two
# rows average over the rows that exist instead of yielding NaN.
data['Gro_MA3'] = data['Grocery'].rolling(window=3, min_periods=1).mean()
data.head(5)

Unnamed: 0,Date,Amt,Drink,Food,Grocery,Household Goods,WeekDay,Week,before,before_7,3_moving_avg,up_or_down,befor_1e,Gro_lag,Gro_lag7,Gro_3MA,Gro_MA3
0,2013-01-01,20.0,7.0,4.0,6.0,3.0,Tuesday,1,,,,,,,,,6.0
1,2013-01-02,3938.0,604.0,549.0,1663.0,1122.0,Wednesday,1,20.0,,,3918.0,6.0,6.0,,,834.5
2,2013-01-03,2885.0,444.0,376.0,1222.0,843.0,Thursday,1,3938.0,,2281.0,-1053.0,1663.0,1663.0,,963.666667,963.666667
3,2013-01-04,2907.0,490.0,386.0,1252.0,779.0,Friday,1,2885.0,,3243.333333,22.0,1222.0,1222.0,,1379.0,1379.0
4,2013-01-05,3831.0,704.0,505.0,1560.0,1062.0,Saturday,1,2907.0,,3207.666667,924.0,1252.0,1252.0,,1344.666667,1344.666667


In [112]:
# Day-over-day direction of Grocery sales: 1 = increase, -1 = decrease, 0 = unchanged.
# The difference is computed once and reused by both approaches.
gro_diff = data['Grocery'].diff()

# Approach 1: np.where.
# Comparisons against NaN are False, so the first row (no previous day)
# falls through both conditions and gets 0.
data['a'] = np.where(gro_diff > 0, 1, np.where(gro_diff < 0, -1, 0))

# Approach 2: pd.cut.
data['d'] = gro_diff
# Bin the sign of the difference rather than the raw difference. Bins like
# (-inf, -1], (-1, 0], (0, inf) would label a fractional decrease such as
# -0.5 as "unchanged". np.sign yields exactly -1, 0 or 1 (NaN stays NaN),
# so each value lands in its own bin regardless of magnitude.
data['b'] = pd.cut(np.sign(data['d']), bins=[-1.5, -0.5, 0.5, 1.5], labels=[-1, 0, 1])
data

Unnamed: 0,Date,Amt,Drink,Food,Grocery,Household Goods,WeekDay,Week,before,before_7,...,befor_1e,Gro_lag,Gro_lag7,Gro_3MA,Gro_MA3,Gro_Diff,Gro_Diff2,a,d,b
0,2013-01-01,20.0,7.0,4.0,6.0,3.0,Tuesday,1,,,...,,,,,6.0,,,0,,
1,2013-01-02,3938.0,604.0,549.0,1663.0,1122.0,Wednesday,1,20.0,,...,6.0,6.0,,,834.5,1657.0,1.0,1,1657.0,1.0
2,2013-01-03,2885.0,444.0,376.0,1222.0,843.0,Thursday,1,3938.0,,...,1663.0,1663.0,,963.666667,963.666667,-441.0,-1.0,-1,-441.0,-1.0
3,2013-01-04,2907.0,490.0,386.0,1252.0,779.0,Friday,1,2885.0,,...,1222.0,1222.0,,1379.0,1379.0,30.0,1.0,1,30.0,1.0
4,2013-01-05,3831.0,704.0,505.0,1560.0,1062.0,Saturday,1,2907.0,,...,1252.0,1252.0,,1344.666667,1344.666667,308.0,1.0,1,308.0,1.0
5,2013-01-06,4066.0,689.0,544.0,1713.0,1120.0,Sunday,1,3831.0,,...,1560.0,1560.0,,1508.333333,1508.333333,153.0,1.0,1,153.0,1.0
6,2013-01-07,2700.0,431.0,349.0,1170.0,750.0,Monday,2,4066.0,,...,1713.0,1713.0,,1481.0,1481.0,-543.0,-1.0,-1,-543.0,-1.0
7,2013-01-08,2533.0,396.0,313.0,1144.0,680.0,Tuesday,2,2700.0,20.0,...,1170.0,1170.0,6.0,1342.333333,1342.333333,-26.0,-1.0,-1,-26.0,-1.0
8,2013-01-09,2462.0,405.0,336.0,1074.0,647.0,Wednesday,2,2533.0,3938.0,...,1144.0,1144.0,1663.0,1129.333333,1129.333333,-70.0,-1.0,-1,-70.0,-1.0
9,2013-01-10,2110.0,367.0,282.0,878.0,583.0,Thursday,2,2462.0,2885.0,...,1074.0,1074.0,1222.0,1032.0,1032.0,-196.0,-1.0,-1,-196.0,-1.0
