# 날짜 인덱스를 활용하여 시계열(Time Series) 데이터 만들기

In [51]:
import pandas as pd

## 파일에서 읽어올 때 열을 선택하여 인덱스 지정하기

* 기본설정으로 읽을 경우

In [52]:
# Baseline load: without index_col / parse_dates pandas assigns a default
# integer index and 'Date' stays an ordinary column.
data = pd.read_csv(
    'data/hanriver_bridge.csv',
    encoding='cp949',
)
data.head(5)

Unnamed: 0,Date,한강 좌측 인도,한강 우측 인도
0,10/03/2012 12:00:00 AM,4.0,9.0
1,10/03/2012 01:00:00 AM,4.0,6.0
2,10/03/2012 02:00:00 AM,1.0,1.0
3,10/03/2012 03:00:00 AM,2.0,3.0
4,10/03/2012 04:00:00 AM,6.0,1.0


In [73]:
# index_col='Date' promotes the source 'Date' column to the DataFrame index.
# parse_dates=True converts that index from strings to datetimes, which is
# what makes the frame usable as a time series (DatetimeIndex).
# date_format spells out the timestamp layout used in the file
# (e.g. '10/03/2012 12:00:00 AM' -> 2012-10-03 00:00:00) so pandas does not
# have to guess the format element by element; requires pandas >= 2.0.
data = pd.read_csv(
    'data/hanriver_bridge.csv',
    index_col='Date',
    parse_dates=True,
    date_format='%m/%d/%Y %I:%M:%S %p',
    encoding='cp949',
)
data.head()

  data = pd.read_csv('data/hanriver_bridge.csv', index_col = 'Date', parse_dates = True, encoding='cp949')


Unnamed: 0_level_0,한강 좌측 인도,한강 우측 인도
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-10-03 00:00:00,4.0,9.0
2012-10-03 01:00:00,4.0,6.0
2012-10-03 02:00:00,1.0,1.0
2012-10-03 03:00:00,2.0,3.0
2012-10-03 04:00:00,6.0,1.0


In [74]:
# (rows, columns) of the frame; 'Date' is the index, so it is not counted as a column.
data.shape

(49608, 2)

In [75]:
# Print the index type/range plus per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 49608 entries, 2012-10-03 00:00:00 to 2018-05-31 23:00:00
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   한강 좌측 인도  49600 non-null  float64
 1   한강 우측 인도  49600 non-null  float64
dtypes: float64(2)
memory usage: 1.1 MB


## 열이름 변경하기
* 데이터프레임으로 작업할 때 열 이름을 통해 속성이나 메소드를 호출하는 경우가 있기 때문에, 열 이름의 공백 문자는 가급적 제거한다.
* 데이터프레임.columns = [새로운 열이름 리스트]

In [98]:
# Replace both column labels at once with space-free names
# (left-sidewalk / right-sidewalk pedestrian counts).
data.columns = [
    '좌측인도_통행자수',
    '우측인도_통행자수',
]
data.head(5)

Unnamed: 0_level_0,좌측인도_통행자수,우측인도_통행자수
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-10-03 00:00:00,4.0,9.0
2012-10-03 01:00:00,4.0,6.0
2012-10-03 02:00:00,1.0,1.0
2012-10-03 03:00:00,2.0,3.0
2012-10-03 04:00:00,6.0,1.0


* 총통행자수 열 추가하기

In [100]:
# Derive the total pedestrian count as the element-wise sum of the two
# sidewalk columns; a missing value on either side yields a missing total.
data['총통행자수'] = data['좌측인도_통행자수'].add(data['우측인도_통행자수'])
data.head(5)

Unnamed: 0_level_0,좌측인도_통행자수,우측인도_통행자수,총통행자수
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-10-03 00:00:00,4.0,9.0,13.0
2012-10-03 01:00:00,4.0,6.0,10.0
2012-10-03 02:00:00,1.0,1.0,2.0
2012-10-03 03:00:00,2.0,3.0,5.0
2012-10-03 04:00:00,6.0,1.0,7.0


In [102]:
# Optional undo, left disabled: uncommenting removes the derived total column from `data` in place.
# data.drop(columns='총통행자수', inplace=True)
# data

## 결측치 확인

In [66]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

date         0
좌측인도_통행자수    8
우측인도_통행자수    8
총통행자수        8
dtype: int64

## 결측치 처리

* 회의 결과 결측치를 단순 삭제하기로 결정

In [67]:
# Drop every row containing at least one missing value. dropna() returns a
# new frame, so `data` itself keeps all of its rows.
data_dp = data.dropna(axis=0, how='any')
# Show the shapes before and after the drop side by side.
(data.shape, data_dp.shape)

((49608, 4), (49600, 4))

In [69]:
# Re-count missing values on the original `data`. dropna() above returned a
# new frame (data_dp) and left `data` untouched, so these counts are unchanged.
# NOTE(review): if the goal is to verify the cleaned result, inspect data_dp
# instead — confirm intent.
data.isnull().sum()

date         0
좌측인도_통행자수    8
우측인도_통행자수    8
총통행자수        8
dtype: int64

# 시계열 데이터 재구성

## 날짜 기준

In [82]:
# Display the hourly frame (head and tail) before re-aggregating it.
data

Unnamed: 0_level_0,좌측인도_통행자수,우측인도_통행자수,총통행자수
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-10-03 00:00:00,4.0,9.0,13.0
2012-10-03 01:00:00,4.0,6.0,10.0
2012-10-03 02:00:00,1.0,1.0,2.0
2012-10-03 03:00:00,2.0,3.0,5.0
2012-10-03 04:00:00,6.0,1.0,7.0
...,...,...,...
2018-05-31 19:00:00,84.0,164.0,248.0
2018-05-31 20:00:00,27.0,93.0,120.0
2018-05-31 21:00:00,36.0,63.0,99.0
2018-05-31 22:00:00,21.0,30.0,51.0


In [83]:
# Collapse the hourly rows into one row per calendar day by summing the counts.
daily = data.resample(rule='D').sum()
daily

Unnamed: 0_level_0,좌측인도_통행자수,우측인도_통행자수,총통행자수
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-10-03,1760.0,1761.0,3521.0
2012-10-04,1708.0,1767.0,3475.0
2012-10-05,1558.0,1590.0,3148.0
2012-10-06,1080.0,926.0,2006.0
2012-10-07,1191.0,951.0,2142.0
...,...,...,...
2018-05-27,1240.0,1386.0,2626.0
2018-05-28,1121.0,1303.0,2424.0
2018-05-29,1731.0,2717.0,4448.0
2018-05-30,2024.0,3053.0,5077.0


## 주 단위 

In [86]:
# Sum the counts per week; each output row is labelled with the date that
# closes its weekly bin (e.g. 2012-10-07).
weekly = data.resample(rule='W').sum()
weekly

Unnamed: 0_level_0,좌측인도_통행자수,우측인도_통행자수,총통행자수
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-10-07,7297.0,6995.0,14292.0
2012-10-14,8679.0,8116.0,16795.0
2012-10-21,7946.0,7563.0,15509.0
2012-10-28,6901.0,6536.0,13437.0
2012-11-04,6408.0,5786.0,12194.0
...,...,...,...
2018-05-06,11517.0,16081.0,27598.0
2018-05-13,11936.0,16557.0,28493.0
2018-05-20,12960.0,17564.0,30524.0
2018-05-27,12418.0,17800.0,30218.0


## 월단위 

In [85]:
# Sum the counts per calendar month; rows are labelled with the month-end date.
# NOTE(review): newer pandas releases (2.2+) deprecate the 'M' alias in favour
# of 'ME' — confirm against the installed version before changing it.
monthly = data.resample(rule='M').sum()
monthly

Unnamed: 0_level_0,좌측인도_통행자수,우측인도_통행자수,총통행자수
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-10-31,33764.0,31931.0,65695.0
2012-11-30,26062.0,24585.0,50647.0
2012-12-31,18608.0,17761.0,36369.0
2013-01-31,22910.0,21974.0,44884.0
2013-02-28,25898.0,24129.0,50027.0
...,...,...,...
2018-01-31,58591.0,23879.0,82470.0
2018-02-28,20703.0,29974.0,50677.0
2018-03-31,31995.0,45289.0,77284.0
2018-04-30,32993.0,46954.0,79947.0


# 시계열 데이터 샘플링
* 샘플링: 전체 데이터 중에서 그룹별 대표 데이터를 선정
## 일단위

In [87]:
# Unlike resample().sum(), asfreq() does not aggregate: it keeps the single row
# whose timestamp falls exactly on each daily step (the 00:00 reading here).
daily_s = data.asfreq(freq='D')
daily_s.head(5)

Unnamed: 0_level_0,좌측인도_통행자수,우측인도_통행자수,총통행자수
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-10-03,4.0,9.0,13.0
2012-10-04,7.0,11.0,18.0
2012-10-05,4.0,7.0,11.0
2012-10-06,8.0,7.0,15.0
2012-10-07,6.0,5.0,11.0


## 주단위

In [88]:
# Sample one existing row per week (no aggregation): the reading found at
# each weekly anchor date, e.g. 2012-10-07.
weekly_s = data.asfreq(freq='W')
weekly_s.head(5)

Unnamed: 0_level_0,좌측인도_통행자수,우측인도_통행자수,총통행자수
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-10-07,6.0,5.0,11.0
2012-10-14,3.0,3.0,6.0
2012-10-21,5.0,12.0,17.0
2012-10-28,5.0,5.0,10.0
2012-11-04,7.0,11.0,18.0


## 월단위

In [89]:
# Sample one existing row per month (no aggregation): the reading found at
# each month-end date, e.g. 2012-10-31.
# NOTE(review): newer pandas releases (2.2+) deprecate the 'M' alias in favour
# of 'ME' — confirm against the installed version before changing it.
monthly_s = data.asfreq(freq='M')
monthly_s.head(5)

Unnamed: 0_level_0,좌측인도_통행자수,우측인도_통행자수,총통행자수
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-10-31,2.0,2.0,4.0
2012-11-30,5.0,2.0,7.0
2012-12-31,1.0,0.0,1.0
2013-01-31,4.0,1.0,5.0
2013-02-28,3.0,4.0,7.0
