# Time Series

## 10.1 날짜, 시간 자료형, 도구

In [1]:
from datetime import datetime
now = datetime.now()

In [2]:
now

datetime.datetime(2018, 6, 5, 16, 43, 46, 675712)

In [3]:
now.year, now.month, now.day

(2018, 6, 5)

In [4]:
# Subtracting one datetime from another produces a timedelta
# (stored as whole days plus the leftover seconds).
delta = datetime(year=2011, month=1, day=7) - datetime(
    year=2008, month=6, day=24, hour=8, minute=15
)

In [5]:
delta

datetime.timedelta(926, 56700)

In [6]:
delta.days

926

In [7]:
delta.seconds

56700

In [8]:
from datetime import timedelta

In [9]:
start = datetime(2011, 1, 7)

In [10]:
# The first positional argument of timedelta is `days`; spell it out for clarity.
start + timedelta(days=12)

datetime.datetime(2011, 1, 19, 0, 0)

### 10.1.1 문자열을 datetime으로 변환하기

In [11]:
stamp = datetime(2011, 1, 3)

In [12]:
str(stamp)

'2011-01-03 00:00:00'

In [13]:
stamp.strftime('%Y%m%d')

'20110103'

In [14]:
value = '20110103'

In [15]:
datetime.strptime(value, '%Y%m%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [16]:
datestrs = ['20110706', '20110806']

In [17]:
[datetime.strptime(x, '%Y%m%d') for x in datestrs]

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

In [18]:
from dateutil.parser import parse

In [19]:
parse('20110103')

datetime.datetime(2011, 1, 3, 0, 0)

In [20]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [23]:
datestrs = ['20180101', '20180105']

In [25]:
pd.to_datetime(datestrs)

DatetimeIndex(['2018-01-01', '2018-01-05'], dtype='datetime64[ns]', freq=None)

In [26]:
# Append a missing entry to the date strings; pandas parses None as NaT
# (its "not a time" marker), as the displayed index shows.
idx = pd.to_datetime([*datestrs, None])

In [27]:
idx

DatetimeIndex(['2018-01-01', '2018-01-05', 'NaT'], dtype='datetime64[ns]', freq=None)

In [28]:
idx[2]

NaT

In [29]:
pd.isnull(idx)

array([False, False,  True])

## 10.2 시계열 기초

In [30]:
from datetime import datetime
# Six irregularly spaced observation dates in January 2011.
dates = [datetime(2011, 1, day_of_month)
         for day_of_month in (2, 5, 7, 8, 10, 12)]

In [31]:
# Seed NumPy's global generator before the first random draw so that a fresh
# "Restart Kernel -> Run All" yields the same pseudo-random values in this cell
# and in every later cell that calls np.random.
np.random.seed(12345)
# Series indexed by plain datetime objects; pandas converts them to a DatetimeIndex.
ts = Series(np.random.randn(6), index=dates)

In [32]:
ts

2011-01-02   -0.355467
2011-01-05    1.254204
2011-01-07   -0.304252
2011-01-08   -0.846848
2011-01-10   -1.542524
2011-01-12   -1.034264
dtype: float64

In [33]:
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [34]:
ts + ts[::2]

2011-01-02   -0.710935
2011-01-05         NaN
2011-01-07   -0.608504
2011-01-08         NaN
2011-01-10   -3.085048
2011-01-12         NaN
dtype: float64

In [35]:
ts.index.dtype

dtype('<M8[ns]')

In [36]:
stamp = ts.index[0]

In [37]:
stamp

Timestamp('2011-01-02 00:00:00')

### 10.2.1 인덱싱, 선택, 부분 선택

In [38]:
stamp = ts.index[2]

In [39]:
ts[stamp]

-0.30425213685989405

In [40]:
ts['20110110']

-1.5425241040895006

In [44]:
# 1000 consecutive daily observations beginning on 2001-01-01.
longer_ts = Series(
    data=np.random.randn(1000),
    index=pd.date_range(start='20010101', periods=1000),
)

In [45]:
longer_ts

2001-01-01   -0.918714
2001-01-02    0.063439
2001-01-03   -0.529549
2001-01-04   -0.899537
2001-01-05    0.262479
2001-01-06    0.038794
2001-01-07   -0.466589
2001-01-08    0.676379
2001-01-09    1.014978
2001-01-10   -1.210502
2001-01-11    0.140998
2001-01-12    0.951305
2001-01-13    0.572374
2001-01-14   -1.337099
2001-01-15    1.913552
2001-01-16    0.148119
2001-01-17    1.470510
2001-01-18    0.796997
2001-01-19    1.353082
2001-01-20   -1.183532
2001-01-21   -0.671283
2001-01-22    0.508855
2001-01-23   -0.329780
2001-01-24    0.099356
2001-01-25    0.653901
2001-01-26   -1.119443
2001-01-27   -0.373652
2001-01-28    1.497376
2001-01-29   -0.466143
2001-01-30    0.083662
                ...   
2003-08-29   -0.661477
2003-08-30   -1.054992
2003-08-31    0.587520
2003-09-01   -0.708313
2003-09-02    2.389426
2003-09-03   -1.061731
2003-09-04   -1.180413
2003-09-05    0.129827
2003-09-06    1.329608
2003-09-07   -1.118883
2003-09-08   -0.248769
2003-09-09   -0.099602
2003-09-10 

In [46]:
longer_ts['2002']

2002-01-01    0.179617
2002-01-02    1.290589
2002-01-03   -1.360132
2002-01-04    0.167339
2002-01-05   -0.531810
2002-01-06   -1.207810
2002-01-07    0.591115
2002-01-08    0.282713
2002-01-09   -0.148265
2002-01-10   -1.825137
2002-01-11    0.094304
2002-01-12    0.465717
2002-01-13   -0.527869
2002-01-14   -1.161194
2002-01-15    1.579323
2002-01-16   -1.039615
2002-01-17   -0.100978
2002-01-18   -0.774890
2002-01-19    0.488082
2002-01-20   -0.160104
2002-01-21   -0.507559
2002-01-22    0.131769
2002-01-23    0.889775
2002-01-24   -0.257072
2002-01-25   -1.694926
2002-01-26   -1.249533
2002-01-27    1.077517
2002-01-28   -3.014098
2002-01-29   -1.391209
2002-01-30    0.144385
                ...   
2002-12-02    0.078407
2002-12-03    0.176178
2002-12-04   -0.726248
2002-12-05    0.140660
2002-12-06   -0.430441
2002-12-07   -0.092383
2002-12-08    0.055795
2002-12-09   -0.488360
2002-12-10   -0.694969
2002-12-11    0.435814
2002-12-12    0.628099
2002-12-13   -1.901822
2002-12-14 

In [48]:
longer_ts['2001-05']

2001-05-01   -0.439857
2001-05-02   -0.785782
2001-05-03    0.316175
2001-05-04    0.388160
2001-05-05    1.480572
2001-05-06   -0.116785
2001-05-07    1.363255
2001-05-08    0.155316
2001-05-09    0.354145
2001-05-10    1.107702
2001-05-11    1.045475
2001-05-12   -0.983131
2001-05-13    0.049386
2001-05-14   -0.543390
2001-05-15    1.129913
2001-05-16   -0.598411
2001-05-17   -0.204114
2001-05-18    0.419274
2001-05-19    1.039381
2001-05-20   -0.535609
2001-05-21    1.652581
2001-05-22   -0.929300
2001-05-23   -1.619338
2001-05-24   -0.648996
2001-05-25    2.183649
2001-05-26   -1.751191
2001-05-27   -0.687203
2001-05-28   -1.664024
2001-05-29    0.616776
2001-05-30    1.677939
2001-05-31   -0.424036
Freq: D, dtype: float64

In [49]:
ts[datetime(2011, 1, 7):]

2011-01-07   -0.304252
2011-01-08   -0.846848
2011-01-10   -1.542524
2011-01-12   -1.034264
dtype: float64

In [50]:
# 100 weekly timestamps anchored on Wednesdays, starting from the first
# Wednesday on or after 2001-01-01.
dates = pd.date_range(start='20010101', freq='W-WED', periods=100)

In [51]:
long_df = DataFrame(np.random.randn(100, 4),
                    index=dates,
                    columns=['Colorado', 'Texas', 'New York', 'Ohio'])

In [53]:
# Select every row that falls in May 2001. Row selection by a date string must go
# through .loc: the bare long_df['2001-5'] form was deprecated in pandas 1.2 and
# raises KeyError on pandas 2.x, because [] on a DataFrame looks up columns.
long_df.loc['2001-5']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,0.537807,0.671829,-1.216509,0.434295
2001-05-09,0.75761,1.560468,0.523582,-1.113991
2001-05-16,-1.124659,0.13042,1.275136,-2.075554
2001-05-23,-0.241755,0.732664,1.059881,-0.544172
2001-05-30,-1.58959,0.488315,-0.618267,-0.874147


In [54]:
long_df.loc['2001-5']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,0.537807,0.671829,-1.216509,0.434295
2001-05-09,0.75761,1.560468,0.523582,-1.113991
2001-05-16,-1.124659,0.13042,1.275136,-2.075554
2001-05-23,-0.241755,0.732664,1.059881,-0.544172
2001-05-30,-1.58959,0.488315,-0.618267,-0.874147


### 10.2.2 중복된 색인을 갖는 시계열

In [55]:
# Three of the five labels are 2001-01-02, so this index is deliberately non-unique.
dates = pd.DatetimeIndex(
    ['20010101'] + ['20010102'] * 3 + ['20010103']
)

In [56]:
dup_ts = Series(np.arange(5), index=dates)

In [57]:
dup_ts

2001-01-01    0
2001-01-02    1
2001-01-02    2
2001-01-02    3
2001-01-03    4
dtype: int32

In [59]:
dup_ts.index.is_unique

False

In [60]:
dup_ts['20010103']

4

In [61]:
dup_ts['20010102']

2001-01-02    1
2001-01-02    2
2001-01-02    3
dtype: int32

## 10.3 날짜 범위, 빈도, 이동

In [63]:
ts

2011-01-02   -0.355467
2011-01-05    1.254204
2011-01-07   -0.304252
2011-01-08   -0.846848
2011-01-10   -1.542524
2011-01-12   -1.034264
dtype: float64

In [70]:
ts.resample('D').asfreq()

2011-01-02   -0.355467
2011-01-03         NaN
2011-01-04         NaN
2011-01-05    1.254204
2011-01-06         NaN
2011-01-07   -0.304252
2011-01-08   -0.846848
2011-01-09         NaN
2011-01-10   -1.542524
2011-01-11         NaN
2011-01-12   -1.034264
Freq: D, dtype: float64

### 10.3.1 날짜 범위 생성하기

In [71]:
# With only start and end given, date_range produces one timestamp per calendar day
# and includes both endpoints.
index = pd.date_range(start='20120104', end='20120601')

In [72]:
index

DatetimeIndex(['2012-01-04', '2012-01-05', '2012-01-06', '2012-01-07',
               '2012-01-08', '2012-01-09', '2012-01-10', '2012-01-11',
               '2012-01-12', '2012-01-13',
               ...
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
               '2012-05-27', '2012-05-28', '2012-05-29', '2012-05-30',
               '2012-05-31', '2012-06-01'],
              dtype='datetime64[ns]', length=150, freq='D')

In [73]:
pd.date_range(start='20120401', periods=20)

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')

In [74]:
# Last business day of each month inside the range. An offset object is passed
# instead of the 'BM' string alias, which pandas 2.2 deprecated in favour of 'BME';
# the object form behaves the same on old and new pandas releases.
pd.date_range('20180101', '20180605', freq=pd.offsets.BMonthEnd())

DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-30', '2018-04-30',
               '2018-05-31'],
              dtype='datetime64[ns]', freq='BM')

In [76]:
# Note: domestic (Korean) public holidays are not checked, so a month's last
# "business day" may still fall on a local holiday.
# An offset object replaces the 'BM' string alias, which pandas 2.2 deprecated
# in favour of 'BME'; the object form behaves the same on old and new releases.
pd.date_range('20140101', '20140605', freq=pd.offsets.BMonthEnd())

DatetimeIndex(['2014-01-31', '2014-02-28', '2014-03-31', '2014-04-30',
               '2014-05-30'],
              dtype='datetime64[ns]', freq='BM')

### 10.3.2 빈도와 날짜 오프셋

In [78]:
from pandas.tseries.offsets import Hour, Minute

In [79]:
hour = Hour()

In [80]:
hour

<Hour>

In [81]:
four_hour = Hour(4)

In [82]:
four_hour

<4 * Hours>

In [83]:
pd.date_range('20180604', '20180605', freq='4h')

DatetimeIndex(['2018-06-04 00:00:00', '2018-06-04 04:00:00',
               '2018-06-04 08:00:00', '2018-06-04 12:00:00',
               '2018-06-04 16:00:00', '2018-06-04 20:00:00',
               '2018-06-05 00:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [84]:
# 'WOM-3FRI' = week-of-month anchor: the third Friday of every month in the range.
rng = pd.date_range(start='20180101', end='20180801', freq='WOM-3FRI')

In [85]:
rng

DatetimeIndex(['2018-01-19', '2018-02-16', '2018-03-16', '2018-04-20',
               '2018-05-18', '2018-06-15', '2018-07-20'],
              dtype='datetime64[ns]', freq='WOM-3FRI')

### 10.3.3 데이터 시프트

In [87]:
# Four month-end observations starting with January 2018. MonthEnd() is passed
# as an offset object because the 'M' string alias is deprecated since pandas 2.2
# (renamed to 'ME'); the object form works on both old and new releases.
ts = Series(np.random.randn(4),
            index=pd.date_range('20180101', periods=4,
                                freq=pd.offsets.MonthEnd()))

In [88]:
ts

2018-01-31    1.992777
2018-02-28    0.591678
2018-03-31   -0.409012
2018-04-30   -0.062399
Freq: M, dtype: float64

In [89]:
ts.shift(2)

2018-01-31         NaN
2018-02-28         NaN
2018-03-31    1.992777
2018-04-30    0.591678
Freq: M, dtype: float64

In [90]:
# Passing freq moves the index labels (here by two month-ends) instead of sliding
# the values, so no NaN is introduced. The MonthEnd() offset object replaces the
# 'M' alias, which pandas 2.2 deprecated in favour of 'ME'.
ts.shift(2, freq=pd.offsets.MonthEnd())

2018-03-31    1.992777
2018-04-30    0.591678
2018-05-31   -0.409012
2018-06-30   -0.062399
Freq: M, dtype: float64

In [92]:
ts.shift(3, freq='D')

2018-02-03    1.992777
2018-03-03    0.591678
2018-04-03   -0.409012
2018-05-03   -0.062399
dtype: float64

In [93]:
ts.shift(1, freq='3D')

2018-02-03    1.992777
2018-03-03    0.591678
2018-04-03   -0.409012
2018-05-03   -0.062399
dtype: float64

In [94]:
# Move every index label forward by 90 minutes. 'min' is used instead of the
# single-letter 'T' alias, which pandas 2.2 deprecated; 'min' is accepted by
# older releases as well.
ts.shift(1, freq='90min')

2018-01-31 01:30:00    1.992777
2018-02-28 01:30:00    0.591678
2018-03-31 01:30:00   -0.409012
2018-04-30 01:30:00   -0.062399
Freq: M, dtype: float64

#### 오프셋만큼 날짜 시프트하기

In [95]:
from pandas.tseries.offsets import Day, MonthEnd

In [96]:
now = datetime(2018, 6, 5)

In [97]:
now + 3 * Day()

Timestamp('2018-06-08 00:00:00')

In [98]:
now + MonthEnd()

Timestamp('2018-06-30 00:00:00')

In [99]:
now + MonthEnd(2)

Timestamp('2018-07-31 00:00:00')

In [100]:
offset = MonthEnd()

In [101]:
offset

<MonthEnd>

In [102]:
offset.rollforward(now)

Timestamp('2018-06-30 00:00:00')

In [103]:
offset.rollback(now)

Timestamp('2018-05-31 00:00:00')

In [106]:
# Twenty observations spaced four days apart, starting on 2018-01-15.
# The canonical uppercase alias '4D' is used (it is what pandas itself reports as
# "Freq: 4D"); lowercase day aliases are deprecated in recent pandas releases.
ts = Series(np.random.randn(20),
            index=pd.date_range('20180115', periods=20, freq='4D'))

In [107]:
ts

2018-01-15   -0.356301
2018-01-19    0.097275
2018-01-23    0.356378
2018-01-27   -0.841419
2018-01-31    0.025894
2018-02-04   -0.794029
2018-02-08   -0.987814
2018-02-12    0.556544
2018-02-16   -0.093142
2018-02-20    0.629334
2018-02-24    1.140887
2018-02-28   -0.600543
2018-03-04   -0.993529
2018-03-08    1.067871
2018-03-12    0.011895
2018-03-16    0.622868
2018-03-20    0.934614
2018-03-24   -0.170221
2018-03-28   -0.028000
2018-04-01   -0.931407
Freq: 4D, dtype: float64

In [108]:
ts.groupby(offset.rollforward).mean()

2018-01-31   -0.143635
2018-02-28   -0.021252
2018-03-31    0.206500
2018-04-30   -0.931407
dtype: float64

In [110]:
# Monthly mean labelled by month end, the resample counterpart of grouping by
# offset.rollforward. The MonthEnd() offset object replaces the 'M' alias, which
# pandas 2.2 deprecated in favour of 'ME'.
ts.resample(pd.offsets.MonthEnd()).mean()

2018-01-31   -0.143635
2018-02-28   -0.021252
2018-03-31    0.206500
2018-04-30   -0.931407
Freq: M, dtype: float64