<a href="https://colab.research.google.com/github/anicelysantos/book-python-para-analise-de-dados/blob/main/series_temporais.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Estudos do livro "Python para análise de dados" cap. 11*

# **Imports**

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse

# **Tipos de dados e ferramentas para data e hora**

In [3]:
# Capture the current local date and time; the bare name on the last line
# makes the notebook display the datetime's repr.
now = datetime.now()
now

datetime.datetime(2021, 9, 22, 11, 41, 41, 665579)

In [4]:
# Display the individual date fields day-first (day, month, year).
# The year-first ordering would be: now.year, now.month, now.day
now.day, now.month, now.year

(22, 9, 2021)

In [5]:
# timedelta is imported at the top of the notebook.
# A timedelta represents the difference in time between two datetime objects.

# Subtracting one datetime from another produces a timedelta.
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
delta

datetime.timedelta(days=926, seconds=56700)

In [6]:
# Whole-day component of the timedelta.
delta.days

926

In [7]:
# Seconds left over beyond the whole days (not the total duration in seconds).
delta.seconds

56700

In [8]:
# The first positional argument of timedelta is a number of days, so this
# shifts the start date 12 days forward.
start = datetime(2011,1,7)
start + timedelta(12)

datetime.datetime(2011, 1, 19, 0, 0)

In [9]:
# A timedelta can be multiplied by an integer: this moves 24 days back.
start - 2 * timedelta(12)


datetime.datetime(2010, 12, 14, 0, 0)

**Conversão entre string e datetime**

In [10]:
# str() on a datetime gives its 'YYYY-MM-DD HH:MM:SS' text form.
stamp = datetime(2011, 1, 3)
str(stamp)

'2011-01-03 00:00:00'

In [11]:
# Format the datetime as a 'YYYY-MM-DD' date string.
# The explicit %Y-%m-%d directives are used instead of the '%F' shorthand:
# '%F' is a C-library extension rather than one of the format codes Python
# documents as portable, so it is not guaranteed to work on every platform
# (e.g. Windows). Where '%F' is supported, both produce the same text.
stamp.strftime('%Y-%m-%d')

'2011-01-03'

In [12]:
# ISO-formatted date string used as parsing input below.
value = '2011-01-03'

In [13]:
# Parse the string into a datetime using an explicit format specification.
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [14]:
# Month/day/year date strings, each parsed with the same explicit format.
datestrs = [
    '9/19/2021',
    '9/20/2020',
]
[datetime.strptime(date_text, '%m/%d/%Y') for date_text in datestrs]

[datetime.datetime(2021, 9, 19, 0, 0), datetime.datetime(2020, 9, 20, 0, 0)]

In [15]:
# dateutil's parse (imported at the top of the notebook) infers the date
# format on its own, without a format string.
parse('21-09-2021')

datetime.datetime(2021, 9, 21, 0, 0)

In [16]:
# Textual month names and 12-hour clock times are understood as well.
parse('Sep 21, 2021 09:02 AM')

datetime.datetime(2021, 9, 21, 9, 2)

In [17]:
# dayfirst=True reads the first number as the day (the convention in Brazil).
parse('21/09/2021', dayfirst=True)

datetime.datetime(2021, 9, 21, 0, 0)

In [18]:
# pd.to_datetime converts a list of date strings into a DatetimeIndex.
# NOTE(review): `datastrs` differs by a single letter from the `datestrs`
# list defined earlier; both names stay bound, so take care not to mix them up.
datastrs = ['2021-09-21 09:04:00', '2021-09-21 09:05:00']
pd.to_datetime(datastrs)

DatetimeIndex(['2021-09-21 09:04:00', '2021-09-21 09:05:00'], dtype='datetime64[ns]', freq=None)

In [19]:
# Handling missing data: a None entry is accepted and becomes NaT in the
# resulting index. This uses `datestrs` (the month/day/year strings).
idx = pd.to_datetime(datestrs + [None])
idx

DatetimeIndex(['2021-09-19', '2020-09-20', 'NaT'], dtype='datetime64[ns]', freq=None)

In [20]:
# NaT (Not a Time) is pandas' null value for date/time (timestamp) data.
idx[2]

NaT

In [21]:
# Element-wise null check; only the NaT entry is reported as True.
pd.isnull(idx)

array([False, False,  True])

# **Básico sobre séries temporais**

In [22]:
# Six irregularly spaced days in January 2011 (datetime is imported at the
# top of the notebook).
dates = [
    datetime(2011, 1, day_of_month) for day_of_month in (2, 5, 7, 8, 10, 12)
]

In [24]:
# Series of six random normal values indexed by `dates`; pandas stores the
# datetime objects in a DatetimeIndex.
# NOTE(review): no random seed is set, so re-running this cell produces
# values different from the saved output.
ts = pd.Series(np.random.randn(6), index=dates)
ts

2011-01-02   -0.049043
2011-01-05    0.597614
2011-01-07   -0.897128
2011-01-08    0.042157
2011-01-10   -0.373992
2011-01-12    1.908332
dtype: float64

In [25]:
# The index is a DatetimeIndex with no fixed frequency (freq=None).
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [26]:
# [::2] selects every second element. The addition aligns on the dates, so
# dates absent from the slice come out as NaN.
ts + ts[::2]

2011-01-02   -0.098085
2011-01-05         NaN
2011-01-07   -1.794256
2011-01-08         NaN
2011-01-10   -0.747985
2011-01-12         NaN
dtype: float64

In [27]:
# The timestamps are stored as NumPy datetime64 values at nanosecond resolution.
ts.index.dtype

dtype('<M8[ns]')

In [29]:
# A scalar taken from a DatetimeIndex is a pandas Timestamp object.
# This rebinds `stamp`, which earlier held a plain datetime.
stamp = ts.index[0]
stamp

Timestamp('2011-01-02 00:00:00')

**Indexação, seleção e geração de subconjuntos**

In [31]:
# Look up a value using a Timestamp label taken from the index itself.
# NOTE(review): the saved output shows a Timestamp, which is what `stamp`
# alone would display; `ts[stamp]` should give the series value stored for
# that date — re-run the cell to refresh the output.
stamp = ts.index[2]
ts[stamp]

Timestamp('2011-01-07 00:00:00')

In [32]:
# A string that can be interpreted as a date also works as a label
# (read month-first here: 10 January 2011).
ts['1/10/2011']

-0.3739923245232917

In [33]:
# Same lookup using a compact YYYYMMDD string.
ts['20110110']

-0.3739923245232917

In [34]:
# With longer time series, passing only a year, or a year and month, selects
# the matching stretch of data.
# Build 1000 daily observations of random values starting on 2000-01-01.
longer_ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
longer_ts

2000-01-01    0.813868
2000-01-02   -0.554250
2000-01-03    0.095744
2000-01-04   -0.940219
2000-01-05    0.170331
                ...   
2002-09-22   -0.558318
2002-09-23    1.310893
2002-09-24    1.646768
2002-09-25   -1.336099
2002-09-26    1.658198
Freq: D, Length: 1000, dtype: float64

In [35]:
# A year string selects every observation in that year.
longer_ts['2001']

2001-01-01   -1.170078
2001-01-02    1.251340
2001-01-03   -0.373497
2001-01-04   -0.424516
2001-01-05   -0.177810
                ...   
2001-12-27    0.518535
2001-12-28    0.126788
2001-12-29   -2.686233
2001-12-30   -0.832038
2001-12-31   -1.159397
Freq: D, Length: 365, dtype: float64

In [36]:
# A year-month string selects only that month.
longer_ts['2001-05']

2001-05-01   -1.156609
2001-05-02    0.778430
2001-05-03    0.178365
2001-05-04   -0.187980
2001-05-05   -0.436687
2001-05-06    0.527802
2001-05-07    1.378438
2001-05-08   -0.715073
2001-05-09    1.454912
2001-05-10    1.162085
2001-05-11    2.118873
2001-05-12    0.990931
2001-05-13    0.513122
2001-05-14    0.063757
2001-05-15    1.218217
2001-05-16   -1.178627
2001-05-17    0.614867
2001-05-18   -0.084719
2001-05-19    1.614003
2001-05-20    0.860555
2001-05-21   -0.646499
2001-05-22    0.430948
2001-05-23    0.071378
2001-05-24    0.515691
2001-05-25    0.450819
2001-05-26   -0.542976
2001-05-27    0.099994
2001-05-28   -0.614994
2001-05-29    1.242413
2001-05-30   -0.690233
2001-05-31   -0.544892
Freq: D, dtype: float64

In [37]:
# Slice with a datetime object: everything from 2011-01-07 onward.
ts[datetime(2011, 1, 7):]

2011-01-07   -0.897128
2011-01-08    0.042157
2011-01-10   -0.373992
2011-01-12    1.908332
dtype: float64

In [38]:
# Show the full series again for comparison with the slices.
ts

2011-01-02   -0.049043
2011-01-05    0.597614
2011-01-07   -0.897128
2011-01-08    0.042157
2011-01-10   -0.373992
2011-01-12    1.908332
dtype: float64

In [39]:
# Slice endpoints do not need to be present in the index; they only bound
# the range of dates returned.
ts['1/6/2011':'1/11/2011']

2011-01-07   -0.897128
2011-01-08    0.042157
2011-01-10   -0.373992
dtype: float64

In [40]:
# truncate() also slices a series by date; after=... drops every observation
# later than the given date.
ts.truncate(after='1/9/2011')

2011-01-02   -0.049043
2011-01-05    0.597614
2011-01-07   -0.897128
2011-01-08    0.042157
dtype: float64

In [42]:
# 100 weekly dates falling on Wednesdays ('W-WED'), used as the row index of
# a four-column DataFrame of random values.
# Note: this rebinds `dates`, which previously held the January 2011 list.
dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
long_df = pd.DataFrame(np.random.randn(100,4), index=dates, columns=['Colorado', 'Texas', 'New York', 'Ohio'])

In [43]:
# Partial-string selection through .loc: all rows dated May 2001.
long_df.loc['5-2001']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,0.327403,-1.650751,0.489609,0.35309
2001-05-09,1.54882,-0.125562,-0.665977,-1.043577
2001-05-16,-1.279305,1.20158,0.292385,0.268549
2001-05-23,1.352893,0.333656,0.180314,1.06904
2001-05-30,0.390916,-0.117687,0.993996,1.114827


**Séries temporais com índices duplicados**

In [45]:
# Index in which the label 2000-01-02 appears three times, paired with the
# integers 0..4 as values.
dates = pd.DatetimeIndex(['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000'])
dup_ts = pd.Series(np.arange(len(dates)), index=dates)

In [46]:
# Display the series whose index contains repeated timestamps.
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int64

In [47]:
# is_unique is False here because 2000-01-02 occurs more than once.
dup_ts.index.is_unique

False

In [48]:
# A label that occurs only once returns a scalar.
dup_ts['1/3/2000']

4

In [49]:
# A duplicated label returns a Series holding every matching row.
dup_ts['1/2/2000']

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int64

In [50]:
# Aggregate the rows that share a timestamp: level=0 groups by the index
# values, giving one result per distinct date.
grouped = dup_ts.groupby(level=0)
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int64

In [51]:
# Number of observations for each distinct timestamp.
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

# **Intervalos de datas, frequências e deslocamentos**

In [52]:
# Redisplay the irregularly spaced series before resampling it.
ts

2011-01-02   -0.049043
2011-01-05    0.597614
2011-01-07   -0.897128
2011-01-08    0.042157
2011-01-10   -0.373992
2011-01-12    1.908332
dtype: float64

In [None]:
# The string 'D' stands for daily frequency.
# Only the resampler object is created and stored here; no aggregation
# method is called on it in this cell.
resampler = ts.resample('D')


**Gerando intervalos de datas**

In [55]:
# pd.date_range builds a DatetimeIndex of a definite length; given a start
# and an end date it produces one timestamp per day.
index = pd.date_range('2012-04-01', '2012-06-01')
index

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

In [58]:
# When only the start (or only the end) date is given, the number of periods
# must be supplied as well.
pd.date_range(start='2021-08-01', periods=20)

DatetimeIndex(['2021-08-01', '2021-08-02', '2021-08-03', '2021-08-04',
               '2021-08-05', '2021-08-06', '2021-08-07', '2021-08-08',
               '2021-08-09', '2021-08-10', '2021-08-11', '2021-08-12',
               '2021-08-13', '2021-08-14', '2021-08-15', '2021-08-16',
               '2021-08-17', '2021-08-18', '2021-08-19', '2021-08-20'],
              dtype='datetime64[ns]', freq='D')

In [59]:
# With only an end date, the 20 periods are counted backwards from it.
pd.date_range(end='2021-09-01', periods=20)

DatetimeIndex(['2021-08-13', '2021-08-14', '2021-08-15', '2021-08-16',
               '2021-08-17', '2021-08-18', '2021-08-19', '2021-08-20',
               '2021-08-21', '2021-08-22', '2021-08-23', '2021-08-24',
               '2021-08-25', '2021-08-26', '2021-08-27', '2021-08-28',
               '2021-08-29', '2021-08-30', '2021-08-31', '2021-09-01'],
              dtype='datetime64[ns]', freq='D')

In [63]:
# 'BM' (business month end) yields the last business day of each month in
# the requested range.
# NOTE(review): newer pandas releases deprecate the 'BM' alias in favor of
# 'BME' — confirm against the installed pandas version before changing it.
pd.date_range('2020-01-01','2021-01-01', freq='BM')

DatetimeIndex(['2020-01-31', '2020-02-28', '2020-03-31', '2020-04-30',
               '2020-05-29', '2020-06-30', '2020-07-31', '2020-08-31',
               '2020-09-30', '2020-10-30', '2020-11-30', '2020-12-31'],
              dtype='datetime64[ns]', freq='BM')

In [64]:
# The time of day in the start timestamp is carried into every generated date.
pd.date_range('2021-07-03 12:56:31', periods=5)

DatetimeIndex(['2021-07-03 12:56:31', '2021-07-04 12:56:31',
               '2021-07-05 12:56:31', '2021-07-06 12:56:31',
               '2021-07-07 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [65]:
# normalize=True resets the generated timestamps to midnight.
pd.date_range('2021-08-02 12:56:31', periods=5, normalize=True)

DatetimeIndex(['2021-08-02', '2021-08-03', '2021-08-04', '2021-08-05',
               '2021-08-06'],
              dtype='datetime64[ns]', freq='D')

**Frequências e offset de datas**