In [46]:
import pandas as pd
import numpy as np
# Load the beer-consumption CSV. Numbers are parsed with ',' as the decimal
# mark and '.' as the thousands separator; the file's own header row (row 0)
# is replaced by the short column names below, and only the first 365 data
# rows are read.
column_names = ['date', 'median_temp', 'min_temp', 'max_temp',
                'precip', 'weekend', 'consumption']
df = pd.read_csv(
    'data/Consumo_cerveja.csv',
    header=0,
    names=column_names,
    parse_dates=['date'],
    decimal=',',
    thousands='.',
    nrows=365,
)


# Mapping

In [47]:
# Season name -> list of calendar month numbers belonging to it.
# NOTE(review): Dec-Feb is labelled "winter" here (Northern Hemisphere
# convention) - confirm this matches the location the data was recorded in.
season = dict(
    winter=[12, 1, 2],
    spring=[3, 4, 5],
    summer=[6, 7, 8],
    autumn=[9, 10, 11],
)

In [48]:
# Invert `season` into a flat lookup: month number -> season name.
season_map = dict(
    (month, season_name)
    for season_name, months in season.items()
    for month in months
)
season_map

{12: 'winter',
 1: 'winter',
 2: 'winter',
 3: 'spring',
 4: 'spring',
 5: 'spring',
 6: 'summer',
 7: 'summer',
 8: 'summer',
 9: 'autumn',
 10: 'autumn',
 11: 'autumn'}

In [49]:
# Label each row with a season by looking up the month of its date in season_map.
df['season'] = df['date'].dt.month.map(season_map)

In [50]:
# Number of rows that fall into each season.
df['season'].value_counts()

summer    92
spring    92
autumn    91
winter    90
Name: season, dtype: int64

# Where

In [52]:
# Two-way label: rows whose consumption is strictly below the threshold are
# 'low', every other row is 'high'.
LOW_CONSUMPTION_THRESHOLD = 25000
df['consumption_bin'] = np.where(
    df['consumption'] < LOW_CONSUMPTION_THRESHOLD, 'low', 'high'
)
# Preview only the first rows instead of rendering the whole 365-row frame.
df.head(10)

Unnamed: 0,date,median_temp,min_temp,max_temp,precip,weekend,consumption,season,consumption_bin
0,2015-01-01,27.30,23.9,32.5,0.0,0,25461,winter,high
1,2015-01-02,27.02,24.5,33.5,0.0,0,28972,winter,high
2,2015-01-03,24.82,22.4,29.9,0.0,1,30814,winter,high
3,2015-01-04,23.98,21.5,28.6,1.2,1,29799,winter,high
4,2015-01-05,23.82,21.0,28.3,0.0,0,28900,winter,high
5,2015-01-06,23.78,20.1,30.5,12.2,0,28218,winter,high
6,2015-01-07,24.00,19.5,33.7,0.0,0,29732,winter,high
7,2015-01-08,24.90,19.5,32.8,48.6,0,28397,winter,high
8,2015-01-09,28.20,21.9,34.0,4.4,0,24886,winter,low
9,2015-01-10,26.76,22.1,34.2,0.0,1,37937,winter,high


In [61]:
# Split consumption into two equal-width bins labelled low/high.
# The result is only displayed; nothing is stored on df.
pd.cut(df['consumption'], bins=2, labels=['low', 'high'])

0       low
1      high
2      high
3      high
4      high
5      high
6      high
7      high
8       low
9      high
10     high
11      low
12     high
13     high
14      low
15     high
16     high
17     high
18     high
19     high
20     high
21      low
22      low
23     high
24     high
25      low
26     high
27      low
28      low
29     high
       ... 
335    high
336    high
337    high
338    high
339    high
340     low
341    high
342    high
343     low
344    high
345    high
346    high
347    high
348    high
349     low
350     low
351    high
352    high
353    high
354     low
355    high
356    high
357    high
358    high
359     low
360    high
361     low
362     low
363     low
364     low
Name: consumption, Length: 365, dtype: category
Categories (2, object): [low < high]

In [64]:
# Quantile-based grouping: three equal-frequency bins of consumption,
# labelled low / medium / high, stored as a new column.
df['consumption_group'] = pd.qcut(
    df['consumption'],
    q=3,
    labels=['low', 'medium', 'high'],
)

# get_dummies

In [67]:
# One-hot encode df with the default column selection (no explicit `columns`).
pd.get_dummies(data=df)

Unnamed: 0,date,median_temp,min_temp,max_temp,precip,weekend,consumption,season_autumn,season_spring,season_summer,season_winter,consumption_bin_high,consumption_bin_low,consumption_group_low,consumption_group_medium,consumption_group_high
0,2015-01-01,27.30,23.9,32.5,0.0,0,25461,0,0,0,1,1,0,0,1,0
1,2015-01-02,27.02,24.5,33.5,0.0,0,28972,0,0,0,1,1,0,0,0,1
2,2015-01-03,24.82,22.4,29.9,0.0,1,30814,0,0,0,1,1,0,0,0,1
3,2015-01-04,23.98,21.5,28.6,1.2,1,29799,0,0,0,1,1,0,0,0,1
4,2015-01-05,23.82,21.0,28.3,0.0,0,28900,0,0,0,1,1,0,0,0,1
5,2015-01-06,23.78,20.1,30.5,12.2,0,28218,0,0,0,1,1,0,0,0,1
6,2015-01-07,24.00,19.5,33.7,0.0,0,29732,0,0,0,1,1,0,0,0,1
7,2015-01-08,24.90,19.5,32.8,48.6,0,28397,0,0,0,1,1,0,0,0,1
8,2015-01-09,28.20,21.9,34.0,4.4,0,24886,0,0,0,1,0,1,0,1,0
9,2015-01-10,26.76,22.1,34.2,0.0,1,37937,0,0,0,1,1,0,0,0,1


In [69]:
# One-hot encode only the two listed columns, dropping the first level of each
# so that k categories produce k-1 indicator columns.
pd.get_dummies(
    data=df,
    columns=['season', 'consumption_group'],
    drop_first=True,
)

Unnamed: 0,date,median_temp,min_temp,max_temp,precip,weekend,consumption,consumption_bin,season_spring,season_summer,season_winter,consumption_group_medium,consumption_group_high
0,2015-01-01,27.30,23.9,32.5,0.0,0,25461,high,0,0,1,1,0
1,2015-01-02,27.02,24.5,33.5,0.0,0,28972,high,0,0,1,0,1
2,2015-01-03,24.82,22.4,29.9,0.0,1,30814,high,0,0,1,0,1
3,2015-01-04,23.98,21.5,28.6,1.2,1,29799,high,0,0,1,0,1
4,2015-01-05,23.82,21.0,28.3,0.0,0,28900,high,0,0,1,0,1
5,2015-01-06,23.78,20.1,30.5,12.2,0,28218,high,0,0,1,0,1
6,2015-01-07,24.00,19.5,33.7,0.0,0,29732,high,0,0,1,0,1
7,2015-01-08,24.90,19.5,32.8,48.6,0,28397,high,0,0,1,0,1
8,2015-01-09,28.20,21.9,34.0,4.4,0,24886,low,0,0,1,1,0
9,2015-01-10,26.76,22.1,34.2,0.0,1,37937,high,0,0,1,0,1


In [74]:
# Difference between each consumption value and the one in the previous row
# (the first row has no predecessor, so it is NaN).
df['consumption'].diff()

0          NaN
1       3511.0
2       1842.0
3      -1015.0
4       -899.0
5       -682.0
6       1514.0
7      -1335.0
8      -3511.0
9      13051.0
10     -1683.0
11    -10511.0
12      1247.0
13      4835.0
14     -6101.0
15      4214.0
16      7752.0
17     -7166.0
18     -1259.0
19      5862.0
20     -5997.0
21     -3335.0
22     -4011.0
23      6564.0
24      2740.0
25     -9568.0
26      8452.0
27     -7369.0
28        93.0
29      4149.0
        ...   
335     5942.0
336    -2066.0
337     1108.0
338     2938.0
339      329.0
340    -9405.0
341     4338.0
342     -576.0
343    -4204.0
344     7807.0
345    -1161.0
346     -391.0
347    -1057.0
348      486.0
349    -7555.0
350     3275.0
351     2705.0
352     5494.0
353    -2409.0
354    -5293.0
355     1994.0
356     -360.0
357     5104.0
358    -5264.0
359    -4353.0
360    10352.0
361    -6212.0
362    -3786.0
363    -1842.0
364     1979.0
Name: consumption, Length: 365, dtype: float64

In [73]:
# Fractional change of consumption relative to the value 7 rows earlier
# (the first 7 rows have no reference value and are NaN).
df['consumption'].pct_change(periods=7)

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
5           NaN
6           NaN
7      0.115314
8     -0.141033
9      0.231161
10     0.216618
11    -0.109239
12    -0.043518
13     0.070396
14    -0.094130
15     0.203006
16    -0.006511
17    -0.158052
18     0.136814
19     0.301482
20    -0.084682
21     0.002760
22    -0.272363
23    -0.247864
24     0.018477
25    -0.264651
26    -0.146753
27    -0.224065
28    -0.120140
29     0.232326
         ...   
335    0.213066
336    0.123393
337    0.018427
338   -0.056163
339    0.070647
340    0.155005
341    0.129806
342   -0.109416
343   -0.192642
344    0.041575
345   -0.088503
346   -0.109579
347    0.203465
348    0.032620
349   -0.223864
350    0.061222
351   -0.120299
352    0.099970
353    0.032171
354   -0.117202
355   -0.062515
356    0.256671
357    0.297284
358   -0.027143
359   -0.325209
360    0.072360
361    0.050777
362   -0.168443
363   -0.226727
364   -0.289054
Name: consumption, Length: 365, dtype: float64

In [82]:
# Move every column down by one row; the first row is filled with missing values.
df.shift(periods=1)

Unnamed: 0,date,median_temp,min_temp,max_temp,precip,weekend,consumption,season,consumption_bin,consumption_group
0,NaT,,,,,,,,,
1,2015-01-01,27.30,23.9,32.5,0.0,0.0,25461.0,winter,high,medium
2,2015-01-02,27.02,24.5,33.5,0.0,0.0,28972.0,winter,high,high
3,2015-01-03,24.82,22.4,29.9,0.0,1.0,30814.0,winter,high,high
4,2015-01-04,23.98,21.5,28.6,1.2,1.0,29799.0,winter,high,high
5,2015-01-05,23.82,21.0,28.3,0.0,0.0,28900.0,winter,high,high
6,2015-01-06,23.78,20.1,30.5,12.2,0.0,28218.0,winter,high,high
7,2015-01-07,24.00,19.5,33.7,0.0,0.0,29732.0,winter,high,high
8,2015-01-08,24.90,19.5,32.8,48.6,0.0,28397.0,winter,high,high
9,2015-01-09,28.20,21.9,34.0,4.4,0.0,24886.0,winter,low,medium


In [76]:
# First five consumption values, for comparison with the shifted versions.
df['consumption'].head()

0    25461
1    28972
2    30814
3    29799
4    28900
Name: consumption, dtype: int64

In [81]:
# Lag table: column t_0 holds the current consumption value and t_-k the
# value k rows earlier, for k = 1..4.
lagged_columns = [
    df['consumption'].shift(lag).rename(f't_{-lag}')
    for lag in range(5)
]
pd.concat(lagged_columns, axis=1)

Unnamed: 0,t_0,t_-1,t_-2,t_-3,t_-4
0,25461,,,,
1,28972,25461.0,,,
2,30814,28972.0,25461.0,,
3,29799,30814.0,28972.0,25461.0,
4,28900,29799.0,30814.0,28972.0,25461.0
5,28218,28900.0,29799.0,30814.0,28972.0
6,29732,28218.0,28900.0,29799.0,30814.0
7,28397,29732.0,28218.0,28900.0,29799.0
8,24886,28397.0,29732.0,28218.0,28900.0
9,37937,24886.0,28397.0,29732.0,28218.0
