# Pandas for beer - Drinking patterns in São Paulo

In [177]:
%matplotlib inline

In [1]:
import pandas as pd


# Reading data

Pandas has a fantastic ability to read data files

# Old way

In [27]:
import csv

# The csv module documentation asks for files to be opened with newline=''
# so the reader itself handles line endings (including newlines inside
# quoted fields) instead of the file object's newline translation.
with open('data/Consumo_cerveja.csv', newline='') as f:
    # Each row becomes a mapping of column header -> raw string value;
    # nothing is converted to a number or a date.
    data = list(csv.DictReader(f))

In [34]:
# Peek at the first record: every value is still a raw string.
data[0]

OrderedDict([('Data', '2015-01-01'),
             ('Temperatura Media (C)', '27,3'),
             ('Temperatura Minima (C)', '23,9'),
             ('Temperatura Maxima (C)', '32,5'),
             ('Precipitacao (mm)', '0'),
             ('Final de Semana', '0'),
             ('Consumo de cerveja (litros)', '25.461')])

# Pandas way

In [43]:
# read_csv opens, parses and collects the file in one call and returns a DataFrame.
df = pd.read_csv('data/Consumo_cerveja.csv')
# head() previews the first rows.
df.head()

Unnamed: 0,Data,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Consumo de cerveja (litros)
0,2015-01-01,273,239,325,0,0.0,25.461
1,2015-01-02,2702,245,335,0,0.0,28.972
2,2015-01-03,2482,224,299,0,1.0,30.814
3,2015-01-04,2398,215,286,12,1.0,29.799
4,2015-01-05,2382,21,283,0,0.0,28.9


Translate the column names from Portuguese to English.

In [96]:
# English labels for the seven CSV columns, listed in file order.
# The trailing comment on each entry is the original Portuguese header.
translated_names = [
    'date',         # Data
    # NOTE(review): "Media" is Portuguese for mean/average, not median; the
    # name is kept as-is because later cells select this column by it.
    'median_temp',  # Temperatura Media (C)
    'min_temp',     # Temperatura Minima (C)
    'max_temp',     # Temperatura Maxima (C)
    'precip',       # Precipitacao (mm)
    'weekend',      # Final de Semana
    'consumption',  # Consumo de cerveja (litros)
]

In [98]:
# header=0 marks the file's first row as the header being replaced;
# names= supplies the English labels to use instead.
df = pd.read_csv(
    'data/Consumo_cerveja.csv',
    header=0,
    names=translated_names,
)
df.head()

Unnamed: 0,date,median_temp,min_temp,max_temp,precip,weekend,consumption
0,2015-01-01,273,239,325,0,0.0,25.461
1,2015-01-02,2702,245,335,0,0.0,28.972
2,2015-01-03,2482,224,299,0,1.0,30.814
3,2015-01-04,2398,215,286,12,1.0,29.799
4,2015-01-05,2382,21,283,0,0.0,28.9


# Data types

In [99]:
# dtypes lists the type pandas inferred for each column.
df.dtypes

date            object
median_temp     object
min_temp        object
max_temp        object
precip          object
weekend        float64
consumption    float64
dtype: object

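The temperatures are definitely numbers, not `object` - what's wrong here? The file writes decimals with a comma (e.g. `27,3`), so pandas reads those columns as strings unless we tell it which characters are the decimal and thousands separators.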

In [102]:
# Re-read the file, this time telling the parser how numbers are written.
df = pd.read_csv(
    'data/Consumo_cerveja.csv',
    header=0,
    names=translated_names,
    decimal=',',    # values are written like '27,3'
    thousands='.',  # '.' is then a grouping mark, so '25.461' is read as 25461
)

In [103]:
# Re-check the inferred types: the numeric columns should now be float64.
df.dtypes

date            object
median_temp    float64
min_temp       float64
max_temp       float64
precip         float64
weekend        float64
consumption    float64
dtype: object

In [104]:
# Preview the first rows to confirm the values now parse as numbers.
df.head()

Unnamed: 0,date,median_temp,min_temp,max_temp,precip,weekend,consumption
0,2015-01-01,27.3,23.9,32.5,0.0,0.0,25461.0
1,2015-01-02,27.02,24.5,33.5,0.0,0.0,28972.0
2,2015-01-03,24.82,22.4,29.9,0.0,1.0,30814.0
3,2015-01-04,23.98,21.5,28.6,1.2,1.0,29799.0
4,2015-01-05,23.82,21.0,28.3,0.0,0.0,28900.0


In [105]:
# tail() shows the last rows - a quick way to spot junk at the end of a file.
df.tail()

Unnamed: 0,date,median_temp,min_temp,max_temp,precip,weekend,consumption
936,,,,,,,
937,,,,,,,
938,,,,,,,
939,,,,,,,
940,,,,,,,


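Looks like some dirty data - what's gone wrong here? The file has empty rows after the real data: `info()` below reports 941 entries but only 365 non-null values per column.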

In [79]:
# info() reports the total row count and the non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941 entries, 0 to 940
Data columns (total 7 columns):
date           365 non-null object
median_temp    365 non-null float64
min_temp       365 non-null float64
max_temp       365 non-null float64
precip         365 non-null float64
weekend        365 non-null float64
consumption    365 non-null object
dtypes: float64(5), object(2)
memory usage: 51.5+ KB


In [80]:
# describe() summarises the numeric columns: count, mean, std, min, quartiles, max.
df.describe()

Unnamed: 0,median_temp,min_temp,max_temp,precip,weekend
count,365.0,365.0,365.0,365.0,365.0
mean,21.226356,17.46137,26.611507,5.196712,0.284932
std,3.180108,2.826185,4.317366,12.417844,0.452001
min,12.9,10.6,14.5,0.0,0.0
25%,19.02,15.3,23.8,0.0,0.0
50%,21.38,17.9,26.9,0.0,0.0
75%,23.28,19.6,29.4,3.2,1.0
max,28.86,24.5,36.5,94.8,1.0


In [109]:
# Final load: translated headers, locale-aware number parsing, real dates,
# and only the rows that actually hold data.
df = pd.read_csv(
    'data/Consumo_cerveja.csv',
    header=0,
    names=translated_names,
    decimal=',',
    thousands='.',
    parse_dates=['date'],  # turn the date strings into timestamps
    nrows=365,             # only the first 365 rows are non-empty
)

# Indexing

In [110]:
# Square brackets with a single label select one column as a Series.
df['median_temp']

0      27.30
1      27.02
2      24.82
3      23.98
4      23.82
5      23.78
6      24.00
7      24.90
8      28.20
9      26.76
10     27.62
11     25.96
12     25.52
13     25.96
14     25.86
15     26.50
16     28.86
17     28.26
18     28.22
19     27.68
20     25.32
21     21.74
22     21.04
23     23.12
24     24.40
25     22.40
26     23.60
27     25.68
28     25.00
29     22.80
       ...  
335    22.10
336    22.44
337    22.76
338    24.80
339    23.12
340    20.04
341    21.70
342    23.96
343    24.00
344    24.04
345    23.92
346    24.54
347    26.28
348    25.66
349    22.04
350    23.32
351    26.42
352    23.74
353    22.84
354    23.12
355    24.60
356    27.46
357    24.72
358    23.58
359    23.34
360    24.00
361    22.64
362    21.68
363    21.38
364    24.76
Name: median_temp, Length: 365, dtype: float64

In [111]:
# A list of labels selects several columns and returns a DataFrame.
df[['median_temp', 'max_temp']]

Unnamed: 0,median_temp,max_temp
0,27.30,32.5
1,27.02,33.5
2,24.82,29.9
3,23.98,28.6
4,23.82,28.3
5,23.78,30.5
6,24.00,33.7
7,24.90,32.8
8,28.20,34.0
9,26.76,34.2


In [114]:
# .loc indexes by label as [rows, columns]; ':' means every row.
df.loc[:, 'median_temp']

0      27.30
1      27.02
2      24.82
3      23.98
4      23.82
5      23.78
6      24.00
7      24.90
8      28.20
9      26.76
10     27.62
11     25.96
12     25.52
13     25.96
14     25.86
15     26.50
16     28.86
17     28.26
18     28.22
19     27.68
20     25.32
21     21.74
22     21.04
23     23.12
24     24.40
25     22.40
26     23.60
27     25.68
28     25.00
29     22.80
       ...  
335    22.10
336    22.44
337    22.76
338    24.80
339    23.12
340    20.04
341    21.70
342    23.96
343    24.00
344    24.04
345    23.92
346    24.54
347    26.28
348    25.66
349    22.04
350    23.32
351    26.42
352    23.74
353    22.84
354    23.12
355    24.60
356    27.46
357    24.72
358    23.58
359    23.34
360    24.00
361    22.64
362    21.68
363    21.38
364    24.76
Name: median_temp, Length: 365, dtype: float64

In [115]:
# One row label plus one column label returns a single scalar value.
df.loc[0, 'median_temp']

27.300000000000001

In [116]:
# One row label with ':' for the columns returns that whole row as a Series.
df.loc[0, :]

date           2015-01-01 00:00:00
median_temp                   27.3
min_temp                      23.9
max_temp                      32.5
precip                           0
weekend                          0
consumption                  25461
Name: 0, dtype: object

In [121]:
# One row label with a list of column labels returns just those fields of the row.
df.loc[0, ['median_temp', 'min_temp']]

median_temp    27.3
min_temp       23.9
Name: 0, dtype: object

In [119]:
# .iloc indexes by integer position: row 0, column 1 (median_temp).
df.iloc[0, 1]

27.300000000000001

In [120]:
# A list of positions works too: columns 1 and 2 (median_temp, min_temp) of row 0.
df.iloc[0, [1, 2]]

median_temp    27.3
min_temp       23.9
Name: 0, dtype: object

# Boolean indexing

In [128]:
# Comparing a column with a scalar yields a True/False Series (a mask);
# indexing the frame with that mask keeps only the rows marked True.
min_above_23 = df['min_temp'] > 23
df[min_above_23]

Unnamed: 0,date,median_temp,min_temp,max_temp,precip,weekend,consumption
0,2015-01-01,27.3,23.9,32.5,0.0,0,25461
1,2015-01-02,27.02,24.5,33.5,0.0,0,28972
17,2015-01-18,28.26,23.4,35.6,0.0,1,30524
19,2015-01-20,27.68,23.3,35.6,0.6,0,35127
42,2015-02-12,27.66,23.1,32.7,0.0,0,26389
322,2015-11-19,26.16,23.3,30.4,0.0,0,22960


In [129]:
# Build the mask first, then use it to keep only the weekend rows.
is_weekend = df['weekend'] == 1
df[is_weekend]

Unnamed: 0,date,median_temp,min_temp,max_temp,precip,weekend,consumption
2,2015-01-03,24.82,22.4,29.9,0.0,1,30814
3,2015-01-04,23.98,21.5,28.6,1.2,1,29799
9,2015-01-10,26.76,22.1,34.2,0.0,1,37937
10,2015-01-11,27.62,22.2,34.8,3.4,1,36254
16,2015-01-17,28.86,22.0,35.8,0.0,1,37690
17,2015-01-18,28.26,23.4,35.6,0.0,1,30524
23,2015-01-24,23.12,19.0,29.4,13.0,1,28348
24,2015-01-25,24.40,18.1,30.0,0.0,1,31088
30,2015-01-31,21.64,18.5,24.3,0.2,1,27030
31,2015-02-01,24.16,20.6,28.0,0.0,1,32057


In [131]:
# Combine masks element-wise with & (and). Each comparison needs its own
# parentheses because & binds more tightly than == and >.
df[(df['weekend'] == 1) & (df['min_temp'] > 23)]

Unnamed: 0,date,median_temp,min_temp,max_temp,precip,weekend,consumption
17,2015-01-18,28.26,23.4,35.6,0.0,1,30524


In [134]:
# | (or) keeps rows that satisfy either condition; the parentheses are
# required for the same precedence reason as with &.
df[(df['min_temp'] > 23) | (df['weekend'] == 1)]

Unnamed: 0,date,median_temp,min_temp,max_temp,precip,weekend,consumption
0,2015-01-01,27.30,23.9,32.5,0.0,0,25461
1,2015-01-02,27.02,24.5,33.5,0.0,0,28972
2,2015-01-03,24.82,22.4,29.9,0.0,1,30814
3,2015-01-04,23.98,21.5,28.6,1.2,1,29799
9,2015-01-10,26.76,22.1,34.2,0.0,1,37937
10,2015-01-11,27.62,22.2,34.8,3.4,1,36254
16,2015-01-17,28.86,22.0,35.8,0.0,1,37690
17,2015-01-18,28.26,23.4,35.6,0.0,1,30524
19,2015-01-20,27.68,23.3,35.6,0.6,0,35127
23,2015-01-24,23.12,19.0,29.4,13.0,1,28348


# Operations

In [154]:
# Take every row of the two temperature-extreme columns to experiment on.
temperature_columns = ['min_temp', 'max_temp']
temperatures = df.loc[:, temperature_columns]

In [155]:
# With no axis argument, mean() reduces down the rows: one value per column.
temperatures.mean()

min_temp    17.461370
max_temp    26.611507
dtype: float64

In [156]:
# axis='index' spells out the same reduction; the result matches the call above.
temperatures.mean(axis='index')

min_temp    17.461370
max_temp    26.611507
dtype: float64

In [157]:
# axis='columns' averages across the columns of each row: one value per row.
temperatures.mean(axis='columns')

0      28.20
1      29.00
2      26.15
3      25.05
4      24.65
5      25.30
6      26.60
7      26.15
8      27.95
9      28.15
10     28.50
11     28.40
12     28.00
13     27.65
14     27.15
15     27.50
16     28.90
17     29.50
18     29.60
19     29.45
20     26.80
21     22.65
22     22.30
23     24.20
24     24.05
25     23.80
26     24.60
27     25.00
28     25.35
29     23.90
       ...  
335    23.80
336    23.15
337    24.05
338    25.05
339    24.30
340    20.95
341    23.00
342    23.75
343    24.95
344    25.50
345    25.70
346    25.15
347    26.80
348    26.70
349    22.75
350    23.80
351    26.55
352    25.55
353    24.30
354    24.35
355    26.00
356    27.25
357    26.00
358    24.40
359    23.80
360    24.65
361    23.90
362    22.20
363    20.85
364    24.60
Length: 365, dtype: float64

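Where did the mean go? `mean()` returns a new object and leaves `temperatures` unchanged, so the result has to be assigned to a column if we want to keep it.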

In [160]:
temperatures

Unnamed: 0,min_temp,max_temp,mean
0,23.9,32.5,28.20
1,24.5,33.5,29.00
2,22.4,29.9,26.15
3,21.5,28.6,25.05
4,21.0,28.3,24.65
5,20.1,30.5,25.30
6,19.5,33.7,26.60
7,19.5,32.8,26.15
8,21.9,34.0,27.95
9,22.1,34.2,28.15


In [159]:
temperatures['mean'] = temperatures.mean(axis='columns')

How far is the mean of the min and max temperatures from the `median_temp` column?

In [161]:
# Subtracting two Series works element-wise, matching rows by index label.
df['median_temp'] - temperatures['mean']

0     -0.90
1     -1.98
2     -1.33
3     -1.07
4     -0.83
5     -1.52
6     -2.60
7     -1.25
8      0.25
9     -1.39
10    -0.88
11    -2.44
12    -2.48
13    -1.69
14    -1.29
15    -1.00
16    -0.04
17    -1.24
18    -1.38
19    -1.77
20    -1.48
21    -0.91
22    -1.26
23    -1.08
24     0.35
25    -1.40
26    -1.00
27     0.68
28    -0.35
29    -1.10
       ... 
335   -1.70
336   -0.71
337   -1.29
338   -0.25
339   -1.18
340   -0.91
341   -1.30
342    0.21
343   -0.95
344   -1.46
345   -1.78
346   -0.61
347   -0.52
348   -1.04
349   -0.71
350   -0.48
351   -0.13
352   -1.81
353   -1.46
354   -1.23
355   -1.40
356    0.21
357   -1.28
358   -0.82
359   -0.46
360   -0.65
361   -1.26
362   -0.52
363    0.53
364    0.16
Length: 365, dtype: float64