In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/national_covid19_age_range.csv")
df.head()

Unnamed: 0,date,age_range,sex,cases_confirmed,hospitalized,icu,deceases
0,2020-03-23,0-9,both,129,34,1,0
1,2020-03-23,10-19,both,221,15,0,1
2,2020-03-23,20-29,both,1285,183,8,4
3,2020-03-23,30-39,both,2208,365,15,3
4,2020-03-23,40-49,both,2919,663,40,9


In [3]:
df.index

RangeIndex(start=0, stop=1878, step=1)

In [4]:
# Good indexes have all the values unique

In [5]:
df.loc[0]

date               2020-03-23
age_range                 0-9
sex                      both
cases_confirmed           129
hospitalized               34
icu                         1
deceases                    0
Name: 0, dtype: object

In [7]:
young_women = df.loc[
    (df["sex"] == "women")
    & (df["age_range"] == "20-29")
]
young_women.head()

Unnamed: 0,date,age_range,sex,cases_confirmed,hospitalized,icu,deceases
12,2020-03-23,20-29,women,724,78,4,1
43,2020-03-24,20-29,women,751,87,5,1
76,2020-03-25,20-29,women,803,99,5,1
109,2020-03-26,20-29,women,1139,144,7,0
142,2020-03-27,20-29,women,1186,132,7,0


In [8]:
young_women["date"].nunique() == len(young_women)

True

In [11]:
young_women.loc[young_women["date"] == "2020-04-01"]

Unnamed: 0,date,age_range,sex,cases_confirmed,hospitalized,icu,deceases
307,2020-04-01,20-29,women,1840,232,10,0


In [10]:
# inplace=True  (I oppose using them)
young_women_indexed = young_women.set_index("date")
young_women_indexed.head()

Unnamed: 0_level_0,age_range,sex,cases_confirmed,hospitalized,icu,deceases
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-23,20-29,women,724,78,4,1
2020-03-24,20-29,women,751,87,5,1
2020-03-25,20-29,women,803,99,5,1
2020-03-26,20-29,women,1139,144,7,0
2020-03-27,20-29,women,1186,132,7,0


Advantages of the index

- More easily extract a particular row
- Performance improvements

In [17]:
young_women_indexed.loc["2020-04-01"]

age_range          20-29
sex                women
cases_confirmed     1840
hospitalized         232
icu                   10
deceases               0
Name: 2020-04-01, dtype: object

In [29]:
# Date ISO 8601

In [27]:
young_women_indexed.loc["2020-03":"2020-04"]

Unnamed: 0_level_0,age_range,sex,cases_confirmed,hospitalized,icu,deceases
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-23,20-29,women,724,78,4,1
2020-03-24,20-29,women,751,87,5,1
2020-03-25,20-29,women,803,99,5,1
2020-03-26,20-29,women,1139,144,7,0
2020-03-27,20-29,women,1186,132,7,0
2020-03-28,20-29,women,1228,140,8,0
2020-03-29,20-29,women,1693,218,9,0
2020-03-30,20-29,women,1749,221,9,0
2020-03-31,20-29,women,1808,229,10,0


In [42]:
df.head()

Unnamed: 0,date,age_range,sex,cases_confirmed,hospitalized,icu,deceases
0,2020-03-23,0-9,both,129,34,1,0
1,2020-03-23,10-19,both,221,15,0,1
2,2020-03-23,20-29,both,1285,183,8,4
3,2020-03-23,30-39,both,2208,365,15,3
4,2020-03-23,40-49,both,2919,663,40,9


In [32]:
median_cases_by_age = df.groupby("age_range")["cases_confirmed"].median()
median_cases_by_age

age_range
0-9         292.0
10-19       538.0
20-29      4237.0
30-39      7684.0
40-49     12892.0
50-59     16455.0
60-69     14717.0
70-79     13299.0
80 y +     1666.0
80-89     13932.5
90 y +     4571.0
Total     90924.0
Name: cases_confirmed, dtype: float64

In [43]:
median_cases_by_age.index

Index(['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79',
       '80 y +', '80-89', '90 y +', 'Total'],
      dtype='object', name='age_range')

In [65]:
df.merge(
    median_cases_by_age,
    # how="left",
    left_on="age_range",
    right_index=True,
    suffixes=("", "_median_by_age"),
)

Unnamed: 0,date,age_range,sex,cases_confirmed,hospitalized,icu,deceases,cases_confirmed_median_by_age
0,2020-03-23,0-9,both,129,34,1,0,292.0
10,2020-03-23,0-9,women,64,15,0,0,292.0
20,2020-03-23,0-9,men,65,19,1,0,292.0
30,2020-03-24,0-9,both,130,35,1,0,292.0
41,2020-03-24,0-9,women,65,15,0,0,292.0
...,...,...,...,...,...,...,...,...
1832,2020-05-17,90 y +,women,13195,3889,35,2493,4571.0
1843,2020-05-17,90 y +,men,5105,2455,21,1589,4571.0
1854,2020-05-22,90 y +,both,19888,6369,57,4417,4571.0
1865,2020-05-22,90 y +,women,14289,3863,35,2727,4571.0


In [58]:
median_cases_by_age_df = median_cases_by_age.to_frame().reset_index()
median_cases_by_age_df.head()

Unnamed: 0,age_range,cases_confirmed
0,0-9,292.0
1,10-19,538.0
2,20-29,4237.0
3,30-39,7684.0
4,40-49,12892.0


In [76]:
df["icu"].max()  # .max() is a method

7910

In [77]:
max(df["icu"])  # max() is a function

7910

In [78]:
df.merge(
    median_cases_by_age_df,
    how="left",
    on="age_range",
    suffixes=("", "_median_by_age")
).head()

Unnamed: 0,date,age_range,sex,cases_confirmed,hospitalized,icu,deceases,cases_confirmed_median_by_age
0,2020-03-23,0-9,both,129,34,1,0,292.0
1,2020-03-23,10-19,both,221,15,0,1,538.0
2,2020-03-23,20-29,both,1285,183,8,4,4237.0
3,2020-03-23,30-39,both,2208,365,15,3,7684.0
4,2020-03-23,40-49,both,2919,663,40,9,12892.0


In [85]:
median_cases_confirmed_alt = df.groupby("age_range")["cases_confirmed"].transform("median")
median_cases_confirmed_alt.head()

0      292.0
1      538.0
2     4237.0
3     7684.0
4    12892.0
Name: cases_confirmed, dtype: float64

In [86]:
len(median_cases_confirmed_alt)

1878

In [93]:
df["median_cases_confirmed_alt"] = median_cases_confirmed_alt.astype(int)

In [95]:
df["median_cases_confirmed_alt2"] = median_cases_by_age

In [96]:
df.head()

Unnamed: 0,date,age_range,sex,cases_confirmed,hospitalized,icu,deceases,median_cases_confirmed_alt,median_cases_confirmed_alt2
0,2020-03-23,0-9,both,129,34,1,0,292,
1,2020-03-23,10-19,both,221,15,0,1,538,
2,2020-03-23,20-29,both,1285,183,8,4,4237,
3,2020-03-23,30-39,both,2208,365,15,3,7684,
4,2020-03-23,40-49,both,2919,663,40,9,12892,
