In [1]:
import pandas as pd

# Source CSV for the notebook; every later cell derives its frames from `df`.
BIRTHDAYS_URL = "https://calmcode.io/datasets/birthdays.csv"
df = pd.read_csv(BIRTHDAYS_URL)

In [9]:
# Parse the `date` column (read from the CSV as object/str) into datetime64
# and inspect the resulting dtypes. `assign` returns a new frame, so `df`
# itself is not modified.
df.assign(date=pd.to_datetime(df['date'])).dtypes


state             object
year               int64
month              int64
day                int64
date      datetime64[ns]
wday              object
births             int64
dtype: object

In [10]:
# Same conversion as above, but with the date layout spelled out explicitly
# instead of letting pandas infer it.
df.assign(date=pd.to_datetime(df['date'], format='%Y-%m-%d')).dtypes

state             object
year               int64
month              int64
day                int64
date      datetime64[ns]
wday              object
births             int64
dtype: object

In [20]:
# Once a column is datetime64, the `.dt` accessor exposes its individual parts.
# Step 1 parses `date`; step 2 derives the extra columns from the parsed value.
(
    df
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        day_of_week=lambda frame: frame['date'].dt.day_name(),
        minute=lambda frame: frame['date'].dt.minute,
        nanosecond=lambda frame: frame['date'].dt.nanosecond,
        day_of_year=lambda frame: frame['date'].dt.day_of_year,
        month_manual=lambda frame: frame['date'].dt.month,
        week=lambda frame: frame['date'].dt.isocalendar().week,
    )
)
          

Unnamed: 0,state,year,month,day,date,wday,births,day_of_week,minute,nanosecond,day_of_year,month_manual,week
0,AK,1969,1,1,1969-01-01,Wed,14,Wednesday,0,0,1,1,1
1,AL,1969,1,1,1969-01-01,Wed,174,Wednesday,0,0,1,1,1
2,AR,1969,1,1,1969-01-01,Wed,78,Wednesday,0,0,1,1,1
3,AZ,1969,1,1,1969-01-01,Wed,84,Wednesday,0,0,1,1,1
4,CA,1969,1,1,1969-01-01,Wed,824,Wednesday,0,0,1,1,1
5,CO,1969,1,1,1969-01-01,Wed,100,Wednesday,0,0,1,1,1
6,CT,1969,1,1,1969-01-01,Wed,90,Wednesday,0,0,1,1,1
7,DC,1969,1,1,1969-01-01,Wed,88,Wednesday,0,0,1,1,1
8,DE,1969,1,1,1969-01-01,Wed,32,Wednesday,0,0,1,1,1
9,FL,1969,1,1,1969-01-01,Wed,288,Wednesday,0,0,1,1,1


In [21]:
# Keep only the columns used below and derive `year` from the parsed date.
subset_df = (
    df
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format="%Y-%m-%d"))
    .loc[:, ['state', 'date', 'births']]
    .assign(year=lambda frame: frame['date'].dt.year)
)
subset_df

Unnamed: 0,state,date,births,year
0,AK,1969-01-01,14,1969
1,AL,1969-01-01,174,1969
2,AR,1969-01-01,78,1969
3,AZ,1969-01-01,84,1969
4,CA,1969-01-01,824,1969
5,CO,1969-01-01,100,1969
6,CT,1969-01-01,90,1969
7,DC,1969-01-01,88,1969
8,DE,1969-01-01,32,1969
9,FL,1969-01-01,288,1969


In [28]:
# California only: parse the dates and keep the columns used for resampling.
subset_df = (df
            .assign(date=lambda d: pd.to_datetime(d['date'], format="%Y-%m-%d"))
            [['state', 'date', 'births']]
            .loc[lambda d: d['state']=='CA'])

# Resample into 2-week bins and sum the births.
# `births` is selected explicitly before aggregating: newer pandas versions no
# longer silently drop non-numeric columns in `sum()`, so the `state` strings
# would otherwise be aggregated as well.
subset_df.set_index('date')[['births']].resample('2W').sum().head()

Unnamed: 0_level_0,births
date,Unnamed: 1_level_1
1969-01-05,4290
1969-01-19,12942
1969-02-02,12784
1969-02-16,12932
1969-03-02,13198


In [27]:

 # resample by 1 month and sum
# Month-end bins, summing births only.
# - `[['births']]` keeps the string column `state` out of the aggregation
#   (newer pandas no longer drops non-numeric columns in `sum()`).
# - The offset object replaces the 'M' alias, which newer pandas versions
#   deprecate in favour of 'ME'; `MonthEnd()` works with either.
(subset_df
    .set_index('date')
    [['births']]
    .resample(pd.offsets.MonthEnd())
    .sum()
    .head())

Unnamed: 0_level_0,births
date,Unnamed: 1_level_1
1969-01-31,28386
1969-02-28,26052
1969-03-31,29488
1969-04-30,28668
1969-05-31,29806


In [29]:
# All states this time: parse the dates and keep the columns used below.
subset_df = (df
            .assign(date=lambda d: pd.to_datetime(d['date'], format="%Y-%m-%d"))
            [['state', 'date', 'births']])


# Resample into year-end bins and sum births across every state.
# - `[['births']]` keeps the string column `state` out of the aggregation
#   (newer pandas no longer drops non-numeric columns in `sum()`).
# - The offset object replaces the 'Y' alias, which newer pandas versions
#   deprecate in favour of 'YE'; `YearEnd()` works with either.
(subset_df
    .set_index('date')
    [['births']]
    .resample(pd.offsets.YearEnd())
    .sum()
    .head())

Unnamed: 0_level_0,births
date,Unnamed: 1_level_1
1969-12-31,3599036
1970-12-31,3734914
1971-12-31,3563126
1972-12-31,3265895
1973-12-31,3145703


In [30]:
# resample by 1 year within each state (groupby) and sum
# Yearly totals per state.
# - Selecting `[['births']]` on the groupby keeps the grouping column `state`
#   out of the per-group aggregation (newer pandas would otherwise try to sum
#   its strings / warn about operating on grouping columns).
# - The offset object replaces the 'Y' alias, which newer pandas versions
#   deprecate in favour of 'YE'; `YearEnd()` works with either.
(subset_df
    .set_index('date')
    .groupby('state')[['births']]
    .resample(pd.offsets.YearEnd())
    .sum()
    .head())

Unnamed: 0_level_0,Unnamed: 1_level_0,births
state,date,Unnamed: 2_level_1
AK,1969-12-31,6824
AK,1970-12-31,7464
AK,1971-12-31,7244
AK,1972-12-31,6892
AK,1973-12-31,6566


## Visualize

In [31]:
# Plotting sample: the last 365*2 California rows, with `date` parsed.
subset_df = (
    df
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format="%Y-%m-%d"))
    .loc[:, ['state', 'date', 'births']]
    .loc[lambda frame: frame['state'] == 'CA']
    .tail(365 * 2)
)


In [32]:
# 10-row rolling mean of births (min_periods=1 so the first rows are not NaN).
# The window is applied to the `births` Series only: rolling over the whole
# frame would also hit the string `state` and datetime `date` columns and
# produce a DataFrame rather than the single column being assigned.
(subset_df
    .assign(rolling_births=lambda d: d['births'].rolling(10, min_periods=1).mean())
    .head(15))

Unnamed: 0,state,date,births,rolling_births
335329,CA,1987-01-02,1230,1230.0
335380,CA,1987-01-03,1178,1204.0
335431,CA,1987-01-04,1087,1165.0
335482,CA,1987-01-05,1300,1198.75
335533,CA,1987-01-06,1415,1242.0
335584,CA,1987-01-07,1318,1254.666667
335635,CA,1987-01-08,1310,1262.571429
335686,CA,1987-01-09,1430,1283.5
335737,CA,1987-01-10,1131,1266.555556
335788,CA,1987-01-11,1123,1252.2


In [33]:
import altair as alt

# Line chart of `births` over `date` for the rows currently in `subset_df`.
# The chart is sized to 600x250 and `.interactive()` is applied last so the
# rendered chart can be explored interactively.
(alt.Chart(subset_df)
    .mark_line()
    .encode(x='date', y='births')
    .properties(width=600, height=250)
    .interactive())

In [43]:
# Add a 20-day, time-based rolling mean of births for plotting.
# The window is an offset ('20D'), so `date` is made the index while the
# rolling mean is computed and turned back into a column afterwards.
plot_df = (
    subset_df
    .set_index('date')
    .assign(
        rolling_births=lambda frame: (
            frame['births'].rolling('20D', min_periods=1).mean()
        )
    )
    .reset_index()
)




In [44]:
# p1: the raw `births` series from `plot_df` as a line.
p1 = (alt.Chart(plot_df)
     .mark_line()
     .encode(x='date', y='births')
     .properties(width=600, height=250)
     .interactive())

# p2: the `rolling_births` column from the same frame, drawn in red.
p2 = (alt.Chart(plot_df)
     .mark_line(color='red')
     .encode(x='date', y='rolling_births')
     .properties(width=600, height=250)
     .interactive())

# Layer the two charts so the rolling mean is drawn over the raw series.
p1 + p2

## Rolling groupby for every state

In [50]:
# Every state, with `date` parsed and only the columns needed for the
# per-state rolling mean.
subset_df = (
    df
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format="%Y-%m-%d"))
    .loc[:, ['state', 'date', 'births']]
)

In [55]:
def calc_rolling_mean(dataf, column=None, setting='30D', group_col='state', min_periods=1):
    """Return the per-group rolling mean of one column, aligned to ``dataf``'s rows.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame containing ``group_col`` and ``column``. When ``setting`` is a
        time offset such as ``'30D'``, pandas requires a datetime-like index
        that is monotonic within each group.
    column : str
        Name of the column to average. The ``None`` default is only a
        placeholder; a real column name has to be passed.
    setting : str or int, default '30D'
        Window specification handed to ``Series.rolling``.
    group_col : str, default 'state'
        Column whose values define the groups; the window never spans two
        groups.
    min_periods : int, default 1
        Minimum number of observations in a window required to produce a
        value (passed through to ``Series.rolling``).

    Returns
    -------
    pandas.Series
        Rolling means with the same index and row order as ``dataf``.
    """
    def _rolling_mean(values):
        # `values` is the selected column restricted to a single group.
        return values.rolling(setting, min_periods=min_periods).mean()

    # `transform` keeps the result aligned row-for-row with the input frame.
    return dataf.groupby(group_col)[column].transform(_rolling_mean)

# Attach the per-state rolling mean. `date` is the index while the
# offset-based window is computed, then becomes a column again; the result is
# ordered by state and date for display.
(
    subset_df
    .set_index('date')
    .assign(rolling_births=lambda frame: calc_rolling_mean(frame, column='births'))
    .reset_index()
    .sort_values(by=['state', 'date'])
)

Unnamed: 0,date,state,births,rolling_births
0,1969-01-01,AK,14,14.000000
51,1969-01-02,AK,20,17.000000
102,1969-01-03,AK,20,18.000000
153,1969-01-04,AK,16,17.500000
204,1969-01-05,AK,18,17.600000
255,1969-01-06,AK,24,18.666667
306,1969-01-07,AK,16,18.285714
357,1969-01-08,AK,8,17.000000
408,1969-01-09,AK,16,16.888889
459,1969-01-10,AK,20,17.200000


In [56]:
# Same per-state rolling mean as the previous cell, narrowed to the California
# rows after sorting.
(
    subset_df
    .set_index('date')
    .assign(rolling_births=lambda frame: calc_rolling_mean(frame, column='births'))
    .reset_index()
    .sort_values(by=['state', 'date'])
    .loc[lambda frame: frame['state'] == 'CA']
)

Unnamed: 0,date,state,births,rolling_births
4,1969-01-01,CA,824,824.000000
55,1969-01-02,CA,816,820.000000
106,1969-01-03,CA,940,860.000000
157,1969-01-04,CA,906,871.500000
208,1969-01-05,CA,804,858.000000
259,1969-01-06,CA,922,868.666667
310,1969-01-07,CA,866,868.285714
361,1969-01-08,CA,954,879.000000
412,1969-01-09,CA,978,890.000000
463,1969-01-10,CA,928,893.800000
