In [1]:
import numpy as np
import pandas as pd 

# 1 - Convert a column with date information to a date  

In [2]:
url = 'https://github.com/mattharrison/datasets/raw/master/data/alta-noaa-1980-2019.csv'

In [72]:
alta_df = pd.read_csv(url,engine='pyarrow',dtype_backend='pyarrow')
dates = pd.to_datetime(alta_df.DATE)

In [73]:
snow = alta_df.SNOW.rename(dates)

In [74]:
snow.sample(10)

2010-12-04    0.0
2018-10-30    0.4
2014-01-02    0.0
2003-04-22    0.0
2014-03-11    8.0
2004-07-22    0.0
1987-05-16    0.0
2019-08-23    0.0
2008-10-24    0.0
2007-07-15    0.0
Name: SNOW, dtype: double[pyarrow]

# 2 - Put the date information into the index for a numeric column

# 3 - Calculate the average value of the column for each month 

In [75]:
snow.resample('2ME').mean()

1980-01-31    4.645161
1980-03-31       3.925
1980-05-31    1.262295
1980-07-31    0.016393
1980-09-30         0.0
                ...   
2019-01-31    2.459016
2019-03-31    3.459322
2019-05-31    1.342623
2019-07-31         0.0
2019-09-30         0.0
Freq: 2ME, Name: SNOW, Length: 239, dtype: double[pyarrow]

# 4 - Calculate the average value of the column for every two months 

In [76]:
snow.resample("2ME").mean()

1980-01-31    4.645161
1980-03-31       3.925
1980-05-31    1.262295
1980-07-31    0.016393
1980-09-30         0.0
                ...   
2019-01-31    2.459016
2019-03-31    3.459322
2019-05-31    1.342623
2019-07-31         0.0
2019-09-30         0.0
Freq: 2ME, Name: SNOW, Length: 239, dtype: double[pyarrow]

# 5- Calculate the percentage of the column out of the total for each month

In [77]:
def calculate_pct(s):
    return s.div(s.sum()).mul(100)

snow.groupby(snow.index.to_period("M")).apply(calculate_pct)


1980-01  1980-01-01    1.388889
         1980-01-02    2.083333
         1980-01-03    0.694444
         1980-01-04         0.0
         1980-01-05         0.0
                         ...   
2019-09  2019-09-03         NaN
         2019-09-04         NaN
         2019-09-05         NaN
         2019-09-06         NaN
         2019-09-07         NaN
Name: SNOW, Length: 14160, dtype: double[pyarrow]

# 6 - Calculate the average value of the column for a rolling window of size 7

In [78]:
snow.rolling(window=7).mean()

1980-01-01    NaN
1980-01-02    NaN
1980-01-03    NaN
1980-01-04    NaN
1980-01-05    NaN
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 14160, dtype: float64

# 7 - Using .loc pull out the first three months of a year

In [79]:
year = pd.Series(snow.index.year)
year = year.drop_duplicates().astype('category')
year

0        1980
366      1981
731      1982
1096     1983
1461     1984
1827     1985
2192     1986
2556     1987
2921     1988
3286     1989
3651     1990
4015     1991
4379     1992
4714     1993
5078     1994
5413     1995
5777     1996
6141     1997
6505     1998
6868     1999
7232     2000
7598     2001
7932     2002
8295     2003
8659     2004
8994     2005
9359     2006
9694     2007
9996     2008
10362    2009
10720    2010
11076    2011
11414    2012
11761    2013
12103    2014
12460    2015
12816    2016
13182    2017
13546    2018
13911    2019
dtype: category
Categories (40, int32): [1980, 1981, 1982, 1983, ..., 2016, 2017, 2018, 2019]

In [80]:
snow.loc[(snow.index.month <= 3) & (snow.index.year == 2010)]

2010-01-01     2.0
2010-01-02     2.5
2010-01-03     0.0
2010-01-04     0.0
2010-01-06    <NA>
              ... 
2010-03-26     7.0
2010-03-27     0.0
2010-03-28     0.0
2010-03-29     0.0
2010-03-31    <NA>
Name: SNOW, Length: 83, dtype: double[pyarrow]

# 8 - Using .loc pull out the last four  months of a year

In [81]:
snow.loc[(snow.index.month >= 9) & (snow.index.year == 2001)]

2001-09-01    0.0
2001-09-02    0.0
2001-09-03    0.0
2001-09-04    0.0
2001-09-05    0.0
             ... 
2001-12-27    1.0
2001-12-28    0.0
2001-12-29    0.0
2001-12-30    2.0
2001-12-31    0.0
Name: SNOW, Length: 91, dtype: double[pyarrow]