### Pandas Review and Practice

In [3]:
import pandas as pd

#### Constants

In [18]:
DATA_URL_FMT = (
    'http://mesonet.agron.iastate.edu/'
    'cgi-bin/request/daily.py?'
    'network=IL_ASOS&stations={0}&'
    'year1=2014&month1=1&day1=1&year2=2018&month2=1&day2=1'
)

STATIONS = ['CMI', 'DEC', 'MDW', 'ORD']

#### Data Prep

In [41]:
df_list = []
for station in STATIONS:
    data_url = DATA_URL_FMT.format(station)
    df = pd.read_csv(data_url, index_col='day', parse_dates=True)
    df.iloc[:, 1:] = df[df.columns[1:]].apply(
        pd.to_numeric, errors='coerce').fillna(0)
    df.index.name = 'date'
    df['year'] = df.index.year
    df['month'] = df.index.month
    df['FUN'] = 'FUN'
    df['SOFUN'] = 'SOFUN'
    df['ANDREW'] = 'ISCOOL'
    df_list.append(df)
df = pd.concat(df_list)

#### Column Creation

In [42]:
# Take the cumulative sum of precipitation and create new column named precip_cumsum_in
df['precip_cumsum_in'] = df['precip_in'].cumsum()
df.head()

# Same as previous
df = df.assign(precp_cumsum_in=df['precip_in'].cumsum())
df.head()

# Add the max temperature and min temperature and divide by 2 and create new column named avg_temp_f


Unnamed: 0_level_0,station,max_temp_f,min_temp_f,max_dewpoint_f,min_dewpoint_f,precip_in,avg_wind_speed_kts,avg_wind_drct,min_rh,avg_rh,max_rh,climo_high_f,climo_low_f,climo_precip_in,year,month,FUN,SOFUN,ANDREW,precip_cumsum_in
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2014-01-01,CMI,35.0,20.0,24.98,15.98,0.04,8.86392,43.5987,69.17,82.2646,92.6258,33.3,18.2,0.09,2014,1,FUN,SOFUN,ISCOOL,0.04
2014-01-02,CMI,20.0,-2.0,15.8,-5.98,0.0001,14.2868,340.367,73.1251,80.6299,85.6275,33.2,18.1,0.09,2014,1,FUN,SOFUN,ISCOOL,0.0401
2014-01-03,CMI,20.0,-10.0,12.92,-11.92,0.0,10.6525,170.324,64.6522,75.8618,91.086,33.2,18.0,0.09,2014,1,FUN,SOFUN,ISCOOL,0.0401
2014-01-04,CMI,36.0,20.0,30.02,12.92,0.0,15.2687,192.632,66.3567,72.2082,88.3717,33.2,17.8,0.09,2014,1,FUN,SOFUN,ISCOOL,0.0401
2014-01-05,CMI,33.0,-1.0,32.0,-7.6,0.17,18.6971,338.233,70.8526,85.0,100.0,33.1,17.7,0.09,2014,1,FUN,SOFUN,ISCOOL,0.2101


Unnamed: 0_level_0,station,max_temp_f,min_temp_f,max_dewpoint_f,min_dewpoint_f,precip_in,avg_wind_speed_kts,avg_wind_drct,min_rh,avg_rh,...,climo_high_f,climo_low_f,climo_precip_in,year,month,FUN,SOFUN,ANDREW,precip_cumsum_in,precp_cumsum_in
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01,CMI,35.0,20.0,24.98,15.98,0.04,8.86392,43.5987,69.17,82.2646,...,33.3,18.2,0.09,2014,1,FUN,SOFUN,ISCOOL,0.04,0.04
2014-01-02,CMI,20.0,-2.0,15.8,-5.98,0.0001,14.2868,340.367,73.1251,80.6299,...,33.2,18.1,0.09,2014,1,FUN,SOFUN,ISCOOL,0.0401,0.0401
2014-01-03,CMI,20.0,-10.0,12.92,-11.92,0.0,10.6525,170.324,64.6522,75.8618,...,33.2,18.0,0.09,2014,1,FUN,SOFUN,ISCOOL,0.0401,0.0401
2014-01-04,CMI,36.0,20.0,30.02,12.92,0.0,15.2687,192.632,66.3567,72.2082,...,33.2,17.8,0.09,2014,1,FUN,SOFUN,ISCOOL,0.0401,0.0401
2014-01-05,CMI,33.0,-1.0,32.0,-7.6,0.17,18.6971,338.233,70.8526,85.0,...,33.1,17.7,0.09,2014,1,FUN,SOFUN,ISCOOL,0.2101,0.2101


#### Column Deletion

In [43]:
# Drop "FUN" column
df = df.drop('FUN', axis=1)
df.head()

# Drop "SOFUN" column



Unnamed: 0_level_0,station,max_temp_f,min_temp_f,max_dewpoint_f,min_dewpoint_f,precip_in,avg_wind_speed_kts,avg_wind_drct,min_rh,avg_rh,max_rh,climo_high_f,climo_low_f,climo_precip_in,year,month,SOFUN,ANDREW,precip_cumsum_in,precp_cumsum_in
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2014-01-01,CMI,35.0,20.0,24.98,15.98,0.04,8.86392,43.5987,69.17,82.2646,92.6258,33.3,18.2,0.09,2014,1,SOFUN,ISCOOL,0.04,0.04
2014-01-02,CMI,20.0,-2.0,15.8,-5.98,0.0001,14.2868,340.367,73.1251,80.6299,85.6275,33.2,18.1,0.09,2014,1,SOFUN,ISCOOL,0.0401,0.0401
2014-01-03,CMI,20.0,-10.0,12.92,-11.92,0.0,10.6525,170.324,64.6522,75.8618,91.086,33.2,18.0,0.09,2014,1,SOFUN,ISCOOL,0.0401,0.0401
2014-01-04,CMI,36.0,20.0,30.02,12.92,0.0,15.2687,192.632,66.3567,72.2082,88.3717,33.2,17.8,0.09,2014,1,SOFUN,ISCOOL,0.0401,0.0401
2014-01-05,CMI,33.0,-1.0,32.0,-7.6,0.17,18.6971,338.233,70.8526,85.0,100.0,33.1,17.7,0.09,2014,1,SOFUN,ISCOOL,0.2101,0.2101


#### Renaming Column

In [44]:
# Rename "ANDREW" column to "PYTHON"
df = df.rename(columns={'ANDREW': 'PYTHON'})
df.head()

# Rename "PYTHON" column to your own name


Unnamed: 0_level_0,station,max_temp_f,min_temp_f,max_dewpoint_f,min_dewpoint_f,precip_in,avg_wind_speed_kts,avg_wind_drct,min_rh,avg_rh,max_rh,climo_high_f,climo_low_f,climo_precip_in,year,month,SOFUN,PYTHON,precip_cumsum_in,precp_cumsum_in
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2014-01-01,CMI,35.0,20.0,24.98,15.98,0.04,8.86392,43.5987,69.17,82.2646,92.6258,33.3,18.2,0.09,2014,1,SOFUN,ISCOOL,0.04,0.04
2014-01-02,CMI,20.0,-2.0,15.8,-5.98,0.0001,14.2868,340.367,73.1251,80.6299,85.6275,33.2,18.1,0.09,2014,1,SOFUN,ISCOOL,0.0401,0.0401
2014-01-03,CMI,20.0,-10.0,12.92,-11.92,0.0,10.6525,170.324,64.6522,75.8618,91.086,33.2,18.0,0.09,2014,1,SOFUN,ISCOOL,0.0401,0.0401
2014-01-04,CMI,36.0,20.0,30.02,12.92,0.0,15.2687,192.632,66.3567,72.2082,88.3717,33.2,17.8,0.09,2014,1,SOFUN,ISCOOL,0.0401,0.0401
2014-01-05,CMI,33.0,-1.0,32.0,-7.6,0.17,18.6971,338.233,70.8526,85.0,100.0,33.1,17.7,0.09,2014,1,SOFUN,ISCOOL,0.2101,0.2101


#### Basic Subselection

In [22]:
# Subselect just tmax column
df_tmax = df['max_temp_f']
df_tmax.head()

# Subselect tmax and tmin columns
df_temp = df[['max_temp_f', 'min_temp_f']]
df_temp.head()

# Subselect tmax and tmin columns (same as previous)
temp_cols = ['max_temp_f', 'min_temp_f']
df_temp = df[temp_cols]
df_temp.tail()

# Subselect just precip column


# Subselect max dewpoint and min dewpoint columns


# Subselect all, but last two columns
cols = df.columns
df_vars = df[cols[:-2]]
df_vars.head()

# Subselect all, but first two columns


# Subselect every other column



date
2014-01-01    35.0
2014-01-02    20.0
2014-01-03    20.0
2014-01-04    36.0
2014-01-05    33.0
Name: max_temp_f, dtype: float64

Unnamed: 0_level_0,max_temp_f,min_temp_f
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-01-01,35.0,20.0
2014-01-02,20.0,-2.0
2014-01-03,20.0,-10.0
2014-01-04,36.0,20.0
2014-01-05,33.0,-1.0


Unnamed: 0_level_0,max_temp_f,min_temp_f
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-12-27,7.0,-4.0
2017-12-28,15.0,0.0
2017-12-29,15.0,9.0
2017-12-30,15.0,-1.0
2017-12-31,13.0,-1.0


Unnamed: 0_level_0,station,max_temp_f,min_temp_f,max_dewpoint_f,min_dewpoint_f,precip_in,avg_wind_speed_kts,avg_wind_drct,min_rh,avg_rh,max_rh,climo_high_f,climo_low_f,climo_precip_in
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2014-01-01,CMI,35.0,20.0,24.98,15.98,0.04,8.86392,43.5987,69.17,82.2646,92.6258,33.3,18.2,0.09
2014-01-02,CMI,20.0,-2.0,15.8,-5.98,0.0001,14.2868,340.367,73.1251,80.6299,85.6275,33.2,18.1,0.09
2014-01-03,CMI,20.0,-10.0,12.92,-11.92,0.0,10.6525,170.324,64.6522,75.8618,91.086,33.2,18.0,0.09
2014-01-04,CMI,36.0,20.0,30.02,12.92,0.0,15.2687,192.632,66.3567,72.2082,88.3717,33.2,17.8,0.09
2014-01-05,CMI,33.0,-1.0,32.0,-7.6,0.17,18.6971,338.233,70.8526,85.0,100.0,33.1,17.7,0.09


#### Advanced Index Subselection https://stackoverflow.com/questions/28757389/loc-vs-iloc-vs-ix-vs-at-vs-iat

In [21]:
# Subselect just tmax column with .loc[]
df_tmax = df.loc[:, 'max_temp_f']
df_tmax.head()

# Subselect from 2014-01-01 to 2014-01-03 of tmax and tmin columns with .loc[]
df_temp = df.loc['2014-01-01':'2014-01-03', ['max_temp_f', 'min_temp_f']]
df_temp.head()

# # Subselect from 2014-01-01 to 2014-01-03  of tmax and tmin columns (same as previous)
start_end = slice('2014-01-01', '2014-01-03')
temp_cols = ['max_temp_f', 'min_temp_f']
df_temp = df.loc[start_end, temp_cols]
df_temp.tail()

# Subselect just precip column with .loc[]


# Subselect from 2015-01-01 to 2016-01-03 max dewpoint and min dewpoint columns with .loc[]


# Subselect all, but last two columns with .loc[]


# Subselect all, but first two columns with .loc[]


# Subselect every other column with .loc[]



date
2014-01-01    35.0
2014-01-02    20.0
2014-01-03    20.0
2014-01-04    36.0
2014-01-05    33.0
Name: max_temp_f, dtype: float64

Unnamed: 0_level_0,max_temp_f,min_temp_f
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-01-01,35.0,20.0
2014-01-02,20.0,-2.0
2014-01-03,20.0,-10.0
2014-01-01,37.0,18.0
2014-01-02,20.0,5.0


Unnamed: 0_level_0,max_temp_f,min_temp_f
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-01-02,22.0,5.0
2014-01-03,20.0,-2.0
2014-01-01,22.0,10.0
2014-01-02,21.0,-1.0
2014-01-03,20.0,-12.0


#### Advanced Position Subselection https://stackoverflow.com/questions/28757389/loc-vs-iloc-vs-ix-vs-at-vs-iat

In [27]:
# Subselect the first column with .iloc[]
df_col1 = df.iloc[:, 0]
df_col1.head()

# Subselect from first ten of tmax and tmin columns with .iloc[]
df_temp = df.iloc[range(0, 10), [1, 2]]
df_temp.shape

# Subselect just precip column with .iloc[]


# Subselect every other row of max dewpoint and min dewpoint columns with .iloc[]


# Subselect all, but last two columns with .iloc[]


# Subselect all, but first two columns AND last two columns with .iloc[]


# Subselect every other column with .iloc[]



date
2014-01-01    CMI
2014-01-02    CMI
2014-01-03    CMI
2014-01-04    CMI
2014-01-05    CMI
Name: station, dtype: object

(10, 2)

#### Resampling timeseries

In [12]:
# Resample to every two months by taking the average
df_2ms_avg = df.resample('2MS').mean()
df_2ms_avg.head()

# Resample it to every three months by taking the median

# Resample it to every year by taking the max

# Resample it to every other day (2 days) by taking the min

Unnamed: 0_level_0,max_temp_f,min_temp_f,max_dewpoint_f,min_dewpoint_f,precip_in,avg_wind_speed_kts,avg_wind_drct,min_rh,avg_rh,max_rh,climo_high_f,climo_low_f,climo_precip_in,year,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2014-01-01,27.343644,9.602458,18.34822,2.649322,0.063631,10.38109,225.534332,57.189051,69.729741,83.055706,34.815678,19.558051,0.064873,2014.0,1.474576
2014-03-01,52.92623,32.061475,37.042951,22.702705,0.082801,10.25977,180.752643,44.035422,62.991627,82.64945,55.063934,35.497951,0.103361,2014.0,3.491803
2014-05-01,78.29541,57.88123,60.080738,49.509426,0.187757,8.722354,166.078168,47.00331,65.886184,85.290417,76.804508,55.492213,0.133934,2014.0,5.491803
2014-07-01,81.484113,63.447581,66.256613,57.0425,0.157109,6.829905,182.122963,52.092471,70.914384,88.395409,83.78871,64.200403,0.129274,2014.0,7.5
2014-09-01,69.102541,49.508279,53.795246,43.456557,0.11075,7.439116,194.999,50.45742,70.945068,89.788419,70.285246,48.979098,0.104508,2014.0,9.508197


#### Groupby Operations

In [57]:
# Groupby station and take the max of each column
df_station_max = df.groupby('station').max()
df_station_max.head()

# Groupby station and month and take the average of each column
df_station_monthly_avg = df.groupby(['station', 'month']).mean()
# if you don't want to deal with multilevel indices
df_station_monthly_avg = df_station_monthly_avg.reset_index()
df_station_monthly_avg[::10]

# Groupby max_temp_f and take the min of each column


# Groupby station and year and take the average of each column



Unnamed: 0_level_0,max_temp_f,min_temp_f,max_dewpoint_f,min_dewpoint_f,precip_in,avg_wind_speed_kts,avg_wind_drct,min_rh,avg_rh,max_rh,climo_high_f,climo_low_f,climo_precip_in,year,month,SOFUN,PYTHON,precip_cumsum_in,precp_cumsum_in
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
CMI,99.0,73.0,81.0,72.0,3.76,26.3971,359.101,92.9613,97.7021,100.0,85.0,64.3,0.19,2017,12,SOFUN,ISCOOL,152.5025,152.5025
DEC,96.0,76.0,80.06,75.0,2.9,22.705,359.994,100.0,100.0,100.0,87.0,65.7,0.23,2017,12,SOFUN,ISCOOL,297.063,297.063
MDW,95.0,79.0,78.98,75.0,5.2,21.4438,359.761,92.7556,97.9233,100.0,84.6,67.8,0.15,2017,12,SOFUN,ISCOOL,453.3608,453.3608
ORD,95.0,77.0,78.08,74.0,4.19,23.1027,359.965,92.3243,96.1199,100.0,84.5,64.3,0.18,2017,12,SOFUN,ISCOOL,612.5748,612.5748


Unnamed: 0,station,month,max_temp_f,min_temp_f,max_dewpoint_f,min_dewpoint_f,precip_in,avg_wind_speed_kts,avg_wind_drct,min_rh,avg_rh,max_rh,climo_high_f,climo_low_f,climo_precip_in,year,precip_cumsum_in,precp_cumsum_in
0,CMI,1,34.370968,17.58871,27.393871,12.730484,0.043012,11.10907,208.832208,66.404603,78.004128,88.818773,33.496774,17.196774,0.066129,2015.5,61.763265,61.763265
10,CMI,11,52.65,32.65,40.321333,27.691333,0.097347,10.343254,195.771679,55.309983,74.045947,88.659577,51.106667,32.1,0.116667,2015.5,94.914662,94.914662
20,DEC,9,81.4,57.0,61.867,52.590833,0.078677,6.093748,175.153274,44.341262,69.498555,90.638963,78.596667,54.303333,0.096333,2015.5,236.287291,236.287291
30,MDW,7,83.467742,66.870968,66.446452,56.386129,0.136782,7.380304,155.40124,46.543124,63.676531,83.332735,84.196774,67.503226,0.129355,2015.5,375.904685,375.904685
40,ORD,5,69.694194,49.306613,51.718065,39.203871,0.141868,8.842037,150.088931,43.964123,62.768245,84.430806,70.0,48.303226,0.11871,2015.5,524.538791,524.538791
