In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw observations; the column at position 2 is parsed as dates.
data = pd.read_csv(
    'precipitation.csv',
    parse_dates=[2],
)
data.columns

Index(['station', 'name', 'date', 'latitude', 'longitude', 'elevation',
       'precipitation'],
      dtype='object')

In [3]:
# Add a 'month' column to data holding each observation's zero-padded 'YYYY-MM' label.
# One vectorised strftime call replaces formatting the year and month separately and
# concatenating the pieces; no scratch variables are left behind in the namespace.
data['month'] = pd.to_datetime(data['date']).dt.strftime('%Y-%m')
data.columns

Index(['station', 'name', 'date', 'latitude', 'longitude', 'elevation',
       'precipitation', 'month'],
      dtype='object')

In [4]:
# Clean table -> keep only the columns the analysis needs.
# Selecting the wanted columns (instead of dropping the unwanted ones) yields the
# same frame, but re-running this cell no longer raises a KeyError once the
# unwanted columns are already gone.
data = data[['name', 'precipitation', 'month']]
data.columns


Index(['name', 'precipitation', 'month'], dtype='object')

In [5]:
# Total the values for every (name, month) pair, keeping both keys as ordinary columns.
data2 = data.groupby(['name', 'month'], as_index=False).sum()
data2

Unnamed: 0,name,month,precipitation
0,BURNABY SIMON FRASER U,2016-01,1809
1,BURNABY SIMON FRASER U,2016-02,1482
2,BURNABY SIMON FRASER U,2016-03,2159
3,BURNABY SIMON FRASER U,2016-04,374
4,BURNABY SIMON FRASER U,2016-05,606
...,...,...,...
103,YELLOWKNIFE A,2016-08,280
104,YELLOWKNIFE A,2016-09,488
105,YELLOWKNIFE A,2016-10,158
106,YELLOWKNIFE A,2016-11,176


In [6]:
# Reshape to wide form: one row per station (name), one column per month.
data2 = data2.set_index(['name', 'month'])['precipitation'].unstack('month')
data2

month,2016-01,2016-02,2016-03,2016-04,2016-05,2016-06,2016-07,2016-08,2016-09,2016-10,2016-11,2016-12
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
BURNABY SIMON FRASER U,1809,1482,2159,374,606,428,442,68,956,1429,1349,1846
CALGARY INTL A,121,22,46,40,683,616,2061,834,253,276,27,224
GANDER INTL A,896,1232,992,1106,1236,1015,1264,1096,1026,2422,1184,1099
HALIFAX INTL A,1491,1473,1452,1595,1001,725,734,445,845,2185,1305,1835
REVELSTOKE,930,561,533,204,546,748,662,274,785,1620,1293,295
SHERBROOKE,183,982,621,471,667,671,525,1599,433,1176,594,627
TORONTO LESTER B. PEARSON INT',363,491,729,681,346,269,399,667,663,452,497,821
VANCOUVER INTL A,1682,1304,1616,242,516,582,328,138,784,2034,2402,1520
YELLOWKNIFE A,220,90,28,120,98,306,154,280,488,158,176,116


In [7]:
# Count the observations per (name, month) pair, then spread the months across columns.
data3 = (
    data.groupby(['name', 'month'])
    .count()
    .reset_index()
    .pivot(index='name', columns='month', values='precipitation')
)
data3

month,2016-01,2016-02,2016-03,2016-04,2016-05,2016-06,2016-07,2016-08,2016-09,2016-10,2016-11,2016-12
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
BURNABY SIMON FRASER U,30,22,30,30,21,21,20,25,20,19,17,16
CALGARY INTL A,31,29,31,30,31,30,31,30,30,30,30,30
GANDER INTL A,31,29,31,30,30,30,31,31,30,31,30,31
HALIFAX INTL A,31,29,31,30,31,30,31,31,29,30,30,31
REVELSTOKE,31,29,31,29,31,29,30,31,30,31,30,29
SHERBROOKE,31,29,31,30,31,30,28,31,30,30,29,31
TORONTO LESTER B. PEARSON INT',31,28,31,30,31,30,31,31,30,31,29,31
VANCOUVER INTL A,30,28,31,30,30,29,30,31,30,31,30,30
YELLOWKNIFE A,31,28,31,30,31,30,31,31,30,30,30,30


In [8]:
def get_precip_data():
    """
    Read 'precipitation.csv' from the working directory into a DataFrame.

    The column at position 2 is parsed as dates.
    """
    observations = pd.read_csv('precipitation.csv', parse_dates=[2])
    return observations


def date_to_month(d):
    """
    Format a date-like value as a zero-padded 'YYYY-MM' string.

    Only the `year` and `month` attributes of `d` are read.
    """
    # You may need to modify this function, depending on your data types.
    year, month = d.year, d.month
    return '%04i-%02i' % (year, month)


def pivot_months_pandas(data):
    """
    Create monthly precipitation totals for each station in the data set.

    This should use Pandas methods to manipulate the data.

    Parameters
    ----------
    data : DataFrame with at least the columns 'name', 'date' and
        'precipitation'. It is not modified.

    Returns
    -------
    (monthly, counts) : two DataFrames indexed by station 'name' with one
        column per 'YYYY-MM' month. `monthly` holds the summed precipitation
        and `counts` the number of non-null precipitation observations.
        Station/month combinations with no observations are filled with 0.
    """
    # Work on a narrow copy so the caller's frame does not gain a 'month' column.
    month_labels = pd.to_datetime(data['date']).dt.strftime('%Y-%m')
    observations = data[['name', 'precipitation']].assign(month=month_labels)

    per_station_month = observations.groupby(['name', 'month'])['precipitation']

    # unstack moves the 'month' level of the index into the columns, giving
    # one row per station; fill_value=0 covers combinations that never occur.
    monthly = per_station_month.sum().unstack('month', fill_value=0)
    counts = per_station_month.count().unstack('month', fill_value=0)
    return monthly, counts


def pivot_months_loops(data):
    """
    Create monthly precipitation totals for each station in the data set.

    This does it the hard way: using Pandas as a dumb data store, and iterating in Python.

    Parameters
    ----------
    data : DataFrame with at least the columns 'name', 'date' and
        'precipitation'.

    Returns
    -------
    (totals, counts) : two DataFrames indexed by sorted station 'name' with
        one column per sorted 'YYYY-MM' month found in the data. `totals`
        holds the summed precipitation and `counts` the number of rows seen;
        combinations with no rows are 0.
    """
    # Walk the rows once: discover every station and month, and remember each
    # observation so the (slow) row iteration and month formatting are not repeated.
    stations = set()
    months = set()
    observations = []
    for _, row in data.iterrows():
        name = row['name']
        month = date_to_month(row['date'])
        stations.add(name)
        months.add(month)
        observations.append((name, month, row['precipitation']))

    # Sorted labels define the row/column order; the dicts map label -> position.
    stations = sorted(stations)
    months = sorted(months)
    station_to_row = {s: i for i, s in enumerate(stations)}
    month_to_col = {m: i for i, m in enumerate(months)}

    # Create arrays for the data, and fill them. The width follows the number
    # of distinct months actually present rather than assuming exactly 12.
    shape = (len(stations), len(months))
    precip_total = np.zeros(shape, dtype=np.uint)
    obs_count = np.zeros(shape, dtype=np.uint)

    for name, month, precip in observations:
        r = station_to_row[name]
        c = month_to_col[month]

        precip_total[r, c] += precip
        obs_count[r, c] += 1

    # Build the DataFrames we needed all along (tidying up the index names while we're at it).
    totals = pd.DataFrame(
        data=precip_total,
        index=stations,
        columns=months,
    )
    totals.index.name = 'name'
    totals.columns.name = 'month'

    counts = pd.DataFrame(
        data=obs_count,
        index=stations,
        columns=months,
    )
    counts.index.name = 'name'
    counts.columns.name = 'month'

    return totals, counts

 
def main():
    """
    Load the precipitation data, pivot it with the loop-based implementation,
    and write the totals and counts to 'totals.csv', 'counts.csv' and
    'monthdata.npz'.
    """
    totals, counts = pivot_months_loops(get_precip_data())
    for frame, path in ((totals, 'totals.csv'), (counts, 'counts.csv')):
        frame.to_csv(path)
    np.savez('monthdata.npz', totals=totals.values, counts=counts.values)


# Run main() only when this code executes as the top-level program, not when it is imported.
if __name__ == '__main__':
    main()
