In [1]:
#import dependencies
import os
import pandas as pd

In [2]:
#read raw data
stations_raw_df = pd.read_csv(os.path.join('resources','hawaii_stations.csv'))
meas_df = pd.read_csv(os.path.join('resources','hawaii_measurements.csv'))

In [3]:
#Find out if there are missing values in our data
print(f'Total prcp missing values: {len(meas_df[pd.isnull(meas_df["prcp"])])}')
print(f'Total station missing values: {len(meas_df[pd.isnull(meas_df["station"])])}')
print(f'Total date missing values: {len(meas_df[pd.isnull(meas_df["date"])])}')
print(f'Total tobs missing values: {len(meas_df[pd.isnull(meas_df["tobs"])])}')

Total prcp missing values: 1447
Total station missing values: 0
Total date missing values: 0
Total tobs missing values: 0


#####  We will come up with a dictionary 
```python
{
station_id_1: {jan:median, feb:median, ... }, 
station_id_2: {jan:median, feb:median, ... },
... }
```
##### to fill in missing values. The median is taken over the measurements for all the years recorded

In [4]:
#create new dataframe per station and save it in a list
stations = meas_df['station'].unique()
stations_df = [meas_df[meas_df['station'] == station] for station in stations]
stations_df[0].head(5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [5]:
#add a column with month
for station in stations_df:
    station['month'] = station['date'].apply(lambda x: x.split('-')[1])
#add month to the measurements dataframe as well:
meas_df['month'] = meas_df['date'].apply(lambda x: x.split('-')[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
#create the dictionary as defined above
station_month_to_mean_prcp = {}
for station_df in stations_df:
    station_id = station_df.iloc[0,0]
    station_mo_gp = station_df.groupby('month')
    median_mo = station_mo_gp.median()
    median_mo_dict = median_mo.loc[:,'prcp'].to_dict()
    station_month_to_mean_prcp[station_id] = median_mo_dict

In [7]:
#mean precipitaion in November at USC00513117 is 0.299
station_month_to_mean_prcp['USC00513117']['11']


0.03

In [8]:
meas_df.head(2)

Unnamed: 0,station,date,prcp,tobs,month
0,USC00519397,2010-01-01,0.08,65,1
1,USC00519397,2010-01-02,0.0,63,1


In [9]:
#let's count the rows with null values
len(meas_df[pd.isnull(meas_df["prcp"])])

1447

In [10]:
#iterate over the rows with null values only and use our dictionary to update 'prcp' with the
#per-staiton, per-month median value
for index, row in meas_df[pd.isnull(meas_df["prcp"])].iterrows():
    stat = row['station']
    month = row['month']
    median_prcp = station_month_to_mean_prcp[stat][month]
    #print (stat,month, row['prcp'], median_prcp)
    meas_df.iloc[index,2] = median_prcp

In [11]:
#let's count the rows with null values after your updates
len(meas_df[pd.isnull(meas_df["prcp"])])

0

In [12]:
meas_df.head(15)

Unnamed: 0,station,date,prcp,tobs,month
0,USC00519397,2010-01-01,0.08,65,1
1,USC00519397,2010-01-02,0.0,63,1
2,USC00519397,2010-01-03,0.0,74,1
3,USC00519397,2010-01-04,0.0,76,1
4,USC00519397,2010-01-06,0.0,73,1
5,USC00519397,2010-01-07,0.06,70,1
6,USC00519397,2010-01-08,0.0,64,1
7,USC00519397,2010-01-09,0.0,68,1
8,USC00519397,2010-01-10,0.0,73,1
9,USC00519397,2010-01-11,0.01,64,1


In [13]:
#save cleaned up data
stations_raw_df.to_csv(os.path.join('resources','clean_hawaii_stations.csv'),index=False)
meas_df.to_csv(os.path.join('resources','clean_hawaii_measurements.csv'),index=False)