In [258]:
import csv
import pandas as pd
import numpy as np
import operator

In [259]:
# Load the aggregated water data, parsing the 'date' column as datetimes,
# then discard the first column of the file.
df = pd.read_csv('../data/water_aggregated.csv', parse_dates=['date'])
df = df.drop(columns=df.columns[0])

In [260]:
# Keep only the rows dated 2003-01-01 or later, then display the result.
earliest_date = np.datetime64('2003-01-01')
df = df.loc[df['date'] >= earliest_date]
df

Unnamed: 0,date,county,cumulative_rain_inches,daily_rain_inches
73279,2003-01-01,Alpine,37.14,0.88
73280,2003-01-01,Amador,26.27,0.12
73281,2003-01-01,Colusa,10.84,0.00
73282,2003-01-01,El Dorado,21.06,0.00
73283,2003-01-01,Fresno,16.93,0.00
...,...,...,...,...
378445,2020-11-10,Trinity,0.00,0.00
378446,2020-11-10,Tulare,0.79,0.00
378447,2020-11-10,Tuolumne,0.00,0.00
378448,2020-11-10,Ventura,0.00,0.00


In [261]:
# Restrict the data to a single calendar year.
year = 2015
in_selected_year = df['date'].dt.year.eq(year)
df = df.loc[in_selected_year]

In [262]:
# Load the county latitude/longitude table, collect the distinct county names,
# and index the table by county name for later lookups.
df_counties = pd.read_csv('../data/county_lat_long.csv')
counties = df_counties['County'].unique()
df_counties = df_counties.set_index('County')
print(counties, len(counties), sep='\n')

['Alameda' 'Alpine' 'Amador' 'Butte' 'Calaveras' 'Colusa' 'Contra Costa'
 'Del Norte' 'El Dorado' 'Fresno' 'Glenn' 'Humboldt' 'Imperial' 'Inyo'
 'Kern' 'Kings' 'Lake' 'Lassen' 'Los Angeles' 'Madera' 'Marin' 'Mariposa'
 'Mendocino' 'Merced' 'Modoc' 'Mono' 'Monterey' 'Napa' 'Nevada' 'Orange'
 'Placer' 'Plumas' 'Riverside' 'Sacramento' 'San Benito' 'San Bernardino'
 'San Diego' 'San Francisco' 'San Joaquin' 'San Luis Obispo' 'San Mateo'
 'Santa Barbara' 'Santa Clara' 'Santa Cruz' 'Shasta' 'Sierra' 'Siskiyou'
 'Solano' 'Sonoma' 'Stanislaus' 'Sutter' 'Tehama' 'Trinity' 'Tulare'
 'Tuolumne' 'Ventura' 'Yolo' 'Yuba']
58


In [263]:
# Build a frame holding one all-NaN 'daily_rain_inches' row for every
# (day, county) pair between the first and last date present in df.
# Days are kept as 'YYYY-MM-DD' strings.
first_day, last_day = df.date.min(), df.date.max()
all_days = pd.date_range(start=first_day, end=last_day).strftime('%Y-%m-%d')
level_names = ['date', 'county', 'daily_rain_inches']
mux = pd.MultiIndex.from_product((all_days, counties, [np.nan]), names=level_names)
df_final = mux.to_frame(index=False).set_index(['date', 'county'])
print(df_final)

                      daily_rain_inches
date       county                      
2015-01-01 Alameda                  NaN
           Alpine                   NaN
           Amador                   NaN
           Butte                    NaN
           Calaveras                NaN
...                                 ...
2015-12-31 Tulare                   NaN
           Tuolumne                 NaN
           Ventura                  NaN
           Yolo                     NaN
           Yuba                     NaN

[21170 rows x 1 columns]


In [264]:
# Add known data to df_final.
#
# The assignment is done in one vectorised step and always names the target
# column explicitly. Writing `df_final.loc[date, county] = value` on a
# MultiIndex frame is ambiguous: if the (date, county) pair is not in the
# index, pandas treats `county` as a *column* label and silently adds a new
# column instead of raising.
known = pd.Series(
    df['daily_rain_inches'].to_numpy(),
    index=pd.MultiIndex.from_arrays(
        [df['date'].dt.strftime('%Y-%m-%d'), df['county']],
        names=['date', 'county'],
    ),
)
# When df holds several rows for the same (date, county), the last one wins,
# exactly as a row-by-row overwrite would behave.
known = known[~known.index.duplicated(keep='last')]
# Only touch cells that exist in the grid; rows of df whose county is not in
# the grid are ignored rather than growing df_final.
is_known = df_final.index.isin(known.index)
df_final.loc[is_known, 'daily_rain_inches'] = known.reindex(df_final.index[is_known]).to_numpy()
df_final.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,daily_rain_inches
date,county,Unnamed: 2_level_1
2015-01-01,Alameda,0.0
2015-01-01,Alpine,0.3
2015-01-01,Amador,0.0
2015-01-01,Butte,0.07
2015-01-01,Calaveras,0.0


In [265]:
# Number of missing values per column (here: daily_rain_inches)
df_final.isna().sum(axis='index')

daily_rain_inches    2920
dtype: int64

In [None]:
# For all rows with all NaN, find three nearest counties and average those values
def find_nearest(c, d, k=3):
    """Return the counties closest to ``c`` that have a row in ``df`` on date ``d``.

    Parameters
    ----------
    c : str
        County name; must be a label of ``df_counties``' index.
    d : str or datetime-like
        Date compared against ``df['date']`` (callers in this notebook pass
        'YYYY-MM-DD' strings).
    k : int, default 3
        Maximum number of counties to return.

    Returns
    -------
    list
        Up to ``k`` county names, nearest first. Shorter than ``k`` (possibly
        empty) when fewer other counties have a row in ``df`` on that date.

    Notes
    -----
    "Distance" is the squared difference of the Latitude/Longitude values
    taken as plane coordinates; it is only used for ranking and is not a
    great-circle distance.
    """
    # Scan df once for the date instead of once per candidate county.
    reporting = set(df.loc[df['date'] == d, 'county'])
    is_candidate = df_counties.index.isin(reporting) & (df_counties.index != c)
    candidates = df_counties.loc[is_candidate]
    if candidates.empty:
        return []
    origin = df_counties.loc[c]
    dist_sq = (candidates.Latitude - origin.Latitude) ** 2 + (candidates.Longitude - origin.Longitude) ** 2
    # A stable sort keeps df_counties' row order among equally distant counties.
    return dist_sq.sort_values(kind='mergesort').index[:k].tolist()
       
# Impute every (date, county) cell that has no row in df with the mean of the
# known values of its nearest reporting counties.
value_col = 'daily_rain_inches'
# Build the set of (date, county) pairs present in df once, instead of
# re-scanning df with a boolean mask for every cell of the grid.
known_keys = set(zip(df['date'], df['county']))
for date in all_days:
    day = pd.Timestamp(date)
    for county in df_counties.index:
        if (day, county) in known_keys:
            continue
        nearby = find_nearest(county, date)
        if not nearby:
            # No other county reported on this day: leave the cell as NaN
            # rather than inventing a value.
            continue
        # Address rows by their full (date, county) key and name the column, so
        # a scalar is read/written. Averaging over the neighbours actually
        # found (not a fixed 3) keeps the mean correct when fewer are available;
        # Series.mean() skips neighbours whose known value is itself NaN.
        neighbour_keys = [(date, near) for near in nearby]
        df_final.loc[(date, county), value_col] = df_final.loc[neighbour_keys, value_col].mean()
print(df_final)

In [None]:
# Recount the missing values per column (here: daily_rain_inches)
df_final.isna().sum(axis='index')

In [None]:
# Save the completed frame to a CSV file whose name includes the selected year
output_path = '../data/water' + str(year) + '.csv'
df_final.to_csv(output_path)