In [690]:
import csv
import pandas as pd
import numpy as np
import operator

In [691]:
# Load the rainfall observations, parsing the 'date' column as datetimes
df = pd.read_csv('../data/rain_final_fixed.csv', parse_dates=['date'])
# Discard the first column of the file and the 'year' column
df = df.drop(columns=df.columns[0]).drop(columns=['year'])

In [692]:
# Keep only observations dated 2003-01-01 or later
cutoff = np.datetime64('2003-01-01')
df = df.loc[df['date'] >= cutoff]
df

Unnamed: 0,county,date,dailyRain
0,Alameda,2003-10-01,0.000000
1,Alameda,2003-10-02,0.000000
2,Alameda,2003-10-03,0.000000
3,Alameda,2003-10-04,0.000000
4,Alameda,2003-10-05,0.000000
...,...,...,...
290040,Yuba,2019-12-27,0.000000
290041,Yuba,2019-12-28,0.000000
290042,Yuba,2019-12-29,0.065000
290043,Yuba,2019-12-30,0.008333


In [693]:
# Restrict the observations to a single calendar year
year = 2019
in_selected_year = df['date'].dt.year == year
df = df.loc[in_selected_year]

In [694]:
# Load the county coordinate table and collect the distinct county names
df_counties = pd.read_csv('../data/county_lat_long.csv')
counties = df_counties['County'].unique()
# Index by county name so a county's coordinates can be read with .loc[name]
df_counties = df_counties.set_index('County')
print(counties)
print(len(counties))

['Alameda' 'Alpine' 'Amador' 'Butte' 'Calaveras' 'Colusa' 'Contra Costa'
 'Del Norte' 'El Dorado' 'Fresno' 'Glenn' 'Humboldt' 'Imperial' 'Inyo'
 'Kern' 'Kings' 'Lake' 'Lassen' 'Los Angeles' 'Madera' 'Marin' 'Mariposa'
 'Mendocino' 'Merced' 'Modoc' 'Mono' 'Monterey' 'Napa' 'Nevada' 'Orange'
 'Placer' 'Plumas' 'Riverside' 'Sacramento' 'San Benito' 'San Bernardino'
 'San Diego' 'San Francisco' 'San Joaquin' 'San Luis Obispo' 'San Mateo'
 'Santa Barbara' 'Santa Clara' 'Santa Cruz' 'Shasta' 'Sierra' 'Siskiyou'
 'Solano' 'Sonoma' 'Stanislaus' 'Sutter' 'Tehama' 'Trinity' 'Tulare'
 'Tuolumne' 'Ventura' 'Yolo' 'Yuba']
58


In [695]:
# Build a frame with one row per (day, county) pair, all values NaN for now
all_days = pd.date_range(start=df.date.min(), end=df.date.max()).strftime('%Y-%m-%d')
# The single-NaN third level becomes the placeholder 'dailyRain' column
mux = pd.MultiIndex.from_product(
    [all_days, counties, [np.nan]],
    names=['date', 'county', 'dailyRain'],
)
df_final = mux.to_frame(index=False).set_index(['date', 'county'])
print(df_final)

                      dailyRain
date       county              
2019-01-01 Alameda          NaN
           Alpine           NaN
           Amador           NaN
           Butte            NaN
           Calaveras        NaN
...                         ...
2019-12-31 Tulare           NaN
           Tuolumne         NaN
           Ventura          NaN
           Yolo             NaN
           Yuba             NaN

[21170 rows x 1 columns]


In [696]:
# Add known data to df_final
# Key the measurements the same way df_final is indexed: (date string, county).
known_rain = (
    df.assign(date=df['date'].dt.strftime('%Y-%m-%d'))
      .set_index(['date', 'county'])['dailyRain']
)
# If a (date, county) pair occurs more than once, the last row wins, as it
# would when writing the rows one after another.
known_rain = known_rain[~known_rain.index.duplicated(keep='last')]
# Align to the full grid in one step: pairs without a measurement become NaN
# here, and measurements for pairs outside the grid are dropped instead of
# being written into a stray new column.
aligned_rain = known_rain.reindex(df_final.index)
# Overwrite only where a measurement exists; keep whatever df_final already holds elsewhere.
df_final['dailyRain'] = aligned_rain.where(aligned_rain.notna(), df_final['dailyRain'])
df_final.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,dailyRain
date,county,Unnamed: 2_level_1
2019-01-01,Alameda,0.0
2019-01-01,Alpine,0.225
2019-01-01,Amador,0.0
2019-01-01,Butte,0.0
2019-01-01,Calaveras,0.008333


In [697]:
# Count the NaN entries per column of df_final (its only column is dailyRain)
df_final.isna().sum(axis='index')

dailyRain    3650
dtype: int64

In [698]:
# For all rows with NaN, find the three nearest counties and average those values
def find_nearest(c, d, k=3):
    """Return the names of the counties closest to ``c`` that have data on ``d``.

    Reads the module-level frames ``df_counties`` (indexed by county name, with
    ``Latitude`` and ``Longitude`` columns) and ``df`` (the rain observations,
    with ``date`` and ``county`` columns).

    Parameters
    ----------
    c : county name; must be a label of ``df_counties.index``.
    d : date compared against ``df['date']`` with ``==``.
    k : maximum number of counties to return (default 3).

    Returns
    -------
    list
        Up to ``k`` county names, nearest first. The list is shorter than
        ``k`` (possibly empty) when fewer other counties have a row in ``df``
        for ``d``.

    Notes
    -----
    "Distance" is the squared difference of the latitude/longitude values,
    used only for ranking; it is not a geographic distance.
    """
    origin = df_counties.loc[c]
    # Filter df by date once instead of once per candidate county.
    reporting = df.loc[df['date'] == d, 'county']
    is_candidate = (df_counties.index != c) & df_counties.index.isin(reporting)
    candidates = df_counties.loc[is_candidate]
    squared_dist = (
        (candidates.Latitude - origin.Latitude) ** 2
        + (candidates.Longitude - origin.Longitude) ** 2
    )
    # mergesort is stable, so ties keep the order of df_counties.
    return squared_dist.sort_values(kind='mergesort').index[:k].tolist()
       
# (date string, county) pairs that already have a measurement, keyed like
# df_final's index, so the check below does not rescan df for every pair.
known_pairs = set(zip(df['date'].dt.strftime('%Y-%m-%d'), df['county']))

for date in all_days:
    for county in df_counties.index:
        if (date, county) in known_pairs:
            continue
        nearby = find_nearest(county, date)
        if not nearby:
            # No other county has data on this date: leave the value as NaN
            # rather than writing a made-up 0.
            continue
        # Neighbours come from df, so these are measured values, not imputed ones.
        neighbour_rain = [df_final.loc[(date, near), 'dailyRain'] for near in nearby]
        # Divide by the number of neighbours actually found; find_nearest may
        # return fewer than three.
        df_final.loc[(date, county), 'dailyRain'] = sum(neighbour_rain) / len(nearby)
print(df_final)

                      dailyRain
date       county              
2019-01-01 Alameda     0.000000
           Alpine      0.225000
           Amador      0.000000
           Butte       0.000000
           Calaveras   0.008333
...                         ...
2019-12-31 Tulare      0.004118
           Tuolumne    0.008571
           Ventura     0.000000
           Yolo        0.000000
           Yuba        0.001667

[21170 rows x 1 columns]


In [699]:
# Count the NaN entries per column of df_final (its only column is dailyRain)
df_final.isna().sum(axis='index')

dailyRain    0
dtype: int64

In [700]:
# Save the completed table to a CSV file named after the selected year
output_path = '../data/water' + str(year) + '.csv'
df_final.to_csv(output_path)