### Goal ###
Find the nearest [California Data Exchange Center](https://cdec.water.ca.gov/) weather station to each of our California bins. This information will be used to assign each bin weather variable values.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool
%matplotlib inline
%load_ext memory_profiler

Since we have many bins which do not contain a weather monitoring station, the next step is to fill in their values from the nearest station. First find the nearest station to each bin. To do this we will use scipy.spatial.

In [2]:
# Spatial bins that need a weather station assigned to them.
bins = pd.read_csv('../data/spatial_data/california_bins.csv')

# Candidate weather stations; the columns are renamed positionally.
stations = pd.read_csv('../data/CDEC_weather_station_data/target_stations.csv')
stations.columns = ['station','elevation', 'lat', 'long']
# Keep only stations whose longitude is non-zero.
# NOTE(review): presumably 0 marks a missing coordinate - confirm.
stations = stations.loc[stations['long'] != 0]

# (long, lat) pairs, one row per bin / per station, for the nearest-neighbour search.
bin_array = np.column_stack((bins['long'], bins['lat']))
station_array = np.column_stack((stations['long'], stations['lat']))

In [3]:
from scipy import spatial

# Index the station coordinates in a k-d tree, then ask for the single
# nearest station to every bin.
station_tree = spatial.cKDTree(station_array)
dist, indexes = station_tree.query(bin_array)

# `indexes` holds row positions into `stations`; column 0 is the station name,
# so one positional selection replaces a per-bin Python loop.
nearest_station_names = stations.iloc[indexes, 0].tolist()
bins['nearest_station_name'] = nearest_station_names

Starting with the temperature dataset as a test case, we will now make an hourly time series spanning from 2006 to 2016 and assign each bin a temperature from its nearest station each hour.

In [4]:
# Silence FutureWarnings for the rest of the notebook.
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

# Read only the columns we need and index the observations by timestamp.
temp_data = pd.read_csv("../data/weather_data/TEMP_1d.csv", parse_dates = ['OBS_DATE'], usecols = ["STATION_ID", "OBS_DATE", "VALUE"], index_col = "OBS_DATE")
# Rows whose VALUE is the '---' placeholder (or any other non-numeric text)
# carry no reading. Comparing against the string only hides those rows and
# leaves the column as text when the placeholder is present, so coerce to
# numbers and drop what could not be parsed.
temp_data['VALUE'] = pd.to_numeric(temp_data['VALUE'], errors = 'coerce')
temp_data = temp_data.dropna(subset = ['VALUE'])
temp_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5622044 entries, 2015-01-01 23:00:00 to 2016-01-01 23:00:00
Data columns (total 2 columns):
STATION_ID    object
VALUE         int64
dtypes: int64(1), object(1)
memory usage: 128.7+ MB


Here is an example of a station which does not report regularly on the hour. I am sure there are many other cases of this in the dataset. There are also almost certainly missing values. To fix this, we will resample and interpolate to a regular hourly frequency over the whole dataset.

In [5]:
# Pull out the observations of a single station ('BUD') as a worked example.
bud_station = temp_data.loc[temp_data.STATION_ID.eq('BUD')]
bud_station.head()

Unnamed: 0_level_0,STATION_ID,VALUE
OBS_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01 23:00:00,BUD,33
2015-01-01 23:15:00,BUD,33
2015-01-01 23:30:00,BUD,33
2015-01-01 23:45:00,BUD,33
2015-01-02 00:00:00,BUD,33


In [6]:
# Keep the first row for each timestamp: upsampling reindexes onto a regular
# grid, which pandas cannot do on an index with duplicate labels.
bud_station_min = bud_station.loc[~bud_station.index.duplicated()]
# Interpolate on a one-minute grid first so that off-the-hour readings
# (e.g. the 15-minute reports above) feed into the hourly values.
bud_station_min = bud_station_min.resample('min').interpolate(method = 'linear')
# 'h' rather than 'H': the upper-case alias is deprecated since pandas 2.2.
bud_station_hr = bud_station_min.resample('h').interpolate(method = 'linear')
# Restore the station label on every hourly row.
bud_station_hr['STATION_ID'] = 'BUD'
bud_station_hr.head()

Unnamed: 0_level_0,STATION_ID,VALUE
OBS_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01 23:00:00,BUD,33.0
2015-01-02 00:00:00,BUD,33.0
2015-01-02 01:00:00,BUD,33.0
2015-01-02 02:00:00,BUD,32.0
2015-01-02 03:00:00,BUD,32.0


In [7]:
# def regularize(group):
#     #group = group.drop_duplicates()
#     group = group.loc[~group.index.duplicated()]
#     group = group.resample('min')
#     group = group.interpolate(method = 'linear')
#     group = group.resample('H')
#     group = group.interpolate(method = 'linear')
#     return group
    
# grouped_temp_data = temp_data.groupby('STATION_ID')
# st_regularized_temp_data = grouped_temp_data.apply(regularize)

# st_regularized_temp_data['STATION_ID'] = st_regularized_temp_data.index.get_level_values(0)
# st_regularized_temp_data = st_regularized_temp_data.reset_index(level = 0, drop = True)
# st_regularized_temp_data = st_regularized_temp_data.set_index(['STATION_ID'], append = True)
# st_regularized_temp_data['VALUE'].replace('', np.nan, inplace = True)
# st_regularized_temp_data.dropna(subset = ['VALUE'], inplace = True)
# st_regularized_temp_data = st_regularized_temp_data.loc[~st_regularized_temp_data.index.duplicated()]
# st_regularized_temp_data.info()

Parallelize it?!

In [8]:
# Default number of worker processes that parallelize() below hands to multiprocessing.Pool.
n_threads = 14

def regularize(group):
    '''Resample one station's observations onto a regular hourly grid.

    Parameters
    ----------
    group : pandas.DataFrame
        Observations indexed by timestamp (``resample`` needs a
        datetime-like index).

    Returns
    -------
    pandas.DataFrame
        One row per hour between the first and last timestamp of ``group``,
        with gaps filled by linear interpolation.
    '''
    # Upsampling reindexes onto a regular grid, which fails on repeated
    # timestamps, so keep only the first row for each timestamp.
    group = group.loc[~group.index.duplicated()]
    # Interpolate on a one-minute grid first so that readings taken at
    # off-the-hour minutes still feed into the hourly values.
    per_minute = group.resample('min').interpolate(method = 'linear')
    # 'h' rather than 'H': the upper-case alias is deprecated since pandas 2.2.
    return per_minute.resample('h').interpolate(method = 'linear')

def group_data(temp_data_split):
    '''Regularize every station found in one chunk of the temperature data.

    Groups ``temp_data_split`` by its STATION_ID column, runs ``regularize``
    on each station's rows, and returns what ``groupby(...).apply``
    assembles from the per-station results.
    '''
    return temp_data_split.groupby('STATION_ID').apply(regularize)
    
def parallelize(temp_data, func, n_cores = n_threads):
    '''Run ``func`` over the temperature data in parallel, one chunk per worker.

    Parameters
    ----------
    temp_data : pandas.DataFrame
        Observations with a STATION_ID column.
    func : callable
        Applied to each chunk (a DataFrame) in a worker process; must be
        picklable and return a DataFrame.
    n_cores : int
        Maximum number of worker processes.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in chunk order.
    '''
    # Split by station, not by row position: a positional split can cut one
    # station's series across two workers, and then the interpolation never
    # bridges the cut.
    station_ids = temp_data['STATION_ID'].unique()
    n_chunks = max(1, min(n_cores, len(station_ids)))
    station_chunks = np.array_split(station_ids, n_chunks)
    temp_data_split = [
        temp_data[temp_data['STATION_ID'].isin(chunk)]
        for chunk in station_chunks
    ]
    # The context manager releases the workers even when func raises.
    with Pool(n_chunks) as pool:
        result = pd.concat(pool.map(func, temp_data_split))
    return result
    
%timeit mt_regularized_temp_data = parallelize(temp_data, group_data)

mt_regularized_temp_data['STATION_ID'] = mt_regularized_temp_data.index.get_level_values(0)
mt_regularized_temp_data = mt_regularized_temp_data.reset_index(level = 0, drop = True)
mt_regularized_temp_data = mt_regularized_temp_data.set_index(['STATION_ID'], append = True)
mt_regularized_temp_data['VALUE'].replace('', np.nan, inplace = True)
mt_regularized_temp_data.dropna(subset = ['VALUE'], inplace = True)
mt_regularized_temp_data = mt_regularized_temp_data.loc[~mt_regularized_temp_data.index.duplicated()]
mt_regularized_temp_data.info()

Process ForkPoolWorker-9:
Traceback (most recent call last):
  File "/home/siderealyear/anaconda3/envs/wildfire/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/siderealyear/anaconda3/envs/wildfire/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/siderealyear/anaconda3/envs/wildfire/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/siderealyear/anaconda3/envs/wildfire/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-8-62852d4194c3>", line 15, in group_data
    return grouped_data.apply(regularize)
  File "/home/siderealyear/anaconda3/envs/wildfire/lib/python3.6/site-packages/pandas/core/groupby/groupby.py", line 725, in apply
    result = self._python_apply_general(f)
  File "/home/siderealyear/anaconda3/envs/wildfire/lib/python3.6/site-packages/

KeyboardInterrupt: 

  File "/home/siderealyear/anaconda3/envs/wildfire/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/siderealyear/anaconda3/envs/wildfire/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-8-62852d4194c3>", line 15, in group_data
    return grouped_data.apply(regularize)
  File "/home/siderealyear/anaconda3/envs/wildfire/lib/python3.6/site-packages/pandas/core/groupby/groupby.py", line 725, in apply
    result = self._python_apply_general(f)
  File "/home/siderealyear/anaconda3/envs/wildfire/lib/python3.6/site-packages/pandas/core/groupby/groupby.py", line 742, in _python_apply_general
    keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
  File "/home/siderealyear/anaconda3/envs/wildfire/lib/python3.6/site-packages/pandas/core/groupby/ops.py", line 237, in apply
    res = f(group)
  File "<ipython-input-8-62852d4194c3>", line 7, in reg

This next big block uses resampling to bin the temperature data at a resolution of six hours. In early runs I had memory issues, so I added this. Hoping I won't need it in the final product. Keeping the code here in case I want to use it again.

In [None]:
# n_threads = 14

# def downsample_timeseries(group):
#     group = group.sort_index()
#     group.loc[:,'resampled_value'] = group.VALUE.rolling('6H').mean()
#     group = group.drop(['VALUE'], axis = 1)
#     return group.iloc[0::12, :]

# def group_timeseries_data(stations):
#     data = temp_data.loc[temp_data['STATION_ID'].isin(stations)]
#     grouped_data = data.groupby('STATION_ID')
#     return grouped_data.apply(downsample_timeseries)
    
# def parallelize(stations, func, n_cores = n_threads):
#     '''Parallelizes downsampling, takes list of stations and
#     splits up the downsampling fuction over avalibile threads'''
#     stations_split = np.array_split(stations, n_cores)
#     pool = Pool(n_cores)
#     result = pd.concat(pool.map(func, stations_split))
#     pool.close()
#     pool.join()
#     return result
    
# stations = temp_data['STATION_ID'].unique().tolist()
# binned_temp_data = parallelize(stations, group_timeseries_data)
# binned_temp_data = binned_temp_data.reset_index(level = 0, drop = True)
# binned_temp_data = binned_temp_data.set_index(['STATION_ID'], append = True)
# binned_temp_data.head()

In [None]:
#binned_temp_data.to_csv('../data/training_data/weather_data/TEMP_1yr_binned_6hr.csv')

In [None]:
# Hourly timestamps covering the test window, both endpoints included.
# 'h' rather than 'H': the upper-case alias is deprecated since pandas 2.2.
time_series = pd.date_range("2015-01-01 23:00:00", "2015-01-02 23:00:00", freq = "h")
# Convert the index to a Series of timestamps.
time_series = time_series.to_series()

Now we make a dataframe of 'noxels', or n-dimensional voxels. Each row will be a bin at a specific time. Once we have this dataframe, we can go back through and assign weather variable values to each noxel.

In [None]:
def apply_to_bins(time_series, bins):
    '''Return a copy of ``bins`` with a ``time`` column set to ``time_series``.

    Despite its name, ``time_series`` is used here as the value stamped onto
    every row (the caller passes one timestamp at a time). ``bins`` itself is
    left untouched.
    '''
    stamped_bins = bins.assign(time = time_series)
    return stamped_bins

# One copy of the bin table per timestamp, stacked into a single frame:
# every row is one bin at one point in time.
noxels = pd.concat([apply_to_bins(timestamp, bins) for timestamp in time_series])
noxels.to_csv('../data/noxels.csv', index = False)
noxels.info()

In [None]:
# Preview the first rows of the noxel table.
noxels.head()

In [None]:
# Spot check: read one regularized temperature by its (timestamp, station) label.
mt_regularized_temp_data.at[('2015-01-01 23:00:00', 'ACN'), 'VALUE']

In [None]:
# Default number of worker processes that parallelize() below hands to multiprocessing.Pool.
n_threads = 14

def try_except(row):
    '''Look up the temperature for one noxel, or NaN when there is none.

    Parameters
    ----------
    row : mapping
        Must provide ``'time'`` and ``'nearest_station_name'``; together they
        form the (time, station) label looked up in the global
        ``mt_regularized_temp_data``.

    Returns
    -------
    The ``VALUE`` entry stored under that label, or ``np.nan`` if the label
    is absent.
    '''
    try:
        return mt_regularized_temp_data.loc[(row['time'], row['nearest_station_name']), 'VALUE']
    # Catch only lookup failures. A bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit, so a running worker could not be stopped.
    except (KeyError, TypeError):
        return np.nan
    
def add_data(noxels):
    '''Return ``noxels`` with a ``temp`` column looked up row by row.

    Each row is passed to ``try_except``, which yields the temperature for the
    row's time and nearest station (NaN when none is found). A new frame is
    returned; the argument is not modified, so passing in a slice of a larger
    frame is safe.
    '''
    return noxels.assign(temp = noxels.apply(try_except, axis = 1))

def parallelize(noxels, func, n_cores = n_threads):
    '''Run ``func`` over the noxel table in parallel, one row chunk per worker.

    Parameters
    ----------
    noxels : pandas.DataFrame
        Table to process; it is cut into ``n_cores`` consecutive row chunks.
    func : callable
        Applied to each chunk (a DataFrame) in a worker process; must be
        picklable and return a DataFrame.
    n_cores : int
        Number of chunks and of worker processes.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in their original order.
    '''
    # Chunk by row position with iloc rather than handing the DataFrame itself
    # to np.array_split; the resulting row groups are the same.
    row_chunks = np.array_split(np.arange(len(noxels)), n_cores)
    noxels_split = [noxels.iloc[rows] for rows in row_chunks]
    # The context manager releases the workers even when func raises.
    with Pool(n_cores) as pool:
        result = pd.concat(pool.map(func, noxels_split))
    return result

%timeit noxels = parallelize(noxels, add_data)

noxels = noxels.dropna()
noxels.to_csv('../data/noxels_TEMP.csv', index = False)
noxels.head()

In [None]:
# Summarize the noxel table (row count, columns, dtypes, memory).
noxels.info()

OK, so it is working to a first approximation. I am worried that we are losing a bunch of data somewhere. Let's check some lengths and see. If we filled every bin with a temperature value, then we should have bins x time_series rows in our data file...

In [None]:
# Percentage of all possible (bin, timestamp) rows that are present in noxels.
(len(noxels) / (len(bins) * len(time_series))) * 100