In [1]:
import pandas as pd
import numpy as np
from scipy import spatial
from multiprocessing import Pool

In [2]:
# Parallelization options: size of the worker pool, and how many chunks of
# the input each worker is handed (total chunks = product of the two).
JOBS_PER_PROCESS = 1
N_PROCESSES = 15

# Input CSVs, given as relative paths.
fire_data_file = '../data/USDA_wildfire_data/california_fires.csv'
weather_data_file = '../data/NOAA_weather_data/2015_california_only_all.csv'

In [3]:
# Weather observations: parse the timestamp column, start every row with the
# ignition flag cleared, and order rows by index.
weather = (
    pd.read_csv(weather_data_file)
    .assign(time=lambda frame: pd.to_datetime(frame['time']), ignition=0)
    .sort_index()
)

# Fire records; their 'date' column is left exactly as read from the CSV.
fires = pd.read_csv(fire_data_file)

# Single-day slices of both tables, handy for quick serial experiments.
sample_fires = fires.loc[fires['date'] == '2015-12-31']
sample_weather = weather.loc[weather['time'] == '2015-12-31']

In [10]:
def bin_fire_ignitions(fire, weather_df=None):
    '''Snap one fire record to the nearest weather grid point on its date.

    The weather rows whose 'time' equals the fire's date are indexed in a
    KD-tree on (lon, lat) and the fire's own (lon, lat) is looked up in it.
    "Nearest" is therefore plain Euclidean distance in raw lon/lat values,
    not a great-circle distance.

    Parameters
    ----------
    fire : pandas.Series or mapping
        One fire record providing 'date', 'lat' and 'lon'.
    weather_df : pandas.DataFrame, optional
        Weather observations with 'time', 'lat' and 'lon' columns. When
        omitted, the notebook-level ``weather`` frame is used, so the
        function still works as a one-argument callback for
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    pandas.Series
        ``[date, lat, lon]`` with the lat/lon of the nearest weather row for
        that date. Three NaNs are returned instead when there is no weather
        data for the date, when the fire's coordinates are not finite, or
        when a lookup inside the search raises KeyError.

    Raises
    ------
    KeyError
        If ``fire`` has no 'date' entry (that lookup is deliberately outside
        the guarded section, as before).
    '''
    if weather_df is None:
        weather_df = weather

    date = fire['date']
    try:
        day = weather_df[weather_df['time'] == date]
        if len(day) == 0:
            return(pd.Series([np.nan, np.nan, np.nan]))

        fire_location = np.array([[fire['lon'], fire['lat']]], dtype=float)
        # A NaN/inf coordinate cannot be matched to a grid point; report it
        # as "no match" rather than letting the KD-tree query fail.
        if not np.isfinite(fire_location).all():
            return(pd.Series([np.nan, np.nan, np.nan]))

        bin_tree = spatial.cKDTree(np.column_stack([day['lon'], day['lat']]))
        _, index = bin_tree.query(fire_location)
        nearest = index[0]

        # Read the coordinates back by column name: the tree was built from
        # the named columns, so positional access would silently depend on
        # the CSV's column order.
        return(pd.Series([date, day['lat'].iloc[nearest], day['lon'].iloc[nearest]]))

    except KeyError:
        return(pd.Series([np.nan, np.nan, np.nan]))
    
def parallel_apply(df, row_function=None):
    '''Apply a row-wise function to one chunk of a dataframe.

    Intended as the per-chunk worker handed to ``parallelize``.

    Parameters
    ----------
    df : pandas.DataFrame
        The chunk to process.
    row_function : callable, optional
        Called once per row (``axis=1``). Defaults to
        ``bin_fire_ignitions``, so existing one-argument calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        The result of ``df.apply(row_function, axis=1)``. For an empty chunk
        a single all-NaN row with three columns is returned instead, so that
        every chunk result has the same type and column layout when the
        chunks are concatenated. The all-NaN row is the same "no match"
        marker ``bin_fire_ignitions`` produces.
    '''
    if row_function is None:
        row_function = bin_fire_ignitions

    result = df.apply(row_function, axis=1)
    if len(result) > 0:
        return(result)

    # Previously a 3-element Series was returned here; concatenating that
    # with DataFrame chunks produced three misaligned rows (or a Series when
    # every chunk was empty).
    return(pd.DataFrame([[np.nan, np.nan, np.nan]]))
    
def parallelize(function, df, n_processes, jobs_per_process):
    '''Run a function over row-chunks of a dataframe and join the results.

    The dataframe is cut into contiguous row chunks, ``function`` is called
    once per chunk, and the per-chunk results are concatenated in the
    original row order.

    Parameters
    ----------
    function : callable
        Takes one dataframe chunk and returns a pandas object. Must be
        picklable when ``n_processes`` is greater than 1.
    df : pandas.DataFrame
        The data to split.
    n_processes : int
        Number of worker processes. With exactly 1 the chunks are processed
        serially in the current process and no pool is created.
    jobs_per_process : int
        Chunks per worker; the chunk count is
        ``n_processes * jobs_per_process``, capped at the number of rows so
        that no empty chunks are produced (and never less than 1).

    Returns
    -------
    The ``pd.concat`` of all per-chunk results.
    '''
    n_chunks = max(1, min(n_processes * jobs_per_process, len(df)))

    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
    row_groups = np.array_split(np.arange(len(df)), n_chunks)
    df_split = [df.iloc[rows] for rows in row_groups]

    if n_processes == 1:
        return pd.concat([function(chunk) for chunk in df_split])

    # The context manager shuts the pool down on exit, so no explicit
    # close()/join() is needed afterwards.
    with Pool(n_processes) as pool:
        result = pd.concat(pool.map(function, df_split))

    return result

# Snap every fire to its nearest weather grid point, one chunk per job.
result = parallelize(parallel_apply, fires, N_PROCESSES, JOBS_PER_PROCESS)
# Serial alternative for a quick single-day check:
#result = sample_fires.apply(bin_fire_ignitions, axis=1)

# Build the cleaned frame as a NEW object. The previous `cleaned_result = result`
# was only an alias, so the in-place dropna/reset_index/rename also rewrote
# `result` and made the cell's effect depend on how often it had been run.
# Rows that are all-NaN mark fires with no matching weather data.
cleaned_result = result.dropna().reset_index(drop=True)
cleaned_result.columns = ['date', 'lat', 'lon']
cleaned_result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7365 entries, 0 to 7364
Data columns (total 3 columns):
date    7365 non-null object
lat     7365 non-null float64
lon     7365 non-null float64
dtypes: float64(2), object(1)
memory usage: 172.7+ KB


In [17]:
cleaned_result.head()

Unnamed: 0,date,lat,lon
0,2015-07-08,38.6755,-119.7325
1,2015-08-12,38.15829,-119.2505
2,2015-06-29,38.39312,-119.6706
3,2015-06-15,38.39312,-119.6706
4,2015-06-27,38.39312,-119.6706


In [None]:
# Flag every weather row whose (time, lat, lon) equals a binned fire.
# A single hashed membership test replaces the per-fire Python loop, which
# rescanned the entire weather frame three times for each of the fires.
# Matching is still exact equality on all three keys; the lat/lon values in
# cleaned_result were copied from the weather rows, so they compare equal.
fire_keys = pd.MultiIndex.from_arrays([
    pd.to_datetime(cleaned_result['date']),
    cleaned_result['lat'],
    cleaned_result['lon'],
])
weather_keys = pd.MultiIndex.from_arrays([
    weather['time'],
    weather['lat'],
    weather['lon'],
])
weather.loc[weather_keys.isin(fire_keys), 'ignition'] = 1

In [26]:
# Spot check: ignition flag of the weather row at one binned fire location
# (date and coordinates taken from the cleaned_result preview).
weather.loc[
    weather['time'].eq('2015-08-12')
    & weather['lat'].eq(38.15829)
    & weather['lon'].eq(-119.2505),
    'ignition',
]

731694    0
Name: ignition, dtype: int64