In [1]:
import pandas as pd

In [2]:
# Input datasets (relative paths).
weather_data_file = (
    "../data/NOAA_weather_data/"
    "2005-2015_california_box_all.csv"
)
fire_data_file = (
    "../data/USDA_wildfire_data/"
    "regridded_california_fires.csv"
)

# Destination CSV for the merged weather + fire table written at the end
# of the notebook.
training_data_file = (
    "../data/training_data/"
    "2005-2015_training_data_box.csv"
)

# Parallelization options
JOBS_PER_PROCESS = 1
N_PROCESSES = 15

In [3]:
# --- Weather observations ---
weather = pd.read_csv(weather_data_file)
weather['time'] = pd.to_datetime(weather['time'])
# Calendar day of each observation, used later as a join key against the fire
# records. normalize() floors every timestamp to midnight and stays in
# datetime64, which avoids building one Python `date` object per row and then
# parsing them all back (the frame has tens of millions of rows).
weather['date'] = weather['time'].dt.normalize()

# --- Fire records ---
fires = pd.read_csv(fire_data_file)
# Every row in the fire file is an ignition event; mark it so the flag
# survives the merge with the weather grid.
fires['ignition'] = 1
fires['date'] = pd.to_datetime(fires['date'])

In [4]:
# Summarize the loaded weather frame: row count, column dtypes and memory use.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44669040 entries, 0 to 44669039
Data columns (total 14 columns):
time        datetime64[ns]
lat         float64
lon         float64
air.sfc     float64
air.2m      float64
apcp        float64
crain       float64
rhum.2m     float64
dpt.2m      float64
pres.sfc    float64
uwnd.10m    float64
vwnd.10m    float64
veg         float64
date        datetime64[ns]
dtypes: datetime64[ns](2), float64(12)
memory usage: 4.7 GB


In [5]:
# Preview the first rows of the fire records (including the added 'ignition' flag).
fires.head()

Unnamed: 0,lat,lon,date,time,size,size_class,ignition
0,39.93427,-121.1598,2005-02-02,1300.0,0.1,A,1
1,38.8577,-120.5208,2004-05-12,845.0,0.25,A,1
2,39.13992,-120.5874,2004-05-31,1921.0,0.1,A,1
3,38.62627,-120.094,2004-06-28,1600.0,0.1,A,1
4,38.62627,-120.094,2004-06-28,1600.0,0.1,A,1


In [6]:
# Preview the last rows of the fire records.
fires.tail()

Unnamed: 0,lat,lon,date,time,size,size_class,ignition
189545,40.32918,-122.4153,2015-09-26,1726.0,0.01,A,1
189546,37.62744,-120.971,2015-10-05,126.0,0.2,A,1
189547,37.62744,-120.971,2015-05-02,2052.0,0.1,A,1
189548,37.62744,-120.971,2015-10-14,2309.0,2.0,B,1
189549,34.15059,-116.7372,2015-03-14,2128.0,0.1,A,1


In [7]:
# Attach fire records to the weather grid. Every weather row is kept
# (left join); fire columns are NaN where no fire shares the same
# lat / lon / date.
# NOTE(review): 'ignition' is therefore 1 or NaN, never 0 -- confirm that
# downstream consumers fill the missing values.
# NOTE(review): the merged frame has more rows than `weather`, so some weather
# rows match more than one fire record -- confirm the duplication is intended.
join_keys = ['lat', 'lon', 'date']
training_data = weather.merge(fires, how='left', on=join_keys)

In [8]:
# Preview the merged frame. Both inputs have a 'time' column, so the merge
# exposes them as time_x (weather) and time_y (fires).
training_data.head()

Unnamed: 0,time_x,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,uwnd.10m,vwnd.10m,veg,date,time_y,size,size_class,ignition
0,2005-01-01,31.63322,-125.3449,289.18558,287.6236,0.0,0.0,81.03484,284.37213,101686.18,7.070458,-3.266892,0.0,2005-01-01,,,,
1,2005-01-01,31.90358,-125.4247,289.18558,287.6236,0.0,0.0,81.03484,284.37213,101686.18,7.492333,-2.751267,0.0,2005-01-01,,,,
2,2005-01-01,31.70029,-125.0272,289.24808,287.6236,0.0,0.0,79.84734,284.12994,101686.18,7.117333,-2.696579,0.0,2005-01-01,,,,
3,2005-01-01,31.97103,-125.1057,288.99808,287.5611,0.0,0.0,82.65984,284.61432,101686.18,7.507958,-2.063767,0.0,2005-01-01,,,,
4,2005-01-01,32.24205,-125.1849,288.81058,287.4361,0.0,0.0,82.65984,284.4815,101686.18,7.664208,-1.610642,0.0,2005-01-01,,,,


In [9]:
# Sanity check: count missing values in a weather column after the merge.
pd.isnull(training_data['air.sfc']).sum()

0

In [10]:
# Inspect row count, dtypes and memory use of the merged frame.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44802584 entries, 0 to 44802583
Data columns (total 18 columns):
time_x        datetime64[ns]
lat           float64
lon           float64
air.sfc       float64
air.2m        float64
apcp          float64
crain         float64
rhum.2m       float64
dpt.2m        float64
pres.sfc      float64
uwnd.10m      float64
vwnd.10m      float64
veg           float64
date          datetime64[ns]
time_y        float64
size          float64
size_class    object
ignition      float64
dtypes: datetime64[ns](2), float64(15), object(1)
memory usage: 6.3+ GB


In [11]:
# Give the two colliding 'time' columns descriptive names: time_x came from
# the weather frame and time_y from the fire records. The frame is several GB,
# so the rename is kept in place; re-running it is harmless because labels
# that are no longer present are simply skipped.
training_data.rename(
    columns={'time_x': 'weather_bin_time', 'time_y': 'fire_discovery_time'},
    inplace=True,
)
# 'date' was only needed as a join key. errors='ignore' keeps this cell
# re-runnable: without it, a second execution raises KeyError because the
# column has already been removed.
training_data.drop(columns=['date'], inplace=True, errors='ignore')
training_data.head()

Unnamed: 0,weather_bin_time,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,uwnd.10m,vwnd.10m,veg,fire_discovery_time,size,size_class,ignition
0,2005-01-01,31.63322,-125.3449,289.18558,287.6236,0.0,0.0,81.03484,284.37213,101686.18,7.070458,-3.266892,0.0,,,,
1,2005-01-01,31.90358,-125.4247,289.18558,287.6236,0.0,0.0,81.03484,284.37213,101686.18,7.492333,-2.751267,0.0,,,,
2,2005-01-01,31.70029,-125.0272,289.24808,287.6236,0.0,0.0,79.84734,284.12994,101686.18,7.117333,-2.696579,0.0,,,,
3,2005-01-01,31.97103,-125.1057,288.99808,287.5611,0.0,0.0,82.65984,284.61432,101686.18,7.507958,-2.063767,0.0,,,,
4,2005-01-01,32.24205,-125.1849,288.81058,287.4361,0.0,0.0,82.65984,284.4815,101686.18,7.664208,-1.610642,0.0,,,,


In [12]:
# Summary statistics for the numeric columns. The fire-derived columns
# (fire_discovery_time, size, ignition) have much lower counts because they
# are NaN on rows without a matching fire.
training_data.describe()

Unnamed: 0,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,uwnd.10m,vwnd.10m,veg,fire_discovery_time,size,ignition
count,44802580.0,44802580.0,44802580.0,44802580.0,44802580.0,44802580.0,44802580.0,44802580.0,44802580.0,44802580.0,44802580.0,44802580.0,503968.0,671296.0,671296.0
mean,37.03931,-119.3083,287.8614,286.9462,0.105617,0.05184511,59.66921,276.6226,92661.92,1.65417,-1.001125,14.22232,1388.990682,84.102874,1.0
std,3.307911,3.560928,11.96939,9.167043,0.6497919,0.2217142,29.41525,8.844924,8519.585,2.919533,4.210851,19.80744,496.087687,2512.417034,0.0
min,31.53068,-125.4445,238.766,244.6787,0.0,0.0,1.0,192.1248,66243.55,-20.30144,-20.67075,0.0,0.0,0.001,1.0
25%,34.14266,-122.395,281.0404,282.215,0.0,0.0,31.5,269.6379,84566.65,-0.3768673,-3.444094,0.0,1122.0,0.1,1.0
50%,36.93702,-119.3113,287.4534,287.0248,0.0,0.0,64.09358,277.0536,93865.06,1.595291,-0.7750001,4.2,1430.0,0.25,1.0
75%,39.8743,-116.2261,292.0649,291.1728,0.0078125,0.0,89.59193,284.5935,101453.1,3.534164,1.86144,21.3,1722.0,1.0,1.0
max,42.99324,-113.1324,338.4141,322.9702,41.97656,1.0,100.0312,304.4732,104191.6,22.68434,29.87953,83.7,2359.0,315578.8,1.0


In [None]:
# Persist the merged table as CSV at training_data_file, without writing the
# DataFrame index as a column.
training_data.to_csv(training_data_file, index=False)