In [1]:
import pandas as pd

In [2]:
# Input CSVs: NOAA weather data and the regridded USDA wildfire records.
weather_data_file = '../data/NOAA_weather_data/1992-1997_california_box_all.csv'
fire_data_file = '../data/USDA_wildfire_data/regridded_california_fires.csv'

# Output CSV that receives the merged weather + fire table (see the to_csv cell).
training_data_file = '../data/training_data/1992-1997_training_data_box.csv'

# Parallelization options.
# NOTE(review): neither constant is referenced by the cells visible here.
N_PROCESSES, JOBS_PER_PROCESS = 15, 1

In [3]:
# Load the weather table and parse its timestamp column.
weather = pd.read_csv(weather_data_file)
weather['time'] = pd.to_datetime(weather['time'])

# Day-resolution key used to join weather rows to fire records.
# dt.normalize() truncates each timestamp to midnight while staying in
# datetime64[ns]; this yields the same values as going through .dt.date and
# pd.to_datetime again, without building one Python date object per row.
weather['date'] = weather['time'].dt.normalize()

# Load the fire records. Every row in this file is a fire, so flag each one
# with ignition = 1, and parse the date so it is comparable with weather['date'].
fires = pd.read_csv(fire_data_file)
fires['ignition'] = 1
fires['date'] = pd.to_datetime(fires['date'])

In [4]:
# Inspect the weather frame: row count, column dtypes and memory footprint.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24375040 entries, 0 to 24375039
Data columns (total 23 columns):
time         datetime64[ns]
lat          float64
lon          float64
air.sfc      float64
air.2m       float64
apcp         float64
crain        float64
rhum.2m      float64
dpt.2m       float64
pres.sfc     float64
uwnd.10m     float64
vwnd.10m     float64
veg          float64
dlwrf        float64
dswrf        float64
lcdc         float64
hcdc         float64
mcdc         float64
hpbl         float64
prate        float64
vis          float64
ulwrf.sfc    float64
date         datetime64[ns]
dtypes: datetime64[ns](2), float64(21)
memory usage: 4.2 GB


In [5]:
# Preview the first fire records to check the columns and the added ignition flag.
fires.head()

Unnamed: 0,lat,lon,date,time,size,size_class,ignition
0,39.93427,-121.1598,2005-02-02,1300.0,0.1,A,1
1,38.8577,-120.5208,2004-05-12,845.0,0.25,A,1
2,39.13992,-120.5874,2004-05-31,1921.0,0.1,A,1
3,38.62627,-120.094,2004-06-28,1600.0,0.1,A,1
4,38.62627,-120.094,2004-06-28,1600.0,0.1,A,1


In [6]:
# Preview the last fire records.
# NOTE(review): the rows displayed by head()/tail() are dated 2004-2015, while
# the weather file path names 1992-1997; only fires whose lat/lon/date match a
# weather row will carry over in the left merge that follows.
fires.tail()

Unnamed: 0,lat,lon,date,time,size,size_class,ignition
189545,40.32918,-122.4153,2015-09-26,1726.0,0.01,A,1
189546,37.62744,-120.971,2015-10-05,126.0,0.2,A,1
189547,37.62744,-120.971,2015-05-02,2052.0,0.1,A,1
189548,37.62744,-120.971,2015-10-14,2309.0,2.0,B,1
189549,34.15059,-116.7372,2015-03-14,2128.0,0.1,A,1


In [7]:
# Left-join the fire records onto the weather rows by grid cell and day: every
# weather row is kept, and only rows with a matching fire get fire columns.
# Both frames have a 'time' column, so the merge suffixes them _x / _y.
# NOTE(review): the result recorded in this notebook has more rows than
# `weather` (24,472,488 vs 24,375,040), so some weather rows are repeated --
# presumably because several fire records share one lat/lon/date; confirm
# that this duplication is intended for the training set.
training_data = weather.merge(fires, how='left', on=['lat', 'lon', 'date'])

In [8]:
# Preview the merged frame; rows without a matching fire record show NaN in
# the fire-derived columns (time_y, size, size_class, ignition).
training_data.head()

Unnamed: 0,time_x,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,...,mcdc,hpbl,prate,vis,ulwrf.sfc,date,time_y,size,size_class,ignition
0,1992-01-01,31.63322,-125.3449,288.66638,289.13074,0.367188,1.0,77.1875,285.05447,101573.56,...,17.0,802.15784,3.7e-05,20007.979,382.25,1992-01-01,,,,
1,1992-01-01,31.90358,-125.4247,288.66638,289.13074,0.304688,1.0,77.1875,285.05447,101573.56,...,6.0,538.5578,2.7e-05,20007.979,382.25,1992-01-01,,,,
2,1992-01-01,31.70029,-125.0272,288.66638,289.06824,0.304688,1.0,78.8125,285.3123,101573.56,...,4.0,439.7578,2.7e-05,20007.979,382.3125,1992-01-01,,,,
3,1992-01-01,31.97103,-125.1057,288.41638,288.88074,0.210938,1.0,79.6875,285.33572,101573.56,...,0.0,63.85781,1.7e-05,20007.979,381.1875,1992-01-01,,,,
4,1992-01-01,32.24205,-125.1849,288.22888,288.69324,0.210938,1.0,81.6875,285.4998,101573.56,...,0.0,75.157814,1.7e-05,20007.979,380.0625,1992-01-01,,,,


In [9]:
# Sanity check: number of missing air.sfc values in the merged frame.
training_data['air.sfc'].isna().sum()

0

In [10]:
# Inspect the merged frame: row count, column dtypes and memory footprint.
# ignition was assigned the integer 1 on the fire records but appears here as
# float64, because rows without a matching fire hold NaN.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24472488 entries, 0 to 24472487
Data columns (total 27 columns):
time_x        datetime64[ns]
lat           float64
lon           float64
air.sfc       float64
air.2m        float64
apcp          float64
crain         float64
rhum.2m       float64
dpt.2m        float64
pres.sfc      float64
uwnd.10m      float64
vwnd.10m      float64
veg           float64
dlwrf         float64
dswrf         float64
lcdc          float64
hcdc          float64
mcdc          float64
hpbl          float64
prate         float64
vis           float64
ulwrf.sfc     float64
date          datetime64[ns]
time_y        float64
size          float64
size_class    object
ignition      float64
dtypes: datetime64[ns](2), float64(24), object(1)
memory usage: 5.1+ GB


In [11]:
# Replace the merge-generated _x / _y suffixes on the two 'time' columns with
# descriptive names (time_x came from weather, time_y from fires).
training_data.rename(
    columns={
        'time_x': 'weather_bin_time',
        'time_y': 'fire_discovery_time',
    },
    inplace=True,
)

# Drop the day-level merge key now that the join is done.
# errors='ignore' keeps this cell re-runnable: on a second run 'date' is
# already gone and a plain drop would raise KeyError (rename already skips
# missing labels by default). The in-place form is kept deliberately to avoid
# making extra copies of a ~5 GB frame.
training_data.drop(columns=['date'], errors='ignore', inplace=True)
training_data.head()

Unnamed: 0,weather_bin_time,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,...,hcdc,mcdc,hpbl,prate,vis,ulwrf.sfc,fire_discovery_time,size,size_class,ignition
0,1992-01-01,31.63322,-125.3449,288.66638,289.13074,0.367188,1.0,77.1875,285.05447,101573.56,...,100.0,17.0,802.15784,3.7e-05,20007.979,382.25,,,,
1,1992-01-01,31.90358,-125.4247,288.66638,289.13074,0.304688,1.0,77.1875,285.05447,101573.56,...,100.0,6.0,538.5578,2.7e-05,20007.979,382.25,,,,
2,1992-01-01,31.70029,-125.0272,288.66638,289.06824,0.304688,1.0,78.8125,285.3123,101573.56,...,100.0,4.0,439.7578,2.7e-05,20007.979,382.3125,,,,
3,1992-01-01,31.97103,-125.1057,288.41638,288.88074,0.210938,1.0,79.6875,285.33572,101573.56,...,98.0,0.0,63.85781,1.7e-05,20007.979,381.1875,,,,
4,1992-01-01,32.24205,-125.1849,288.22888,288.69324,0.210938,1.0,81.6875,285.4998,101573.56,...,88.0,0.0,75.157814,1.7e-05,20007.979,380.0625,,,,


In [12]:
# Summary statistics for the numeric columns of the merged frame.
# NOTE(review): in the recorded output ignition has min = max = 1 and a count
# far below the row count, i.e. non-fire rows are NaN rather than 0 --
# presumably consumers of the saved file must treat NaN as "no ignition"; confirm.
training_data.describe()

Unnamed: 0,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,uwnd.10m,...,lcdc,hcdc,mcdc,hpbl,prate,vis,ulwrf.sfc,fire_discovery_time,size,ignition
count,24472490.0,24472490.0,24472490.0,24472490.0,24472490.0,24472490.0,24472490.0,24472490.0,24472490.0,24472490.0,...,24472490.0,24472490.0,24472490.0,24472490.0,24472490.0,24472490.0,24472490.0,145832.0,417928.0,417928.0
mean,37.03913,-119.3081,288.1954,287.1796,0.1303836,0.07508313,60.5868,277.4135,92624.07,1.64309,...,19.55961,19.4207,18.93989,984.9271,1.193938e-05,18428.25,391.1801,1367.416424,43.521856,1.0
std,3.308083,3.559808,11.55349,8.772353,0.6694179,0.2635254,28.35222,8.480812,8523.757,2.889355,...,33.0374,30.50934,33.48661,962.5278,7.059893e-05,4553.001,64.96561,478.308822,995.85963,0.0
min,31.53068,-125.4445,241.688,246.0385,0.0,0.0,1.0,192.1248,67292.53,-19.00815,...,0.0,0.0,0.0,61.86738,-4.760918e-05,4.35149,195.0,0.0,0.1,1.0
25%,34.14266,-122.395,281.5104,282.5154,0.0,0.0,33.75251,270.6477,84522.07,-0.3400774,...,0.0,0.0,0.0,332.1681,-2.286778e-06,20006.65,356.75,1100.0,0.1,1.0
50%,36.93702,-119.3113,287.9692,287.4987,0.0,0.0,64.73834,277.8078,93835.42,1.612701,...,0.0,5.0,0.0,616.4089,0.0,20008.11,380.8125,1417.0,0.2,1.0
75%,39.8743,-116.2261,292.1091,291.1141,0.0078125,0.0,89.40763,285.1461,101447.4,3.486813,...,32.0,22.0,24.0,1373.996,3.961025e-06,20010.15,406.8125,1701.0,1.0,1.0
max,42.99324,-113.1324,336.582,323.3755,52.41433,1.0,100.0312,304.0451,103756.5,20.32946,...,100.0,100.0,100.0,8151.611,0.006394466,20024.4,713.5,2359.0,106668.0,1.0


In [None]:
# Write the merged table to the training-data CSV, omitting the row index.
training_data.to_csv(training_data_file, index=False)