In [17]:
import pandas as pd

In [18]:
weather_data_file = '../data/NOAA_weather_data/2005-2015_california_only_all.csv'
fire_data_file = '../data/USDA_wildfire_data/regridded_california_fires.csv'
training_data_file = '../data/training_data/2005-2015_training_data.csv'

# Parallelization options
N_PROCESSES = 15
JOBS_PER_PROCESS = 1

In [19]:
weather = pd.read_csv(weather_data_file)
weather['time'] = pd.to_datetime(weather['time'])
weather['date'] = weather['time'].dt.date

fires = pd.read_csv(fire_data_file)
fires['ignition'] = 1

In [20]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11978560 entries, 0 to 11978559
Data columns (total 14 columns):
time        datetime64[ns]
lat         float64
lon         float64
air.sfc     float64
air.2m      float64
apcp        float64
crain       float64
rhum.2m     float64
dpt.2m      float64
pres.sfc    float64
uwnd.10m    float64
vwnd.10m    float64
veg         float64
date        object
dtypes: datetime64[ns](1), float64(12), object(1)
memory usage: 1.2+ GB


In [21]:
fires.head()

Unnamed: 0,lat,lon,date,time,size,size_class,ignition
0,39.93427,-121.1598,2005-02-02,1300.0,0.1,A,1
1,38.8577,-120.5208,2004-05-12,845.0,0.25,A,1
2,39.13992,-120.5874,2004-05-31,1921.0,0.1,A,1
3,38.62627,-120.094,2004-06-28,1600.0,0.1,A,1
4,38.62627,-120.094,2004-06-28,1600.0,0.1,A,1


In [22]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189550 entries, 0 to 189549
Data columns (total 7 columns):
lat           189550 non-null float64
lon           189550 non-null float64
date          189550 non-null object
time          110217 non-null float64
size          189550 non-null float64
size_class    189550 non-null object
ignition      189550 non-null int64
dtypes: float64(4), int64(1), object(2)
memory usage: 10.1+ MB


In [23]:
training_data = pd.merge(weather, fires, on=['lat', 'lon', 'date'], how='outer')

In [24]:
training_data.head()

Unnamed: 0,time_x,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,uwnd.10m,vwnd.10m,veg,date,time_y,size,size_class,ignition
0,2005-01-01 00:00:00,40.29749,-124.3408,284.87308,283.2486,1.40625,1.0,77.47234,279.43463,98986.18,5.984521,3.811233,0.0,2005-01-01,,,,
1,2005-01-01 03:00:00,40.29749,-124.3408,284.71484,282.99774,0.78125,1.0,78.82938,279.48016,99001.15,3.470335,2.58194,0.0,2005-01-01,,,,
2,2005-01-01 06:00:00,40.29749,-124.3408,284.62878,282.55124,0.234375,1.0,77.27717,278.75513,99287.914,2.909558,1.104767,0.0,2005-01-01,,,,
3,2005-01-01 09:00:00,40.29749,-124.3408,284.64642,282.59882,1.875,1.0,79.5371,279.22357,99256.03,2.127604,4.75148,0.0,2005-01-01,,,,
4,2005-01-01 12:00:00,40.29749,-124.3408,284.74457,282.18732,5.335938,1.0,80.818474,279.02075,99279.95,2.699432,6.547989,0.0,2005-01-01,,,,


In [25]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12168110 entries, 0 to 12168109
Data columns (total 18 columns):
time_x        datetime64[ns]
lat           float64
lon           float64
air.sfc       float64
air.2m        float64
apcp          float64
crain         float64
rhum.2m       float64
dpt.2m        float64
pres.sfc      float64
uwnd.10m      float64
vwnd.10m      float64
veg           float64
date          object
time_y        float64
size          float64
size_class    object
ignition      float64
dtypes: datetime64[ns](1), float64(15), object(2)
memory usage: 1.7+ GB


In [26]:
training_data.rename(columns={'time_x':'weather_bin_time','time_y':'fire_discovery_time'}, inplace=True)
training_data.drop(['date'], axis=1, inplace=True)
training_data.head()

Unnamed: 0,weather_bin_time,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,uwnd.10m,vwnd.10m,veg,fire_discovery_time,size,size_class,ignition
0,2005-01-01 00:00:00,40.29749,-124.3408,284.87308,283.2486,1.40625,1.0,77.47234,279.43463,98986.18,5.984521,3.811233,0.0,,,,
1,2005-01-01 03:00:00,40.29749,-124.3408,284.71484,282.99774,0.78125,1.0,78.82938,279.48016,99001.15,3.470335,2.58194,0.0,,,,
2,2005-01-01 06:00:00,40.29749,-124.3408,284.62878,282.55124,0.234375,1.0,77.27717,278.75513,99287.914,2.909558,1.104767,0.0,,,,
3,2005-01-01 09:00:00,40.29749,-124.3408,284.64642,282.59882,1.875,1.0,79.5371,279.22357,99256.03,2.127604,4.75148,0.0,,,,
4,2005-01-01 12:00:00,40.29749,-124.3408,284.74457,282.18732,5.335938,1.0,80.818474,279.02075,99279.95,2.699432,6.547989,0.0,,,,


In [27]:
training_data.describe()

Unnamed: 0,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,uwnd.10m,vwnd.10m,veg,fire_discovery_time,size,ignition
count,12168110.0,12168110.0,11978560.0,11978560.0,11978560.0,11978560.0,11978560.0,11978560.0,11978560.0,11978560.0,11978560.0,11978560.0,110217.0,189550.0,189550.0
mean,37.12547,-119.53,288.9527,287.9476,0.1620509,0.05757771,47.86626,274.1508,91531.9,1.059241,0.1075154,30.63439,1383.752742,67.242725,1.0
std,2.556212,2.502004,13.92015,10.23177,0.9353815,0.2329432,26.40591,7.587456,6966.894,2.732453,3.306149,22.75454,490.324275,2029.960435,0.0
min,32.68389,-124.3408,242.2595,246.5075,0.0,0.0,1.0,192.1768,66243.55,-20.30144,-20.00592,0.0,0.0,0.001,1.0
25%,34.99029,-121.4709,278.8525,280.7825,0.0,0.0,25.21898,269.162,87159.47,-0.7443695,-2.100845,8.2,1115.0,0.1,1.0
50%,36.83767,-119.9051,286.5459,286.9761,0.0,0.0,43.79562,274.6586,92329.52,1.0385,0.06468201,30.55,1428.0,0.25,1.0
75%,39.2597,-117.4117,297.4096,294.7501,0.0,0.0,69.7156,279.7777,97086.5,2.748856,2.251846,43.6,1715.0,1.0,1.0
max,41.97182,-114.3645,338.4141,322.9702,41.48438,1.0,100.0312,299.3494,103585.2,21.56605,29.87953,83.7,2359.0,315578.8,1.0


In [28]:
training_data.to_csv(training_data_file, index=False)