In [1]:
import pandas as pd
import numpy as np

In [2]:
weather_data_file = '../data/NOAA_weather_data/1992-2015_california_only_all.csv'
fire_data_file = '../data/USDA_wildfire_data/regridded_california_fires.csv'
training_data_file = '../data/training_data/1992-2015_training_data_only.csv'

In [3]:
weather = pd.read_csv(weather_data_file)
weather['time'] = pd.to_datetime(weather['time'])
weather['date'] = weather['time'].dt.date
weather['date'] = pd.to_datetime(weather['date'])

fires = pd.read_csv(fire_data_file)
fires['ignition'] = 1
fires['date'] = pd.to_datetime(fires['date'])

In [4]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28752480 entries, 0 to 28752479
Data columns (total 23 columns):
time         datetime64[ns]
lat          float64
lon          float64
air.sfc      float64
air.2m       float64
apcp         float64
crain        float64
rhum.2m      float64
dpt.2m       float64
pres.sfc     float64
uwnd.10m     float64
vwnd.10m     float64
veg          float64
dlwrf        float64
dswrf        float64
lcdc         float64
hcdc         float64
mcdc         float64
hpbl         float64
prate        float64
vis          float64
ulwrf.sfc    float64
date         datetime64[ns]
dtypes: datetime64[ns](2), float64(21)
memory usage: 4.9 GB


In [5]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189550 entries, 0 to 189549
Data columns (total 7 columns):
lat           189550 non-null float64
lon           189550 non-null float64
date          189550 non-null datetime64[ns]
time          110217 non-null float64
size          189550 non-null float64
size_class    189550 non-null object
ignition      189550 non-null int64
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 10.1+ MB


In [6]:
weather['time'].values

array(['1992-01-01T00:00:00.000000000', '1992-01-01T00:00:00.000000000',
       '1992-01-01T00:00:00.000000000', ...,
       '2015-12-31T21:00:00.000000000', '2015-12-31T21:00:00.000000000',
       '2015-12-31T21:00:00.000000000'], dtype='datetime64[ns]')

In [8]:
fires['date'].values

array(['2005-02-02T00:00:00.000000000', '2004-05-12T00:00:00.000000000',
       '2004-05-31T00:00:00.000000000', ...,
       '2015-05-02T00:00:00.000000000', '2015-10-14T00:00:00.000000000',
       '2015-03-14T00:00:00.000000000'], dtype='datetime64[ns]')

In [9]:
training_data = pd.merge(weather, fires, on=['lat', 'lon', 'date'], how='left')

In [10]:
training_data.head()

Unnamed: 0,time_x,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,...,mcdc,hpbl,prate,vis,ulwrf.sfc,date,time_y,size,size_class,ignition
0,1992-01-01,40.29749,-124.3408,284.35388,284.81824,0.0,0.0,88.3125,282.8904,100173.56,...,0.0,533.4578,-3e-06,20007.979,359.9375,1992-01-01,,,,
1,1992-01-01,38.96153,-123.5579,285.66638,286.25574,0.0,0.0,67.9375,280.4373,99073.56,...,0.0,671.8578,-3e-06,20007.979,353.875,1992-01-01,,,,
2,1992-01-01,39.2415,-123.6393,286.04138,286.81824,0.0,0.0,69.8125,281.35916,99573.56,...,0.0,786.9578,-3e-06,20007.979,349.4375,1992-01-01,,,,
3,1992-01-01,39.52163,-123.7215,286.04138,286.81824,0.0,0.0,69.8125,281.35916,99473.56,...,0.0,917.4578,-3e-06,20007.979,349.4375,1992-01-01,,,,
4,1992-01-01,39.80193,-123.8045,283.35388,284.06824,0.0,0.0,68.8125,278.55447,96573.56,...,0.0,833.7578,-3e-06,20007.979,341.625,1992-01-01,,,,


In [11]:
training_data['air.sfc'].isnull().sum()

0

In [12]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29039328 entries, 0 to 29039327
Data columns (total 27 columns):
time_x        datetime64[ns]
lat           float64
lon           float64
air.sfc       float64
air.2m        float64
apcp          float64
crain         float64
rhum.2m       float64
dpt.2m        float64
pres.sfc      float64
uwnd.10m      float64
vwnd.10m      float64
veg           float64
dlwrf         float64
dswrf         float64
lcdc          float64
hcdc          float64
mcdc          float64
hpbl          float64
prate         float64
vis           float64
ulwrf.sfc     float64
date          datetime64[ns]
time_y        float64
size          float64
size_class    object
ignition      float64
dtypes: datetime64[ns](2), float64(24), object(1)
memory usage: 6.1+ GB


In [13]:
training_data.rename(columns={'time_x':'weather_bin_time','time_y':'fire_discovery_time'}, inplace=True)
training_data.drop(['date'], axis=1, inplace=True)
training_data.head()

Unnamed: 0,weather_bin_time,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,...,hcdc,mcdc,hpbl,prate,vis,ulwrf.sfc,fire_discovery_time,size,size_class,ignition
0,1992-01-01,40.29749,-124.3408,284.35388,284.81824,0.0,0.0,88.3125,282.8904,100173.56,...,11.0,0.0,533.4578,-3e-06,20007.979,359.9375,,,,
1,1992-01-01,38.96153,-123.5579,285.66638,286.25574,0.0,0.0,67.9375,280.4373,99073.56,...,6.0,0.0,671.8578,-3e-06,20007.979,353.875,,,,
2,1992-01-01,39.2415,-123.6393,286.04138,286.81824,0.0,0.0,69.8125,281.35916,99573.56,...,6.0,0.0,786.9578,-3e-06,20007.979,349.4375,,,,
3,1992-01-01,39.52163,-123.7215,286.04138,286.81824,0.0,0.0,69.8125,281.35916,99473.56,...,6.0,0.0,917.4578,-3e-06,20007.979,349.4375,,,,
4,1992-01-01,39.80193,-123.8045,283.35388,284.06824,0.0,0.0,68.8125,278.55447,96573.56,...,6.0,0.0,833.7578,-3e-06,20007.979,341.625,,,,


In [14]:
training_data.describe()

Unnamed: 0,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,uwnd.10m,...,lcdc,hcdc,mcdc,hpbl,prate,vis,ulwrf.sfc,fire_discovery_time,size,ignition
count,29039330.0,29039330.0,29039330.0,29039330.0,29039330.0,29039330.0,29039330.0,29039330.0,29039330.0,29039330.0,...,29039330.0,29039330.0,29039330.0,29039330.0,29039330.0,29039330.0,29039330.0,781400.0,1333544.0,1333544.0
mean,37.12666,-119.5271,289.08,288.0071,0.1737499,0.0641655,49.18569,274.765,91530.56,1.057519,...,11.40367,16.41086,16.02698,1092.735,1.582629e-05,18419.71,401.2357,1383.964361,65.42902,1.0
std,2.558607,2.504259,13.7735,10.09739,0.9301872,0.2450475,26.19865,7.354758,6963.395,2.683459,...,26.523,27.32012,30.90598,871.092,9.346681e-05,4589.82,78.45877,492.268955,1922.461,0.0
min,32.68389,-124.3408,242.2595,246.5075,0.0,0.0,1.0,192.1248,66243.55,-20.30144,...,0.0,0.0,0.0,61.94956,-4.861644e-05,3.608097,195.0625,0.0,0.001,1.0
25%,34.99029,-121.4709,279.1194,280.9448,0.0,0.0,26.69481,269.9836,87153.34,-0.717453,...,0.0,0.0,0.0,375.0546,-2.251263e-06,20006.06,345.3125,1115.0,0.1,1.0
50%,36.83767,-119.9051,286.7109,287.0564,0.0,0.0,45.53017,275.3505,92335.06,1.052383,...,0.0,4.0,0.0,878.2482,0.0,20007.69,381.5,1429.0,0.25,1.0
75%,39.2597,-117.4117,297.3691,294.7252,0.001921475,0.0,71.23541,280.2059,97079.84,2.734268,...,0.0,18.0,13.0,1552.788,3.701515e-06,20010.16,443.6875,1716.0,1.0,1.0
max,41.97182,-114.3645,338.4141,323.3755,52.41433,1.0,100.0312,299.3494,103585.2,21.56605,...,100.0,100.0,100.0,7985.581,0.005932878,20025.81,733.875,2359.0,280059.0,1.0


In [15]:
training_data.to_csv(training_data_file, index=False)