In [1]:
import pandas as pd

In [2]:
# Input: CSV holding the 1992-1997 training observations (relative to the
# notebook's working directory).
data_file = '../data/training_data/1992-1997_training_data.csv'
# Output: destination for the table produced by the rolling-window mean.
output_file_rolling_window = '../data/training_data/1992-1997_training_data_rolling_window.csv'
# Output: destination for the table averaged per location and calendar day.
output_file_daily_mean = '../data/training_data/1992-1997_training_data_daily_mean.csv'

In [3]:
# Read the training table. low_memory=False makes pandas parse the file in a
# single pass so each column's dtype is inferred from the whole column.
data = pd.read_csv(data_file, low_memory=False)

# Noxels with no fire have a missing ignition value; label them 0.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the column selection: that chained in-place form triggers a warning in
# pandas 2.x and leaves `data` unmodified when Copy-on-Write is active.
data['ignition'] = data['ignition'].fillna(0)

# Count the observations in each class. Summing the boolean mask avoids
# materialising a filtered copy of the whole frame just to take its length.
ignition_count = int((data['ignition'] == 1).sum())
no_ignition_count = len(data) - ignition_count
print('Total observations: {:.1E}'.format(len(data)))
print('Ignitions count: {:.1E}'.format(ignition_count))
print('Non ignitions count: {:.1E}'.format(no_ignition_count))

Total observations: 7.3E+06
Ignitions count: 3.7E+05
Non ignitions count: 6.9E+06


OK, so before we dig into this much further, I am noticing an issue. For some fires we know the discovery time and for some we do not. I can imagine two ways of dealing with this:

1. Throw out fires for which we do not know the discovery time.
2. Average everything at the day resolution level.
I am leaning toward option two because:

- Discovery time is not ignition time, so the weather state during that exact hour may not be pertinent anyway. For example: what if the fire had already been burning for hours and the temperature had changed drastically?
- The data is already skewed toward no-ignition observations, so I would rather not throw out even more fires. Also, doing so would leave noxels where a fire occurred that is not represented in the dataset.
The specific plan is to roll a daily moving average across the data, including the ignition value. This will assign an ignition value to all noxels within 24 hr of a fire's discovery.

In [4]:
# Roll a trailing 24-hour mean over every column, separately per (lat, lon).
#
# The window is a time span rather than a row count: an integer window of 24
# means "24 rows", which only equals 24 hours if every location has exactly one
# row per hour with no gaps. With a time-based window pandas defaults to
# min_periods=1, so the earliest rows of each location are averaged over the
# observations available so far instead of becoming NaN.
rolling_span = pd.Timedelta(hours=24)

# Prepare the input without mutating `data`, so this cell can be re-run:
#  - the fire-description columns are not part of the output and may be
#    non-numeric, so they are removed before averaging rather than after;
#  - timestamps are parsed first because a time-based window needs a
#    datetime column, and rows with no timestamp cannot be placed in a window;
#  - each location's rows are put in chronological order, which the rolling
#    window relies on.
data_sorted = (
    data
    .drop(columns=['fire_discovery_time', 'size', 'size_class'])
    .assign(weather_bin_time=lambda df: pd.to_datetime(df['weather_bin_time']))
    .dropna(subset=['weather_bin_time'])
    .sort_values(['lat', 'lon', 'weather_bin_time'])
)

# groupby().rolling() returns the group keys as index levels; depending on the
# pandas version they may or may not also remain as columns. Remove any column
# copies and restore lat/lon from the index so they are never lost or duplicated.
data_rolling_window = (
    data_sorted
    .groupby(['lat', 'lon'])
    .rolling(rolling_span, on='weather_bin_time')
    .mean()
    .drop(columns=['lat', 'lon'], errors='ignore')
    .reset_index(level=['lat', 'lon'])
    .reset_index(drop=True)
)

# clean up dataframe: replace the timestamp with calendar day/month/year
# columns, then discard rows that still contain missing values
bin_time = data_rolling_window['weather_bin_time']
data_rolling_window = (
    data_rolling_window
    .assign(
        weather_bin_day=bin_time.dt.day,
        weather_bin_month=bin_time.dt.month,
        weather_bin_year=bin_time.dt.year,
    )
    .drop(columns=['weather_bin_time'])
    .dropna()
)

print('Total observations: {:.1E}'.format(len(data_rolling_window)))

Total observations: 7.3E+06


In [5]:
# Write the rolling-window table to disk; index=False keeps the DataFrame
# index out of the file.
data_rolling_window.to_csv(path_or_buf=output_file_rolling_window, index=False)

In [7]:
# Collapse the rolling-window table to one row per location and calendar day
# by averaging every remaining column within each group.
daily_keys = [
    'lat',
    'lon',
    'weather_bin_year',
    'weather_bin_month',
    'weather_bin_day',
]

# After groupby().mean() the grouping keys live only in the index.
# reset_index() turns them back into ordinary columns; reset_index(drop=True)
# would discard them, leaving the saved table with no location or date.
data_daily_mean = (
    data_rolling_window
    .groupby(daily_keys)
    .mean()
    .reset_index()
)

print('Total observations: {:.1E}'.format(len(data_daily_mean)))

Total observations: 9.0E+05


In [8]:
# Write the daily-mean table to disk; index=False keeps the DataFrame index
# out of the file.
data_daily_mean.to_csv(path_or_buf=output_file_daily_mean, index=False)