In [1]:
import pandas as pd

In [2]:
# CSV that this notebook reads, and CSV that it writes at the end.
input_data_file = '../data/training_data/1992-2015_training_data.csv'
output_data_file = '../data/training_data/1992-2015_training_data_raw.csv'

# Column dtypes handed to pd.read_csv: two columns are read as generic
# objects, every other listed column as float.
dtypes = {
    **dict.fromkeys(['weather_bin_time', 'size_class'], object),
    **dict.fromkeys(
        [
            'lat', 'lon',
            'air.sfc', 'air.2m', 'apcp', 'crain', 'rhum.2m', 'dpt.2m',
            'pres.sfc', 'uwnd.10m', 'vwnd.10m', 'veg',
            'dlwrf', 'dswrf', 'lcdc', 'hcdc', 'mcdc', 'hpbl', 'prate',
            'vis', 'ulwrf.sfc',
            'fire_discovery_time', 'size', 'ignition',
        ],
        float,
    ),
}

# Columns written to the output file, in output order.
output_features = (
    # Timestamp and its calendar parts.
    # NOTE: daily-average data has no time column, only day, month and year.
    ['weather_bin_time', 'weather_bin_month', 'weather_bin_year', 'weather_bin_day']
    # Measurement columns kept from the input.
    + ['air.2m', 'apcp', 'rhum.2m', 'dpt.2m', 'pres.sfc']
    + ['uwnd.10m', 'vwnd.10m', 'veg', 'vis']
    # Location.
    + ['lat', 'lon']
    # Ignition flag.
    + ['ignition']
)

In [3]:
# Load the input CSV into a DataFrame, forcing the column dtypes listed in `dtypes`.
data = pd.read_csv(filepath_or_buffer=input_data_file, dtype=dtypes)

In [4]:
# Noxels with no fire have a missing 'ignition' value; label them 0.
# The filled column is assigned back instead of calling
# fillna(inplace=True) on the data['ignition'] selection: that chained
# in-place form raises a FutureWarning in pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is enabled.
data['ignition'] = data['ignition'].fillna(0)

# Count the number of observations in each class. Summing the boolean mask
# avoids building a filtered copy of the frame just to take its length.
ignition_count = int((data['ignition'] == 1).sum())
no_ignition_count = len(data) - ignition_count
print('Total observations: {:.1E}'.format(len(data)))
print('Ignitions count: {:.1E}'.format(ignition_count))
print('Non ignitions count: {:.1E}'.format(no_ignition_count))

Total observations: 2.9E+07
Ignitions count: 1.3E+06
Non ignitions count: 2.8E+07


In [5]:
# Clean up the raw data.
# Parse the timestamp column and expose its calendar parts as separate columns.
data['weather_bin_time'] = pd.to_datetime(data['weather_bin_time'])
data['weather_bin_day'] = data['weather_bin_time'].dt.day
data['weather_bin_month'] = data['weather_bin_time'].dt.month
data['weather_bin_year'] = data['weather_bin_time'].dt.year

# Remove fire_discovery_time, size_class and size, then drop every row that
# still contains a missing value.
# NOTE(review): presumably these three columns are empty for non-fire rows and
# therefore have to go before dropna() - confirm.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone. `data` is rebound
# rather than mutated with inplace=True.
# dropna() checks all remaining columns, not only the ones exported later.
data = data.drop(columns=['fire_discovery_time', 'size_class', 'size'], errors='ignore')
data = data.dropna()

# Count the number of observations in each class. Summing the boolean mask
# avoids building a filtered copy of the frame just to take its length.
ignition_count = int((data['ignition'] == 1).sum())
no_ignition_count = len(data) - ignition_count
print('Total observations: {:.1E}'.format(len(data)))
print('Ignitions count: {:.1E}'.format(ignition_count))
print('Non ignitions count: {:.1E}'.format(no_ignition_count))

Total observations: 2.9E+07
Ignitions count: 1.3E+06
Non ignitions count: 2.8E+07


In [6]:
# Keep only the columns named in output_features (in that order) and write
# them to the output CSV without the row index.
output_data = data.loc[:, output_features]
output_data.to_csv(path_or_buf=output_data_file, index=False)