I use the preprocessed dataset with the maximum number of features. The idea is that you have access to the full extent of the raw data with only minimal assumptions/cleaning, e.g. some columns with > 95% NaN were removed, some dates were cleaned and merged, and some categorical features were merged across years.

In [None]:
# @title Downloads, imports and config
!git clone https://github.com/benjamrio/wildfires.git
!pip install -r scikit-optimize
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
RANDOM_SEED = 42

Cloning into 'wildfires'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 58 (delta 16), reused 57 (delta 15), pack-reused 0[K
Receiving objects: 100% (58/58), 15.95 MiB | 8.47 MiB/s, done.
Resolving deltas: 100% (16/16), done.
Updating files: 100% (19/19), done.
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'scikit-optimize'[0m[31m
[0m

## Load Data

In [None]:
# Read the preprocessed report-level table; the first CSV column becomes the index.
df = pd.read_csv(
    "wildfires/data/preprocessed/full_dataset.csv",
    index_col=0,
    low_memory=False,
)

# Row-wise sum over the columns at positions 58..127
# (presumably the personnel_* block -- TODO confirm against the column list).
df['total_personnel'] = df.iloc[:, 58:128].sum(axis=1)

# Labels of the columns from position 58 up to, but excluding, the last 16 columns.
# The slice is taken after `total_personnel` was appended, so that column counts
# among the 16 excluded ones.
treatment_features = df.columns[58:-16]

print(f"{df.shape[1]} features\n{df.fire_id.nunique()} fires\n{df.shape[0]} reports")

214 features
5906 fires
38505 reports


In [None]:
# Display the derived per-report personnel total (pandas truncates the repr).
df.loc[:, 'total_personnel']

283       45
282       45
284       33
285        0
287       21
        ... 
37379    136
37375    136
37376    109
37378    109
37377    109
Name: total_personnel, Length: 38505, dtype: int64

## Watching some fires


In [None]:
# @title Filtering the data { display-mode: "form" }
import ipywidgets as widgets
import plotly.express as px
import numpy as np
import pandas as pd
from IPython.display import display

# Placeholder for the current selection; `interactive_filter` rebinds it on every
# widget change. A `global` statement is only meaningful inside a function, so the
# module-level one was dropped.
filtered_df = pd.DataFrame()


def interactive_filter(num_obs, log_area_range, reports_range, cause):
    """Rebuild the module-level ``filtered_df`` from the current widget values.

    Filters are applied in sequence -- cause, number of reports per fire, maximum
    area per fire -- and then at most ``num_obs`` of the remaining fires are kept.

    Args:
        num_obs: Maximum number of fires to keep. When more fires qualify, that
            many are drawn without replacement with ``np.random.choice``.
        log_area_range: ``(low, high)`` base-10 exponents; a fire qualifies when
            its maximum ``area`` lies in ``[10**low, 10**high]``.
        reports_range: ``(low, high)`` inclusive bounds on the number of rows
            (reports) a fire has.
        cause: Selected cause labels. The cause filter is skipped when
            ``'No selection'`` is among them; otherwise labels are cast to ``int``
            and matched against ``cause_id``.

    Returns:
        None. The selection is stored in the global ``filtered_df``.
    """
    global filtered_df
    area_low, area_high = np.power(10, log_area_range)
    reports_low, reports_high = reports_range
    candidates = df.copy()

    # 1) Cause: drop rows without a cause, then keep the selected cause ids.
    if 'No selection' not in cause:
        wanted_causes = [int(label) for label in cause]
        candidates = candidates[candidates.cause_id.notna()]
        candidates = candidates[candidates['cause_id'].astype(int).isin(wanted_causes)]

    # 2) Number of reports per fire, counted after the cause filter.
    reports_per_fire = candidates.groupby('fire_id').size()
    within_reports = (reports_per_fire >= reports_low) & (reports_per_fire <= reports_high)
    candidates = candidates[candidates.fire_id.isin(reports_per_fire[within_reports].index)]

    # 3) Largest reported area per fire.
    peak_area = candidates.groupby('fire_id')['area'].max()
    within_area = (peak_area >= area_low) & (peak_area <= area_high)
    eligible = peak_area[within_area]
    candidates = candidates[candidates.fire_id.isin(eligible.index)]

    # 4) Keep every eligible fire if there are fewer than requested, else sample.
    if len(eligible) < num_obs:
        picked_fires = eligible.index
    else:
        picked_fires = np.random.choice(eligible.index, size=num_obs, replace=False)

    filtered_df = candidates[candidates.fire_id.isin(picked_fires)]

# Style dict shared by the three sliders below.
slider_style = {'description_width': 'initial'}

# Multi-select for the cause ids; 'No selection' disables the cause filter.
cause_slider = widgets.SelectMultiple(
    description='Cause:',
    options=['No selection', '1', '2', '3', '4'],
    value=['No selection'],
    disabled=False,
)

# Upper bound on the number of fires kept in `filtered_df`.
num_obs_slider = widgets.IntSlider(
    description='Nb of Observations:',
    value=10,
    min=1,
    max=100,
    step=1,
    style=slider_style,
)

# Range of base-10 exponents for the per-fire maximum area.
log_area_range_slider = widgets.FloatRangeSlider(
    description='Log Area range:',
    value=[5, 6],
    min=0,
    max=7,
    step=0.01,
    readout_format='.2f',
    style=slider_style,
)

# Range for the number of reports per fire.
reports_range_slider = widgets.IntRangeSlider(
    description='Nb of reports range:',
    value=[10, 100],
    min=1,
    max=150,
    step=1,
    style=slider_style,
)

# Bind each widget to the matching keyword argument of `interactive_filter`.
interactive_output = widgets.interactive_output(
    interactive_filter,
    dict(
        num_obs=num_obs_slider,
        log_area_range=log_area_range_slider,
        reports_range=reports_range_slider,
        cause=cause_slider,
    ),
)

panel_children = [
    cause_slider,
    log_area_range_slider,
    reports_range_slider,
    num_obs_slider,
    interactive_output,
]
display(widgets.VBox(panel_children))


VBox(children=(SelectMultiple(description='Cause:', index=(0,), options=('No selection', '1', '2', '3', '4'), …

In [None]:
# @title Fire area evolution : modify variables above and execute this cell { display-mode: "form" }
import plotly.express as px

# One line per fire in the current selection: area against time since the first report.
axis_titles = {'time_to_first_report': 'Days to First Report', 'area': 'Area'}

# The plotly update_* methods return the figure, so the styling steps are chained.
fig = (
    px.line(
        filtered_df,
        x='time_to_first_report',
        y='area',
        color='fire_id',
        title='Area Over Time by Fire ID',
        labels=axis_titles,
        template='plotly_white',
    )
    .update_traces(
        mode='markers+lines',
        marker=dict(size=10, opacity=0.5, line=dict(width=2, color='DarkSlateGrey')),
    )
    .update_layout(
        xaxis_title=axis_titles['time_to_first_report'],
        yaxis_title=axis_titles['area'],
        legend_title='Fire ID',
        font=dict(family="Arial, sans-serif", size=12),
        margin=dict(l=60, r=60, t=50, b=50),
    )
    .update_xaxes(showgrid=True, gridwidth=1)
    .update_yaxes(showgrid=True, gridwidth=1)
)
fig.show()

In [None]:
# Look at the last reports of one hand-picked fire id.
fire = 2867919
fire_df = df.loc[df.fire_id == fire]
fire_df.tail()

Unnamed: 0,ID,fire_id,INCIDENT_NUMBER,DONWCGU_PROT_UNIT_IDENTIFIER,INCIDENT_NAME,CAUSE_IDENTIFIER,DISCOVERY_DATE,POO_LATITUDE,POO_LONGITUDE,POO_STATE_CODE,...,next_date,prev_date_diff,next_date_diff,prev_area_diff,next_area_diff,prev_derivate,next_derivate,will_grow,time_to_first_report,total_personnel
7843,16637,2867919,157,1525573.0,NORTH STAR,1536117.0,2015-08-13 17:28:00,48.338056,-119.001667,53.0,...,2015-09-27 16:00:00,1.5,1.75,0.0,0.0,0.0,0.0,False,42.5,326
7846,16637,2867919,157,1525573.0,NORTH STAR,1536117.0,2015-08-13 17:28:00,48.338056,-119.001667,53.0,...,2015-09-28 15:00:00,1.75,0.958333,0.0,0.0,0.0,0.0,False,44.25,157
7847,16637,2867919,157,1525573.0,NORTH STAR,1536117.0,2015-08-13 17:28:00,48.338056,-119.001667,53.0,...,2015-09-29 15:45:00,0.958333,1.03125,0.0,0.0,0.0,0.0,False,45.208333,154
7848,16637,2867919,157,1525573.0,NORTH STAR,1536117.0,2015-08-13 17:28:00,48.338056,-119.001667,53.0,...,2015-11-30 21:30:00,1.03125,62.239583,0.0,0.0,0.0,0.0,False,46.239583,155
7849,16637,2867919,157,1525573.0,NORTH STAR,1536117.0,2015-08-13 17:28:00,48.338056,-119.001667,53.0,...,,62.239583,,0.0,,0.0,,False,108.479167,155


In [None]:
 """WIP"""
# Scratch note kept from the original cell: (58 + df.shape[1] - 16) / 2 is the midpoint
# of the positional slice 58:-16 that defines `treatment_features`. It used to be a bare
# expression whose value was discarded, so it now lives in this comment.

# Boolean mask over the treatment features: True where the column sums to zero
# over all reports of the fire currently held in `fire_df`.
unused_resources = fire_df.loc[:, treatment_features].sum(axis=0) == 0

# Count the True entries. len() of the mask always equals the number of treatment
# features, whatever the fire, so it said nothing about unused resources.
print(f"{int(unused_resources.sum())} of {len(unused_resources)} treatment features unused")


140


In [None]:
# Per-feature totals for the fire at position 7 of `df.fire_id.unique()`.
fire_df = df.loc[df.fire_id == df.fire_id.unique()[7]].copy()
fire_df[treatment_features].sum()

personnel_ALS Ambulance              0
personnel_Aerial Apparatus           0
personnel_Air Attack                 0
personnel_Air Supply Truck           0
personnel_Airtanker, Type 1          0
                                    ..
quantity_Truck, Hazmat               0
quantity_Underwater SAR              0
quantity_Urban SAR Team              0
quantity_Water Purification Plant    0
quantity_Water Rescue Team           0
Length: 140, dtype: int64

In [None]:
import plotly.graph_objs as go
import plotly.subplots as sp
import warnings

# Silence pandas PerformanceWarning messages for the rest of the session.
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)


def create_figure_for_fire(fire_index):
    """Build a dual-axis figure of total personnel and area over time for one fire.

    The "Area vs suppression" cell further down calls a function with this name
    and signature, so the plotting logic is exposed here as a reusable function.

    Args:
        fire_index: Position of the fire inside ``df.fire_id.unique()``.

    Returns:
        A plotly figure with ``total_personnel`` on the primary y-axis and
        ``area`` on the secondary y-axis, both against ``time_to_first_report``.
    """
    selected = df[df.fire_id == df.fire_id.unique()[fire_index]]
    marker_style = dict(size=10, opacity=0.2, line=dict(width=2, color='DarkSlateGrey'))

    figure = sp.make_subplots(specs=[[{"secondary_y": True}]])

    # Primary axis: the aggregated personnel column. The previous version passed the
    # whole `fire_df[treatment_features]` frame (one column per feature) as `y`, which
    # is not a single series and did not match the figure title.
    figure.add_trace(
        go.Scatter(
            x=selected['time_to_first_report'],
            y=selected['total_personnel'],
            mode='markers+lines',
            name='Total personnel',
            marker=marker_style,
        ),
        secondary_y=False,
    )

    # Secondary axis: reported area.
    figure.add_trace(
        go.Scatter(
            x=selected['time_to_first_report'],
            y=selected['area'],
            mode='markers+lines',
            name='Area',
            marker=marker_style,
        ),
        secondary_y=True,
    )

    figure.update_layout(
        title='Total personnel and Area over Time (beware of the scale)',
        xaxis_title='Time to First Report',
        template='plotly_white',
    )
    figure.update_yaxes(title_text="Total personnel", secondary_y=False)
    figure.update_yaxes(title_text="Area", secondary_y=True)
    return figure


# Fire shown by default; `fire_df` is still bound here as it was before.
fire_index = 1
fire_df = df[df.fire_id == df.fire_id.unique()[fire_index]].copy()

fig = create_figure_for_fire(fire_index)
fig.show()


In [None]:
# @title Area vs suppression
# NOTE(review): this cell uses `create_figure_for_fire` and `fire_index`, which it does
# not define itself. Both must be provided by an earlier cell; verify with
# Restart Kernel -> Run All.
def update_fire_index(figure, direction, total_fires):
    """Move the displayed fire by `direction` positions and redraw `figure`.

    The current position is read from the last space-separated token of the
    figure title, shifted by `direction`, and clamped to [0, total_fires - 1].
    A new figure is built for that position, titled 'Fire <position>', and
    applied to `figure` via `figure.update`.
    """
    print(direction)  # NOTE(review): looks like leftover debug output
    # The title is expected to end with the positional index of the fire.
    current_index = figure.layout.title.text.split(' ')[-1]  # Assuming title ends with fire index
    new_index = int(current_index) + direction
    # Clamp to the valid range of positions.
    if new_index < 0:
        new_index = 0
    elif new_index >= total_fires:
        new_index = total_fires - 1

    new_fig = create_figure_for_fire(new_index)
    new_fig.layout.title.text = f'Fire {new_index}'
    figure.update(new_fig)

total_fires = len(df.fire_id.unique())

fig = create_figure_for_fire(fire_index)
# NOTE(review): the title is set to the fire *id* here, whereas `update_fire_index`
# parses the last title token as a *positional index* (and writes the index back).
# The two conventions disagree.
fig.layout.title.text = f'Fire {df.fire_id.unique()[fire_index]}'

# NOTE(review): both buttons use method="relayout". Presumably Plotly applies `args`
# as layout updates in the browser and cannot call the Python function
# `update_fire_index` -- confirm against the updatemenus documentation. If so, an
# ipywidgets button with an `on_click` callback would be needed instead.
fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=[
                dict(
                    args=[{'method': 'update_fire_index', 'direction': -1, 'total_fires': total_fires}],
                    label="<",
                    method="relayout"
                ),
                dict(
                    args=[{'method': 'update_fire_index', 'direction': 1, 'total_fires': total_fires}],
                    label=">",
                    method="relayout"
                )
            ],
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.15,
            yanchor="top"
        )
    ]
)

fig.show()


In [None]:
import pandas as pd
def resample_group(df_group):
    """Resample one fire's date-indexed reports to daily bins.

    Each daily bin is reduced with ``.first()``; days without a report produce
    a row of missing values.
    """
    return df_group.resample('D').first()


# Copy of the current selection with `date` parsed to datetime and used as the index.
temp_df = filtered_df.assign(date=pd.to_datetime(filtered_df['date'])).set_index('date')

# Resample fire by fire, drop the outer `fire_id` index level added by the groupby,
# and turn the remaining `date` index back into a column.
resampled_df = (
    temp_df
    .groupby('fire_id')
    .apply(resample_group)
    .reset_index(level=0, drop=True)
    .reset_index()
)



In [None]:
# @title Map
import pandas as pd
import plotly.express as px

# Animated scatter map of the resampled selection: one marker per fire at its
# POO_LATITUDE / POO_LONGITUDE, sized by total personnel, with one animation frame
# per distinct `time_to_first_report` value.
map_fig = px.scatter_mapbox(
    resampled_df,
    lat='POO_LATITUDE',
    lon='POO_LONGITUDE',
    size='total_personnel',
    hover_name='fire_id',
    animation_frame='time_to_first_report',
    zoom=3,
    title='Fire Locations',
)

# Final layout in a single call. The earlier 'light' style and the t=50 margin were
# immediately overwritten, so only the values that actually took effect are kept.
# NOTE(review): with a zero top margin the title may have no room to render -- confirm.
map_fig.update_layout(
    mapbox_style="open-street-map",
    margin=dict(l=0, r=0, t=0, b=0),
)

map_fig.show()


**TODO**

* plot area and treatment on same graph + deduce simpler treatment features

* create map and time animation (slider to study the propagation : spread, treatment each day)?

* study and delete long stationary (and sometimes sparse) report tails? for now date = mean("REPORT_FROM_DATE"+"REPORT_TO_DATE"), this could be improved

* create and select clusters : fires that grow fast, fires that don't grow, fires that get significantly smaller at some point, ...


## A few aggregated visualizations

In [None]:
# Heatmap of the pairwise correlations between treatment features, over all reports.
corrs = df[treatment_features].corr()
fig = px.imshow(corrs, height=800, width=800)
fig.show()

In [None]:
# Correlations among the last 14 columns. `numeric_only=True` skips non-numeric
# columns: the slice includes `next_date`, which is read from the CSV without date
# parsing, and recent pandas versions raise on such columns instead of dropping them.
corrs = df.iloc[:, -14:].corr(numeric_only=True)
# Round for readability and plot the rounded matrix. `rounded` was computed before
# but the unrounded `corrs` was passed to the heatmap.
rounded = np.around(corrs, decimals=2)
fig = px.imshow(rounded, width=800, height=800)
fig.show()





Initially, resources datasets had 3 variables : resource, resource_quantity, resource_personnel. I pivoted the table.

In [None]:
# Build the plotting frame in one chain: recompute total personnel, keep only reports
# with a known cause, and turn the cause id into a string label (via int) so that the
# scatter treats it as a category.
study = (
    df
    .assign(total_personnel=df.iloc[:, 58:128].sum(axis=1))
    .loc[lambda frame: frame.cause_id.notna(), :]
    .assign(cause_id=lambda frame: frame.cause_id.astype('int').astype('str'))
)

# Log-log scatter of area against total personnel, coloured by cause.
fig = px.scatter(
    study,
    x='area',
    y='total_personnel',
    color="cause_id",
    log_x=True,
    log_y=True,
)
fig.show()

## Prediction

In [None]:
from wildfires.train import split_X_y,split_train_test, create_task_dataset

In [None]:
# Columns used by the prediction task below.
# Categorical inputs.
cat_features = [
    "cause_id",
    "month",
    "year",
    "STATUS",
]
# Numerical inputs.
num_features = [
    "report_number",
    "area",
    "prev_area_diff",
]
# Column to predict.
target = "next_area_diff"
# Identifier columns; `fire_id` is also the grouping key of the train/test split.
id_cols = [
    "report_id",
    "fire_id",
]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Build the modelling table from the configured columns.
# NOTE(review): `create_task_dataset` comes from wildfires.train; presumably it returns
# the encoded dataset and the fitted categorical encoder -- confirm in that module.
task_df, encoder = create_task_dataset(df, id_cols, cat_features, num_features, target)
non_id_cols = [col for col in task_df.columns if col not in id_cols]
print(len(task_df))
# Drop rows with any missing value, since LinearRegression cannot handle NaN.
task_df = task_df.dropna()
print(f"After dropna: {len(task_df)}") # for Linear Regression
# Split by `fire_id` as the group key. The seed now comes from the notebook-wide
# RANDOM_SEED constant (same value, 42) instead of a second hard-coded literal.
train_df, test_df = split_train_test(
    task_df,
    test_size=0.2,
    group_id="fire_id",
    random_state=RANDOM_SEED,
    save_datasets=False,
)

Identified 4 categorical columns: 
['cause_id', 'month', 'year', 'STATUS']
32599
After dropna: 27966


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def compute_baseline_scores(train_df, test_df, target):
    """Score a constant predictor that always outputs the training mean of `target`.

    Args:
        train_df: Training frame; the baseline value is the mean of its `target` column.
        test_df: Test frame, scored against the same training mean.
        target: Name of the target column.

    Returns:
        dict with the keys 'train_rmse', 'train_mae', 'test_rmse' and 'test_mae'.
    """
    baseline_pred = train_df[target].mean()
    metrics = {}
    # `split_df` rather than `df`, so the notebook-level `df` is not shadowed.
    for split, split_df in (('train', train_df), ('test', test_df)):
        y_true = split_df[target]
        # Explicit float array: np.full_like would inherit the dtype of y_true and
        # truncate the mean if the target were stored as integers.
        y_pred = np.full(len(y_true), baseline_pred, dtype=float)
        # RMSE as sqrt(MSE). The `squared=False` flag is deprecated since scikit-learn
        # 1.4 and removed in later releases, while this form works on every version.
        metrics[f'{split}_rmse'] = float(np.sqrt(mean_squared_error(y_true, y_pred)))
        metrics[f'{split}_mae'] = mean_absolute_error(y_true, y_pred)
    return metrics

scores = compute_baseline_scores(train_df, test_df, target)
scores

{'train_rmse': 6490.2485739658605,
 'train_mae': 1074.3580099880985,
 'test_rmse': 7434.291388754063,
 'test_mae': 1261.212488271425}

In [None]:
# Separate features and target for each split, then fit ordinary least squares on
# the training part. `fit` returns the estimator, so creation and fitting are chained.
X_train, y_train = split_X_y(train_df, id_cols, target)
X_test, y_test = split_X_y(test_df, id_cols, target)
model = LinearRegression().fit(X_train, y_train)

def compute_scores(X_train, y_train, X_test, y_test, model):
    """Compute RMSE and MAE of a fitted model on the train and test splits.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Test features and target.
        model: Fitted estimator exposing `predict`.

    Returns:
        dict with the keys 'train_rmse', 'train_mae', 'test_rmse' and 'test_mae'.
    """
    scores = {}
    for split, features, y_true in (('train', X_train, y_train), ('test', X_test, y_test)):
        y_pred = model.predict(features)
        # RMSE as sqrt(MSE). The `squared=False` flag is deprecated since scikit-learn
        # 1.4 and removed in later releases, while this form works on every version.
        scores[f'{split}_rmse'] = float(np.sqrt(mean_squared_error(y_true, y_pred)))
        scores[f'{split}_mae'] = mean_absolute_error(y_true, y_pred)
    return scores

model_scores = compute_scores(X_train, y_train, X_test, y_test, model)
model_scores

{'train_rmse': 6477.658987265738,
 'train_mae': 1093.7707770266215,
 'test_rmse': 7427.729259888642,
 'test_mae': 1279.990270605334}