In [6]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

#### Step 1. Cleaning, Feature Engineering and Encoding

In [7]:
def feature_engineering_external_data(external_data: pd.DataFrame) -> pd.DataFrame:
    '''
    Aggregate the external_data dataset to one row per (STATE_CODE, Date)
    and derive additional features from the aggregated values.

    Input:
    external_data (pd.DataFrame): frame holding the columns STATE_CODE, Date,
        tmax, tmin, prcp, Median Age, Total Population and
        Average Household Size

    Output:
    (pd.DataFrame): one row per (STATE_CODE, Date) with the mean / max / min
        of tmax, tmin and prcp, the mean of the three demographic columns,
        and delta_t (tmax_mean - tmin_mean)
    '''
    # weather measures get mean, max and min -> "<measure>_<stat>"
    weather_stats = {
        f"{measure}_{stat}": (measure, stat)
        for measure in ("tmax", "tmin", "prcp")
        for stat in ("mean", "max", "min")
    }
    # demographic fields only get their mean -> "<field>_mean"
    demographic_stats = {
        f"{field}_mean": (field, "mean")
        for field in ("Median Age", "Total Population", "Average Household Size")
    }

    # named aggregation yields flat, explicitly named output columns
    daily_by_state = (
        external_data
        .groupby(["STATE_CODE", "Date"])
        .agg(**weather_stats, **demographic_stats)
        .reset_index()
    )

    # spread between the mean daily maximum and the mean daily minimum
    daily_by_state["delta_t"] = daily_by_state["tmax_mean"] - daily_by_state["tmin_mean"]

    return daily_by_state


def feature_engineering_fires(fires: pd.DataFrame, fires_days: pd.DataFrame) -> pd.DataFrame:
    '''
    Count fires per (DISCOVERY_DATE, STATE) and compute lagged fire counts.

    Input:
    fires (pd.DataFrame): one row per fire, with the columns FOD_ID,
        DISCOVERY_DATE and STATE
    fires_days (pd.DataFrame): the (DISCOVERY_DATE, STATE) rows for which
        features must be produced; only these two columns are read

    Output:
    (pd.DataFrame): one row per row of fires_days, in the same order, with
        - FIRE_COUNT: number of fires on that day in that state (0 if none)
        - FIRE_COUNT_Y1_D1: FIRE_COUNT 365 rows earlier for the same state
        - FIRE_COUNT_Y1_W1: sum of the 7 rows ending 365 rows earlier
        - FIRE_COUNT_Y1_M1: sum of the 30 rows ending 365 rows earlier
        The lagged columns are NaN where not enough history exists.

    NOTE: the lag is positional (rows), not calendar based. It equals
    "one year ago" only if every state has exactly one row per day with no
    gaps -- TODO confirm against fires_days; leap years shift it by one day.
    '''
    lag_rows = 365
    month_window = 30
    week_window = 7
    keys = ["DISCOVERY_DATE", "STATE"]

    # count the number of fire per day and state
    fires_count = fires.groupby(keys).agg({"FOD_ID": "count"}).reset_index()
    fires_count.columns = ["DISCOVERY_DATE", "STATE", "FIRE_COUNT"]

    # left-join on the requested (date, state) pairs so that days without
    # any recorded fire are kept with a count of 0
    combinations = fires_days[keys].copy()
    fires_count = pd.merge(combinations, fires_count, how="left", on=keys)
    fires_count["FIRE_COUNT"] = fires_count["FIRE_COUNT"].fillna(0).astype("int")

    # The shifts below are positional, so they must run on chronologically
    # ordered rows. Sort a view for the computation only; results are written
    # back by index so the caller-visible row order is left untouched.
    chronological = fires_count.sort_values("DISCOVERY_DATE", kind="mergesort")
    per_state = chronological.groupby(["STATE"])["FIRE_COUNT"]

    # number of fires at t - 1 year, on a 1 month window, 1 week window and 1 day window
    fires_count["FIRE_COUNT_Y1_M1"] = per_state.transform(
        lambda counts: counts.shift(lag_rows).rolling(month_window).sum()
    )
    fires_count["FIRE_COUNT_Y1_W1"] = per_state.transform(
        lambda counts: counts.shift(lag_rows).rolling(week_window).sum()
    )
    fires_count["FIRE_COUNT_Y1_D1"] = per_state.shift(lag_rows)
    return fires_count


def feature_engineering_all(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Add calendar features, drop unused columns, fill na values and
    encode the STATE categorical variable.

    The input frame is left untouched; all changes are made on a copy.

    Input:
    df (pd.DataFrame): merged frame holding DISCOVERY_DATE (datetime), STATE,
        Date, STATE_CODE, FIRE_COUNT and every column listed in cols_fillna

    Output:
    (pd.DataFrame): copy of df with
        - DISCOVERY_DOW, DISCOVERY_MONTH, DISCOVERY_DAY added
        - Date, STATE_CODE and FIRE_COUNT removed
        - missing values of the cols_fillna columns replaced by the column mean
        - one indicator column per distinct STATE value appended
          (the original STATE column is kept)
    '''
    # work on a copy so the caller's DataFrame is not modified as a side effect
    df = df.copy()

    # add dates
    discovery = df["DISCOVERY_DATE"].dt
    df["DISCOVERY_DOW"] = discovery.dayofweek
    df["DISCOVERY_MONTH"] = discovery.month
    df["DISCOVERY_DAY"] = discovery.day

    # drop unused columns
    df = df.drop(columns=["Date", "STATE_CODE", "FIRE_COUNT"])

    # fillna kpis from external data
    cols_fillna = [
        "tmax_mean", "tmax_max", "tmax_min", "tmin_mean",
        "tmin_max", "tmin_min", "prcp_mean",
        "prcp_max", "prcp_min", "delta_t",
        "Median Age_mean", "Total Population_mean",
        "Average Household Size_mean", "FIRE_COUNT_Y1_M1",
        "FIRE_COUNT_Y1_W1", "FIRE_COUNT_Y1_D1"
    ]
    # each column is filled with its own mean (computed over non-missing rows)
    df[cols_fillna] = df[cols_fillna].fillna(df[cols_fillna].mean())

    # encode state
    data_cat = pd.get_dummies(df["STATE"])
    df = pd.concat([df, data_cat], axis=1)

    return df

In [8]:
# Build the model input table: load the three inputs, engineer features,
# merge everything onto the training rows and write the result to disk.

# train_set: supplies the (STATE, DISCOVERY_DATE) rows that every later
# merge is keyed on
fires_days = pd.read_csv("./data/1_raw/fires/fires_days_train.csv", parse_dates=["DISCOVERY_DATE"])

# external_data: aggregated to one row per (STATE_CODE, Date) by
# feature_engineering_external_data
external_data = pd.read_csv("./data/2_clean/external_data.csv", parse_dates=["Date"])
external_data = feature_engineering_external_data(external_data)

# fires data: counted per (DISCOVERY_DATE, STATE) and lagged; must run before
# fires_days is overwritten by the merges below, since it reads the raw rows
fires = pd.read_csv("./data/2_clean/fires.csv", parse_dates=["DISCOVERY_DATE"])
fires_count = feature_engineering_fires(fires, fires_days)

# merge: left joins keep every training row; STATE is matched against
# STATE_CODE and DISCOVERY_DATE against Date for the external data
fires_days = pd.merge(fires_days, external_data, how="left", left_on=["STATE", "DISCOVERY_DATE"], right_on=["STATE_CODE", "Date"])
fires_days = pd.merge(fires_days, fires_count, how="left", on=["STATE", "DISCOVERY_DATE"])

# cleaning and encoding: calendar features, column drops, mean imputation
# and STATE indicator columns
fires_days = feature_engineering_all(fires_days)

# save to csv format (row index is not written)
fires_days.to_csv("./data/3_input_model/input_model.csv", index=False)