In [5]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from utils import check_duplicates

#### Step 1. Cleaning, Feature Engineering and Encoding

In [20]:
# train_set: one row per (DISCOVERY_DATE, STATE)
fires_days = pd.read_csv(
    "./data/1_raw/fires/fires_days_train.csv",
    parse_dates=["DISCOVERY_DATE"],
)

# external_data: mean / max / min of tmax, tmin and prcp per (STATE_CODE, Date)
external_data = (
    pd.read_csv("./data/2_clean/external_data.csv", parse_dates=["Date"])
    .groupby(["STATE_CODE", "Date"])
    .agg(
        {
            "tmax": ["mean", "max", "min"],
            "tmin": ["mean", "max", "min"],
            "prcp": ["mean", "max", "min"],
        }
    )
    .reset_index()
)

# flat names replacing the two-level (variable, statistic) header produced by agg
kpis = [
    "tmax_mean", "tmax_max", "tmax_min",
    "tmin_mean", "tmin_max", "tmin_min",
    "prcp_mean", "prcp_max", "prcp_min",
]
external_data.columns = ["STATE_CODE", "Date", *kpis]

# merge: left join keeps every fires_days row, matched or not
fires_days = fires_days.merge(
    external_data,
    how="left",
    left_on=["STATE", "DISCOVERY_DATE"],
    right_on=["STATE_CODE", "Date"],
)

In [21]:
# Display the merged frame. Because the merge above is a left join, rows whose
# (STATE, DISCOVERY_DATE) has no match in external_data keep NaN / NaT in
# STATE_CODE, Date and all aggregated weather columns.
fires_days

Unnamed: 0,DISCOVERY_DATE,STATE,FIRE,STATE_CODE,Date,tmax_mean,tmax_max,tmax_min,tmin_mean,tmin_max,tmin_min,prcp_mean,prcp_max,prcp_min
0,2011-01-01,AK,0.0,,NaT,,,,,,,,,
1,2011-01-01,MN,0.0,MN,2011-01-01,14.0,26.0,7.0,0.333333,1.0,0.0,0.006667,0.02,0.00
2,2011-01-01,MI,0.0,MI,2011-01-01,54.0,55.0,53.0,20.500000,24.0,19.0,0.182500,0.46,0.07
3,2011-01-01,MO,1.0,MO,2011-01-01,27.0,27.0,27.0,11.000000,11.0,11.0,0.000000,0.00,0.00
4,2011-01-01,IL,0.0,IL,2011-01-01,38.0,41.0,35.0,16.250000,21.0,13.0,0.000000,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94947,2015-12-31,PR,,,NaT,,,,,,,,,
94948,2015-12-31,RI,,,NaT,,,,,,,,,
94949,2015-12-31,VT,,,NaT,,,,,,,,,
94950,2015-12-31,MA,,MA,2015-12-31,43.5,47.0,40.0,32.500000,33.0,32.0,0.080000,0.08,0.08


In [35]:
def cleaning(fires: pd.DataFrame) -> pd.DataFrame:
    '''
    Drop the columns that are not kept and fill missing values in selected numeric columns.

    Input:
    fires (pd.DataFrame): input DataFrame; it must contain every dropped and every filled column

    Output:
    (pd.DataFrame): new DataFrame without the dropped columns, where missing values of the
                    filled columns are replaced by the mean of their own column
    '''
    # columns removed before modelling
    unused_columns = [
        "FIRE_CODE",
        "FIRE_NAME",
        "DISCOVERY_TIME",
        "CONT_DATE",
        "CONT_TIME",
        "FIRE_SIZE_CLASS",
        "COUNTY",
        "FIPS_CODE",
        "FIPS_NAME",
        "CLOSEST_CITY",
        "State",
        "STATE_CODE",
        "DURATION",
        "CAUSE_DESCR",
    ]
    # drop returns a new frame, so the assignments below do not touch the input
    cleaned = fires.drop(columns=unused_columns)

    # mean imputation, one column at a time
    for column in ("tmax", "tmin", "prcp", "Total Population", "Average Household Size", "Median Age"):
        column_mean = cleaned[column].mean()
        cleaned[column] = cleaned[column].fillna(column_mean)

    return cleaned


def feature_engineering(fires: pd.DataFrame) -> pd.DataFrame:
    '''
    Compute new features based on the original features from the Fires dataset.

    The discovery date is split into three integer features and then removed:
    DISCOVERY_DOW (day of week as given by pandas `.dt.dayofweek`, Monday = 0),
    DISCOVERY_MONTH and DISCOVERY_DAY (day of the month).

    The input DataFrame is left untouched: the new columns are built with `assign`,
    which works on a copy, instead of being written into the caller's frame.

    Input:
    fires (pd.DataFrame): input DataFrame; "DISCOVERY_DATE" must be a datetime column
                          (the `.dt` accessor fails otherwise)

    Output:
    (pd.DataFrame):  new DataFrame with the additional features and without "DISCOVERY_DATE"
    '''
    # dates
    discovery = fires["DISCOVERY_DATE"]
    with_date_parts = fires.assign(
        DISCOVERY_DOW=discovery.dt.dayofweek,
        DISCOVERY_MONTH=discovery.dt.month,
        DISCOVERY_DAY=discovery.dt.day,
    )

    # the raw timestamp is not kept once its parts have been extracted
    return with_date_parts.drop(columns=["DISCOVERY_DATE"])


def encoding(fires: pd.DataFrame) -> pd.DataFrame:
    '''
    One-hot encode the "STATE" column.

    Input:
    fires (pd.DataFrame): input DataFrame containing a "STATE" column

    Output:
    (pd.DataFrame):  new DataFrame where "STATE" is replaced by one indicator column
                     per distinct state value, appended after the existing columns
    '''
    # one indicator column per state, named after the state value itself
    state_dummies = pd.get_dummies(fires["STATE"])

    # append the indicators, then remove the original categorical column
    return pd.concat([fires, state_dummies], axis=1).drop(columns=["STATE"])

In [36]:
# read the merged file, run the three preparation steps in order, then save the model input
fires = (
    pd.read_csv(
        "./data/3_merge/merged_data_haversine.csv",
        parse_dates=["DISCOVERY_DATE", "CONT_DATE"],
    )
    .pipe(cleaning)
    .pipe(feature_engineering)
    .pipe(encoding)
)
fires.to_csv("./data/4_input_model/model_1.csv", index=False)

  interactivity=interactivity, compiler=compiler, result=result)
