In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from utils import check_duplicates

#### Step 1. Cleaning, Feature Engineering and Encoding

In [3]:
def cleaning(fires: pd.DataFrame) -> pd.DataFrame:
    '''
    Remove a fixed set of columns and fill missing values in selected
    columns with that column's mean.

    The mean of each imputed column is computed over the rows of the
    DataFrame passed in. The input DataFrame itself is left untouched.

    Input:
    fires (pd.DataFrame): input DataFrame; must contain every column that is
        dropped or imputed below, otherwise a KeyError is raised

    Output:
    (pd.DataFrame): cleaned DataFrame
    '''
    # columns removed from the dataset
    unused_columns = [
        "FIRE_CODE", "FIRE_NAME", "DISCOVERY_TIME", "CONT_DATE", "CONT_TIME",
        "FIRE_SIZE_CLASS", "COUNTY", "FIPS_CODE", "FIPS_NAME", "CLOSEST_CITY",
        "State", "STATE_CODE", "DURATION", "CAUSE_DESCR",
    ]
    # weather and demographic columns whose gaps are filled with the column mean
    mean_imputed_columns = [
        "tmax", "tmin", "prcp",
        "Total Population", "Average Household Size", "Median Age",
    ]

    trimmed = fires.drop(columns=unused_columns)
    imputed = {
        name: trimmed[name].fillna(trimmed[name].mean())
        for name in mean_imputed_columns
    }
    # dict unpacking is required because some column names contain spaces
    return trimmed.assign(**imputed)


def feature_engineering(fires: pd.DataFrame) -> pd.DataFrame:
    '''
    Compute new features based on the original features from the Fires dataset.

    Three calendar features are derived from DISCOVERY_DATE:
    DISCOVERY_DOW (day of week, Monday=0 ... Sunday=6), DISCOVERY_MONTH
    (1-12) and DISCOVERY_DAY (day of the month). DISCOVERY_DATE itself is
    kept in the result.

    The input DataFrame is not modified; the features are added to a new
    DataFrame so that re-running the step on the same input is safe.

    Input:
    fires (pd.DataFrame): input DataFrame; DISCOVERY_DATE must have a
        datetime dtype because the .dt accessor is used on it

    Output:
    (pd.DataFrame):  DataFrame with additional features
    '''
    discovered = fires["DISCOVERY_DATE"].dt

    # assign() returns a new frame instead of writing into the caller's one
    return fires.assign(
        DISCOVERY_DOW=discovered.dayofweek,
        DISCOVERY_MONTH=discovered.month,
        DISCOVERY_DAY=discovered.day,
    )


def encoding(fires: pd.DataFrame) -> pd.DataFrame:
    '''
    Replace the STATE column with one indicator column per distinct value.

    The indicator columns are named after the STATE values themselves (no
    prefix) and are appended after the existing columns; STATE is then
    removed. The input DataFrame is not modified.

    Input:
    fires (pd.DataFrame): input DataFrame containing a STATE column

    Output:
    (pd.DataFrame):  DataFrame with encoded categorical variables
    '''
    # append the indicators first, then drop the source column
    return pd.concat(
        [fires, pd.get_dummies(fires["STATE"])],
        axis=1,
    ).drop(columns=["STATE"])

In [4]:
# Load the merged dataset, run it through the three preparation steps in
# order (cleaning -> feature engineering -> encoding) and save the model input.
fires = (
    pd.read_csv(
        "./data/3_merge/merged_data_haversine.csv",
        parse_dates=["DISCOVERY_DATE", "CONT_DATE"],
    )
    .pipe(cleaning)
    .pipe(feature_engineering)
    .pipe(encoding)
)
fires.to_csv("./data/4_input_model/model_1.csv", index=False)

  interactivity=interactivity, compiler=compiler, result=result)
