In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sqlite3
from typing import Tuple

#### Step 1. Read initial dataset

In [36]:
cnx = sqlite3.connect('./data/0_origin/fires/FPA_FOD_20170508.sqlite')
fires_all = pd.read_sql_query("SELECT * FROM 'Fires'", cnx)

#### Step 2. Downsampling to reduce the number of records

In [37]:
def show_years_cumsum(fires_all: pd.DataFrame, nb: int = 7) -> None:
    '''
    Shows the number of records (cumulative sum) for the last 'nb' years
    
    Input:
    fires_all (pd.DataFrame) : DataFrame with all fires
    nb (int) : number of years to show
    
    Output:
    None
    '''
    # See number of records per cumulative years (last nb years)
    fires_year = fires_all.groupby("FIRE_YEAR").agg({"FOD_ID": "count"}).sort_values("FIRE_YEAR", ascending=False)
    fires_year = fires_year.cumsum(axis=0)
    fires_year = fires_year.reset_index()
    fires_year.columns = ["FIRE_YEAR", "CUM SUM"]
    display(fires_year.head(nb))

In [38]:
show_years_cumsum(fires_all)

Unnamed: 0,FIRE_YEAR,CUM SUM
0,2015,74491
1,2014,142244
2,2013,207024
3,2012,279793
4,2011,370345
5,2010,450234
6,2009,528559


#### Step 3. Downsampling and tranformation

In [39]:
DESCR2CAT = {
    'Lightning': "natural",
    'Structure': "accidental",
    'Fireworks': "accidental",
    'Powerline': "accidental",
    'Railroad': "accidental",
    'Smoking': "accidental",
    'Children': "accidental",
    'Campfire': "accidental",
    'Equipment Use': "accidental",
    'Debris Burning': "accidental",
    'Arson': "criminal",
    'Missing/Undefined': "other",
    'Miscellaneous': "other"
}

CAT2ID = {
    "natural": 0,
    "accidental": 1,
    "criminal": 2,
    "other": 3
}

cols_drop = [
    "OBJECTID",
    "FPA_ID",
    "Shape",
    "FIRE_CODE",
    "MTBS_ID",
    "MTBS_FIRE_NAME",
    "COMPLEX_NAME",
    "DISCOVERY_DOY",
    "CONT_DOY",
    "COUNTY",
    "FIPS_CODE",
    "FIPS_NAME",
    "SOURCE_SYSTEM_TYPE",
    "SOURCE_SYSTEM",
    "NWCG_REPORTING_AGENCY",
    "NWCG_REPORTING_UNIT_ID",
    "NWCG_REPORTING_UNIT_NAME",
    "SOURCE_REPORTING_UNIT",
    "SOURCE_REPORTING_UNIT_NAME",
    "LOCAL_FIRE_REPORT_ID",
    "LOCAL_INCIDENT_ID",
    "OWNER_CODE",
    "OWNER_DESCR",
    "ICS_209_INCIDENT_NUMBER",
    "ICS_209_NAME",
]

def downsampling_transform_fires(fires_all: pd.DataFrame, min_year_train: int, cols_drop:list) -> pd.DataFrame:
    '''
    Downsampling of all fires records, by keeping only occurences after min_year_train.
    Remove useless columns.
    
    Input:
    fires_all (pd.DataFrame) : DataFrame with all fires
    min_year_train (int) : minimum year used for filtering for the train set, for students
    cols_drop (list) : columns to drop from the original DataFrame
    
    Output:
    (pd.DataFrame) : Filtered and transformed DataFrame
    '''
    # downsampling
    fires = fires_all.loc[fires_all["FIRE_YEAR"] >= min_year_train, :]

    # change target values with only 4 categories (code and description)
    fires["CAUSE_DESCR"] = fires["STAT_CAUSE_DESCR"].map(DESCR2CAT)
    fires["CAUSE_CODE"] = fires["CAUSE_DESCR"].map(CAT2ID)

    # remove useless columns
    fires = fires.drop(columns=["STAT_CAUSE_CODE", "STAT_CAUSE_DESCR", *cols_drop])
    
    return fires

#### Step 1. Prepare target value

In [40]:
def define_target_value(fires:pd.DataFrame, sizes:list, states:pd.DataFrame, min_date:str, max_date:str) -> pd.DataFrame:
    '''
    Define target value for the second model, which predicts the appearance of a fire at a given date for a given state.
    The final DataFrame contains one row per state and day, and a target value "FIRE":
    - equals to 1 if there is a fire
    - equals to 0 otherwise
    
    Input:
    fires (pd.DataFrame) : DataFrame with all fires
    sizes (list) : list of all fires sizes to consider
    states (pd.DataFrame) : a DataFrame with one column corresponding to all US states
    min_date (str) : minimum date for the output DataFrame
    max_date (str) : maximum date for the output DataFrame
    
    Output:
    (pd.DataFrame) : output DataFrame with the target value
    '''
    # aggregate per state and day
    fires = fires.loc[fires["FIRE_SIZE_CLASS"].isin(sizes), :] # keep selected sizes
    fires["DISCOVERY_DATE"] = pd.to_datetime(fires["DISCOVERY_DATE"] - pd.Timestamp(0).to_julian_date(), unit='D')
    fires_day = fires.groupby(["STATE", "DISCOVERY_DATE"]).agg({"FOD_ID": "count"}).reset_index()
    fires_day["FIRE"] = (fires_day["FOD_ID"] > 0).astype("int")
    fires_day.drop(columns=["FOD_ID"], inplace=True)

    # compute all combinations
    dates = pd.DataFrame(
        pd.date_range(min_date,max_date,freq='d'),
        columns=["DISCOVERY_DATE"]
    )
    combinations = dates.merge(states, how="cross")

    # merge all combinations with fires
    fires = pd.merge(combinations, fires_day, how="left", on=["STATE", "DISCOVERY_DATE"])
    fires["FIRE"] = fires["FIRE"].fillna(0).astype("int")
    return fires

In [41]:
def prep_train_test_datasets(fires: pd.DataFrame, min_year_test: int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    '''
    Create the train and test datasets.
    The train dataset contains features and target values (except for the target values after min_year_test).
    The test dataset contains target values after min_year_test.
    
    Input:
    fires (pd.DataFrame) : DataFrame with all fires after min_year_train
    min_year_test (int) : minimum year used for filtering for the test set (source of truth for target values)
    
    Output:
    train (pd.DataFrame) : DataFrame with features starting from min_year_train, and target values (except after min_year_test)
    test (pd.DataFrame) : DataFrame with target values after min_year_test
    dummy_submission (pd.DataFrame) : DataFrame similar to the test, to give the structure of the submission file
    '''

    # create a train set (for students)
    train = fires.copy()
    train.loc[train["DISCOVERY_DATE"] >= min_year_test, ["FIRE"]] = np.nan # replace target values for test by "NaN"

    # create a test set (source of truth)
    test = fires.loc[fires["DISCOVERY_DATE"] >= min_year_test, ['FIRE']].copy()

    # create a dummy sample for submission
    dummy_submission = test.copy()
    dummy_submission["FIRE"] = 0
    
    return train, test, dummy_submission

In [42]:
# inputs
sizes=["B", "C", "D", "E", "F", "G"]
min_year_train = 2011
max_year_train = 2014
min_date = "2011-01-01"
max_date = "2015-12-31"
min_year_test = "2015-01-01"

# create train and test datasets
fires = downsampling_transform_fires(fires_all, min_year_train, cols_drop)
states = fires[["STATE"]].drop_duplicates()
fires_train = fires[fires["FIRE_YEAR"] <= max_year_train].copy()

# get train, test and dummy_submission files
fires = define_target_value(fires, sizes, states, min_date, max_date)
train, test, dummy_submission = prep_train_test_datasets(fires, min_year_test)

# save results
fires_train.to_csv("./data/1_raw/fires/fires.csv", index=False)
train.to_csv("./data/1_raw/fires/fires_days_train.csv", index=False)
test.to_csv("./data/5_hidden/fires_days_test.csv", index=False)
dummy_submission.to_csv("./data/4_predictions/example_submission.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
