In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sqlite3
from typing import Tuple

#### Step 1. Read initial dataset

In [7]:
cnx = sqlite3.connect('./data/0_origin/fires/FPA_FOD_20170508.sqlite')
fires_all = pd.read_sql_query("SELECT * FROM 'Fires'", cnx)

#### Step 2. Downsampling to reduce the number of records

In [8]:
def show_years_cumsum(fires_all: pd.DataFrame, nb: int = 7) -> None:
    '''
    Shows the number of records (cumulative sum) for the last 'nb' years
    
    Input:
    fires_all (pd.DataFrame) : DataFrame with all fires
    nb (int) : number of years to show
    
    Output:
    None
    '''
    # See number of records per cumulative years (last nb years)
    fires_year = fires_all.groupby("FIRE_YEAR").agg({"FOD_ID": "count"}).sort_values("FIRE_YEAR", ascending=False)
    fires_year = fires_year.cumsum(axis=0)
    fires_year = fires_year.reset_index()
    fires_year.columns = ["FIRE_YEAR", "CUM SUM"]
    display(fires_year.head(nb))

show_years_cumsum(fires_all)

Unnamed: 0,FIRE_YEAR,CUM SUM
0,2015,74491
1,2014,142244
2,2013,207024
3,2012,279793
4,2011,370345
5,2010,450234
6,2009,528559


#### Step 3. Data preparation: compute target values & split open and hidden data

In [9]:
descr2cat = {
    'Lightning': "natural",
    'Structure': "accidental",
    'Fireworks': "accidental",
    'Powerline': "accidental",
    'Railroad': "accidental",
    'Smoking': "accidental",
    'Children': "accidental",
    'Campfire': "accidental",
    'Equipment Use': "accidental",
    'Debris Burning': "accidental",
    'Arson': "criminal",
    'Missing/Undefined': "other",
    'Miscellaneous': "other"
}

cat2id = {
    "natural": 0,
    "accidental": 1,
    "criminal": 2,
    "other": 3
}

def downsampling(fires_all: pd.DataFrame, min_year_train: int) -> pd.DataFrame:
    '''
    Downsampling of all fires records, by keeping only occurences after min_year_train.
    Remove useless columns.
    
    Input:
    fires_all (pd.DataFrame) : DataFrame with all fires
    min_year_train (int) : minimum year used for filtering for the train set, for students
    
    Output:
    (pd.DataFrame) : DataFrame with features and target values after min_year_train
    '''
    # downsampling
    fires = fires_all.loc[fires_all["FIRE_YEAR"] >= min_year_train, :]

    # change target values with only 4 categories (code and description)
    fires["CAUSE_DESCR"] = fires["STAT_CAUSE_DESCR"].map(descr2cat)
    fires["CAUSE_CODE"] = fires["CAUSE_DESCR"].map(cat2id)
    fires = fires.drop(columns=["STAT_CAUSE_CODE", "STAT_CAUSE_DESCR", "OBJECTID", "FPA_ID", "Shape"])
    return fires

def prep_train_test_datasets(fires: pd.DataFrame, min_year_test: int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    '''
    Create the train and test datasets.
    The train dataset contains features and target values (except for the target values after min_year_test).
    The test dataset contains target values after min_year_test.
    
    Input:
    fires (pd.DataFrame) : DataFrame with all fires after min_year_train
    min_year_test (int) : minimum year used for filtering for the test set (source of truth for target values)
    
    Output:
    train (pd.DataFrame) : DataFrame with features starting from min_year_train, and target values (except after min_year_test)
    test (pd.DataFrame) : DataFrame with target values after min_year_test
    dummy_submission (pd.DataFrame) : DataFrame similar to the test, to give the structure of the submission file
    '''

    # create a train set (for students)
    train = fires.copy()
    for c in ['CAUSE_CODE', 'CAUSE_DESCR']:
        train.loc[train["FIRE_YEAR"] >= min_year_test, [c]] = np.nan # replace target values for test by "NaN"

    # create a test set (source of truth)
    test = fires.loc[fires["FIRE_YEAR"] >= min_year_test, ['FOD_ID', 'CAUSE_CODE']].copy()

    # create a dummy sample for submission
    dummy_submission = test.copy()
    dummy_submission["CAUSE_CODE"] = 0
    
    return train, test, dummy_submission

In [11]:
# create train and test datasets
min_year_train = 2011
min_year_test = 2015
fires = downsampling(fires_all, min_year_train)
train, test, dummy_submission = prep_train_test_datasets(fires, min_year_test)

# save results
train.to_csv("./data/1_raw/fires/fires_train.csv", index=False)
test.to_csv("./data/6_hidden/fires_test.csv", index=False)
dummy_submission.to_csv("./data/5_predictions/example_submission.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
