In [1]:
import pandas as pd
import numpy as np
from typing import Tuple

#### Step 1. Prepare target value

In [2]:
def define_target_value(fires:pd.DataFrame, sizes:list, states:pd.DataFrame, min_date:str, max_date:str) -> pd.DataFrame:
    '''
    Define target value for the second model, which predicts the appearance of a fire at a given date for a given state.
    The final DataFrame contains one row per state and day, and a target value "FIRE":
    - equals to 1 if there is a fire
    - equals to 0 otherwise
    
    Input:
    fires (pd.DataFrame) : DataFrame with all fires
    sizes (list) : list of all fires sizes to consider
    states (pd.DataFrame) : a DataFrame with one column corresponding to all US states
    min_date (str) : minimum date for the output DataFrame
    max_date (str) : maximum date for the output DataFrame
    
    Output:
    (pd.DataFrame) : output DataFrame with the target value
    '''
    # aggregate per state and day
    fires = fires.loc[fires["FIRE_SIZE_CLASS"].isin(sizes), :] # keep selected sizes
    fires["DISCOVERY_DATE"] = pd.to_datetime(fires["DISCOVERY_DATE"] - pd.Timestamp(0).to_julian_date(), unit='D')
    fires_day = fires.groupby(["STATE", "DISCOVERY_DATE"]).agg({"FOD_ID": "count"}).reset_index()
    fires_day["FIRE"] = (fires_day["FOD_ID"] > 0).astype("int")
    fires_day.drop(columns=["FOD_ID"], inplace=True)

    # compute all combinations
    dates = pd.DataFrame(
        pd.date_range(min_date,max_date,freq='d'),
        columns=["DISCOVERY_DATE"]
    )
    combinations = dates.merge(states, how="cross")

    # merge all combinations with fires
    fires = pd.merge(combinations, fires_day, how="left", on=["STATE", "DISCOVERY_DATE"])
    fires["FIRE"] = fires["FIRE"].fillna(0).astype("int")
    return fires

In [3]:
def prep_train_test_datasets(fires: pd.DataFrame, min_year_test: int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    '''
    Create the train and test datasets.
    The train dataset contains features and target values (except for the target values after min_year_test).
    The test dataset contains target values after min_year_test.
    
    Input:
    fires (pd.DataFrame) : DataFrame with all fires after min_year_train
    min_year_test (int) : minimum year used for filtering for the test set (source of truth for target values)
    
    Output:
    train (pd.DataFrame) : DataFrame with features starting from min_year_train, and target values (except after min_year_test)
    test (pd.DataFrame) : DataFrame with target values after min_year_test
    dummy_submission (pd.DataFrame) : DataFrame similar to the test, to give the structure of the submission file
    '''

    # create a train set (for students)
    train = fires.copy()
    train.loc[train["DISCOVERY_DATE"] >= min_year_test, ["FIRE"]] = np.nan # replace target values for test by "NaN"

    # create a test set (source of truth)
    test = fires.loc[fires["DISCOVERY_DATE"] >= min_year_test, ['FIRE']].copy()

    # create a dummy sample for submission
    dummy_submission = test.copy()
    dummy_submission["FIRE"] = 0
    
    return train, test, dummy_submission

In [4]:
# inputs
sizes=["B", "C", "D", "E", "F", "G"]
fires = pd.read_csv("./data/1_raw/fires/fires_train.csv")
states = fires[["STATE"]].drop_duplicates()
min_date = "2011-01-01"
max_date = "2015-12-31"
min_year_test = "2015-01-01"

# get train, test and dummy_submission files
fires = define_target_value(fires, sizes, states, min_date, max_date)
train, test, dummy_submission = prep_train_test_datasets(fires, min_year_test)

# save results
train.to_csv("./data/1_raw/fires/fires_days_train.csv", index=False)
test.to_csv("./data/6_hidden/fires_days_test.csv", index=False)
dummy_submission.to_csv("./data/5_predictions/example_submission_model_2.csv", index=False)

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
