In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from utils import check_duplicates

In [2]:
# Label printed for the boolean outcome of a data-quality check.
checks = {
    True: "OK",
    False: "NOK",
}

### Step 1. Data Cleaning

Fires: Table including wildfire data for the period of 1992-2015 compiled from US federal, state, and local reporting systems.

#### Identifier
* [x] FOD_ID = Global unique identifier.

#### Fire
* [x] FIRE_CODE = Code used within the interagency wildland fire community to track and compile cost information for emergency fire suppression (https://www.firecode.gov/).
* [x] FIRE_NAME = Name of the incident, from the fire report (primary) or ICS-209 report (secondary).
* [x] FIRE_SIZE = Estimate of acres within the final perimeter of the fire.
* [x] FIRE_SIZE_CLASS = Code for fire size based on the number of acres within the final fire perimeter (A=greater than 0 but less than or equal to 0.25 acres, B=0.26-9.9 acres, C=10.0-99.9 acres, D=100-299 acres, E=300 to 999 acres, F=1000 to 4999 acres, and G=5000+ acres).
* ICS209INCIDENT_NUMBER = Incident (event) identifier, from the ICS-209 report.
* ICS209NAME = Name of the incident, from the ICS-209 report.
* MTBS_ID = Incident identifier, from the MTBS perimeter dataset.
* MTBSFIRENAME = Name of the incident, from the MTBS perimeter dataset.
* COMPLEX_NAME = Name of the complex under which the fire was ultimately managed, when discernible.

#### Dates
* [x] FIRE_YEAR = Calendar year in which the fire was discovered or confirmed to exist.
* [x] DISCOVERY_DATE = Date on which the fire was discovered or confirmed to exist.
* [x] DISCOVERY_TIME = Time of day that the fire was discovered or confirmed to exist.
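* [x] CONT_DATE = Date on which the fire was declared contained or otherwise controlled (mm/dd/yyyy where mm=month, dd=day, and yyyy=year, according to the source documentation; the raw file read in this notebook stores it as a Julian day number, which the cleaning step converts to a calendar date).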
* [x] CONT_TIME = Time of day that the fire was declared contained or otherwise controlled (hhmm where hh=hour, mm=minutes).
* DISCOVERY_DOY = Day of year on which the fire was discovered or confirmed to exist.
* CONT_DOY = Day of year on which the fire was declared contained or otherwise controlled.


#### Place
* [x] LATITUDE = Latitude (NAD83) for point location of the fire (decimal degrees).
* [x] LONGITUDE = Longitude (NAD83) for point location of the fire (decimal degrees).
* [x] STATE = Two-letter alphabetic code for the state in which the fire burned (or originated), based on the nominal designation in the fire report.
* [x] COUNTY = County, or equivalent, in which the fire burned (or originated), based on nominal designation in the fire report.
* [x] FIPS_CODE = Three-digit code from the Federal Information Processing Standards (FIPS) publication 6-4 for representation of counties and equivalent entities.
* [x] FIPS_NAME = County name from the FIPS publication 6-4 for representation of counties and equivalent entities.

#### Cause
* [x] CAUSE_CODE = Code for the cause of the fire.
* [x] CAUSE_DESCR = Description of the cause of the fire.

#### Source & Reporting
* SOURCESYSTEMTYPE = Type of source database or system that the record was drawn from (federal, nonfederal, or interagency).
* SOURCESYSTEM = Name of or other identifier for source database or system that the record was drawn from. See Table 1 in Short (2014), or \Supplements\FPAFODsourcelist.pdf, for a list of sources and their identifier.
* NWCGREPORTINGAGENCY = Active National Wildfire Coordinating Group (NWCG) Unit Identifier for the agency preparing the fire report (BIA = Bureau of Indian Affairs, BLM = Bureau of Land Management, BOR = Bureau of Reclamation, DOD = Department of Defense, DOE = Department of Energy, FS = Forest Service, FWS = Fish and Wildlife Service, IA = Interagency Organization, NPS = National Park Service, ST/C&L = State, County, or Local Organization, and TRIBE = Tribal Organization).
* NWCGREPORTINGUNIT_ID = Active NWCG Unit Identifier for the unit preparing the fire report.
* NWCGREPORTINGUNIT_NAME = Active NWCG Unit Name for the unit preparing the fire report.
* SOURCEREPORTINGUNIT = Code for the agency unit preparing the fire report, based on code/name in the source dataset.
* SOURCEREPORTINGUNIT_NAME = Name of reporting agency unit preparing the fire report, based on code/name in the source dataset.
* LOCALFIREREPORT_ID = Number or code that uniquely identifies an incident report for a particular reporting unit and a particular calendar year.
* LOCALINCIDENTID = Number or code that uniquely identifies an incident for a particular local fire management organization within a particular calendar year.
* OWNER_CODE = Code for primary owner or entity responsible for managing the land at the point of origin of the fire at the time of the incident.
* OWNER_DESCR = Name of primary owner or entity responsible for managing the land at the point of origin of the fire at the time of the incident.

In [3]:
def convert_datetime(x: pd.Series, opt: str) -> "datetime | None":
    '''
    Combine the date and the "hhmm" time of day of one row into a datetime.

    Input:
    x (pd.Series): row of the input DataFrame. It must hold the entries
        opt + "_DATE" (a date-like object exposing year / month / day) and
        opt + "_TIME" (time of day encoded as the number hhmm, e.g. 930.0
        or "0930" for 09:30).
    opt (str): options for the columns name. Either "DISCOVERY" or "CONT".

    Output:
    (datetime): the calendar day of the date entry at the encoded hour and
        minute, or None when either the date or the time is missing.

    Raises:
    ValueError: if the time does not encode a valid hour (00-23) and
        minute (00-59).
    '''
    raw_time = x[opt + "_TIME"]
    raw_date = x[opt + "_DATE"]

    # pd.isnull handles NaN, None, NaT and pd.NA alike, whereas np.isnan
    # raises TypeError on anything that is not a float (e.g. a string time).
    if pd.isnull(raw_time) or pd.isnull(raw_date):
        return None

    # Going through float first also accepts string values such as "1300.0".
    hours, minutes = divmod(int(float(raw_time)), 100)

    # Only the calendar day of the date entry is used; a time-of-day part
    # carried by the date itself, if any, is dropped.
    return datetime(raw_date.year, raw_date.month, raw_date.day, hours, minutes)

def cleaning_fires(fires: pd.DataFrame, cols:list) -> pd.DataFrame:
    '''
    Restrict the Fires table to the requested columns and normalise its
    date and time fields.

    The DISCOVERY_DATE and CONT_DATE columns are read as Julian day numbers
    and turned into calendar dates. The DISCOVERY_TIME and CONT_TIME columns
    are then overwritten with the value returned by convert_datetime for each
    row (date and time of day combined).

    Input:
    fires (pd.DataFrame): input DataFrame
    cols (list): list of columns to keep; the four date/time columns named
        above must be part of it

    Output:
    (pd.DataFrame): cleaned DataFrame
    '''
    prefixes = ("DISCOVERY", "CONT")

    # Julian day number of the Unix epoch, i.e. the offset between both scales
    julian_epoch = pd.Timestamp(0).to_julian_date()

    # keep only the columns of interest
    selected = fires.loc[:, cols]

    # Julian day number -> days since the Unix epoch -> calendar date
    for prefix in prefixes:
        date_col = prefix + "_DATE"
        selected[date_col] = pd.to_datetime(selected[date_col] - julian_epoch, unit='D')

    # replace each time of day by the full datetime, where both parts exist
    for prefix in prefixes:
        selected[prefix + "_TIME"] = selected.apply(convert_datetime, axis=1, args=(prefix,))

    return selected

def feature_engineering_fires(fires: pd.DataFrame) -> pd.DataFrame:
    '''
    Compute new features based on the original features from the Fires dataset.

    Currently a single feature is added:
    DURATION = seconds elapsed between DISCOVERY_TIME and CONT_TIME
    (NaN when either of the two is missing).

    The column is added to the DataFrame that is passed in, and that same
    object is returned.

    Input:
    fires (pd.DataFrame): input DataFrame, holding the datetime columns
        DISCOVERY_TIME and CONT_TIME

    Output:
    (pd.DataFrame):  DataFrame with additional features
    '''
    # TODO: further engineered features are still to be added.

    # pd.to_datetime leaves datetime64 columns as they are, but also copes
    # with object columns (every value missing, or strings read back from a
    # CSV file), on which the subtraction / `.dt` accessor would fail.
    contained = pd.to_datetime(fires["CONT_TIME"])
    discovered = pd.to_datetime(fires["DISCOVERY_TIME"])

    fires["DURATION"] = (contained - discovered).dt.total_seconds()

    return fires

In [4]:
# columns of the raw Fires table that are kept in the cleaned dataset
cols = [
    'FOD_ID', 'FIRE_YEAR', 'DISCOVERY_DATE', 'DISCOVERY_TIME',
    'CONT_DATE', 'CONT_TIME', 'FIRE_SIZE', 'FIRE_SIZE_CLASS',
    'LATITUDE', 'LONGITUDE', 'STATE', 'CAUSE_CODE', 'CAUSE_DESCR'
]

# cleaning and feature engineering
# usecols: parse only the columns that are kept afterwards.
# low_memory=False: infer each dtype from the whole column rather than chunk
# by chunk, which is what triggers pandas' mixed-types DtypeWarning.
fires = pd.read_csv(
    "./data/1_raw/fires/fires_train.csv",
    usecols=cols,
    low_memory=False,
)
fires = cleaning_fires(fires, cols) # clean file
fires = feature_engineering_fires(fires) # feature_engineering

# check duplicate values
# Any result that is not exactly True/False is reported with the failure
# label instead of printing a bare boolean.
c = checks.get(check_duplicates(fires, ["FOD_ID"]), checks[False])
print(f"Check duplicates: {c}")

# save to csv
fires.to_csv("./data/2_clean/fires.csv", index=False)

  interactivity=interactivity, compiler=compiler, result=result)


Check duplicates: OK
