# Data Cleaning

### Import Libraries

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn import tree, preprocessing
import sklearn.ensemble as ske
from sklearn.model_selection import train_test_split

In [21]:
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

### Import CSV

In [22]:
fires = pd.read_csv('../data/fires.csv', low_memory=False)
fires.head()

Unnamed: 0.1,Unnamed: 0,OBJECTID,FOD_ID,FPA_ID,SOURCE_SYSTEM_TYPE,SOURCE_SYSTEM,NWCG_REPORTING_AGENCY,NWCG_REPORTING_UNIT_ID,NWCG_REPORTING_UNIT_NAME,SOURCE_REPORTING_UNIT,SOURCE_REPORTING_UNIT_NAME,LOCAL_FIRE_REPORT_ID,LOCAL_INCIDENT_ID,FIRE_CODE,FIRE_NAME,ICS_209_INCIDENT_NUMBER,ICS_209_NAME,MTBS_ID,MTBS_FIRE_NAME,COMPLEX_NAME,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,DISCOVERY_TIME,STAT_CAUSE_CODE,STAT_CAUSE_DESCR,CONT_DATE,CONT_DOY,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,OWNER_CODE,OWNER_DESCR,STATE,COUNTY,FIPS_CODE,FIPS_NAME,Shape
0,0,1,1,FS-1418826,FED,FS-FIRESTAT,FS,USCAPNF,Plumas National Forest,511,Plumas National Forest,1,PNF-47,BJ8K,FOUNTAIN,,,,,,2005,2453403.5,33,1300.0,9.0,Miscellaneous,2453403.5,33.0,1730.0,0.1,A,40.036944,-121.005833,5.0,USFS,CA,63,63.0,Plumas,b'\x00\x01\xad\x10\x00\x00\xe8d\xc2\x92_@^\xc0...
1,1,2,2,FS-1418827,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,13,13,AAC0,PIGEON,,,,,,2004,2453137.5,133,845.0,1.0,Lightning,2453137.5,133.0,1530.0,0.25,A,38.933056,-120.404444,5.0,USFS,CA,61,61.0,Placer,b'\x00\x01\xad\x10\x00\x00T\xb6\xeej\xe2\x19^\...
2,2,3,3,FS-1418835,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,27,021,A32W,SLACK,,,,,,2004,2453156.5,152,1921.0,5.0,Debris Burning,2453156.5,152.0,2024.0,0.1,A,38.984167,-120.735556,13.0,STATE OR PRIVATE,CA,17,17.0,El Dorado,b'\x00\x01\xad\x10\x00\x00\xd0\xa5\xa0W\x13/^\...
3,3,4,4,FS-1418845,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,43,6,,DEER,,,,,,2004,2453184.5,180,1600.0,1.0,Lightning,2453189.5,185.0,1400.0,0.1,A,38.559167,-119.913333,5.0,USFS,CA,3,3.0,Alpine,b'\x00\x01\xad\x10\x00\x00\x94\xac\xa3\rt\xfa]...
4,4,5,5,FS-1418847,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,44,7,,STEVENOT,,,,,,2004,2453184.5,180,1600.0,1.0,Lightning,2453189.5,185.0,1200.0,0.1,A,38.559167,-119.933056,5.0,USFS,CA,3,3.0,Alpine,b'\x00\x01\xad\x10\x00\x00@\xe3\xaa.\xb7\xfb]\...


### Narrow Down the Columns

In [23]:
features = ['NWCG_REPORTING_UNIT_NAME',
            'FIRE_NAME',
            'COMPLEX_NAME', 
            'FIRE_YEAR', 
            'DISCOVERY_DATE', 
            'DISCOVERY_DOY',
            'DISCOVERY_TIME', 
            'STAT_CAUSE_DESCR', 
            'CONT_DATE',
            'CONT_DOY', 
            'CONT_TIME', 
            'FIRE_SIZE', 
            'FIRE_SIZE_CLASS', 
            'LATITUDE',
            'LONGITUDE', 
            'STATE', 
            'COUNTY']

In [24]:
df = fires[features]
df.head()

Unnamed: 0,NWCG_REPORTING_UNIT_NAME,FIRE_NAME,COMPLEX_NAME,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DATE,CONT_DOY,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,STATE,COUNTY
0,Plumas National Forest,FOUNTAIN,,2005,2453403.5,33,1300.0,Miscellaneous,2453403.5,33.0,1730.0,0.1,A,40.036944,-121.005833,CA,63
1,Eldorado National Forest,PIGEON,,2004,2453137.5,133,845.0,Lightning,2453137.5,133.0,1530.0,0.25,A,38.933056,-120.404444,CA,61
2,Eldorado National Forest,SLACK,,2004,2453156.5,152,1921.0,Debris Burning,2453156.5,152.0,2024.0,0.1,A,38.984167,-120.735556,CA,17
3,Eldorado National Forest,DEER,,2004,2453184.5,180,1600.0,Lightning,2453189.5,185.0,1400.0,0.1,A,38.559167,-119.913333,CA,3
4,Eldorado National Forest,STEVENOT,,2004,2453184.5,180,1600.0,Lightning,2453189.5,185.0,1200.0,0.1,A,38.559167,-119.933056,CA,3


In [25]:
fires.shape, df.shape

((1880465, 40), (1880465, 17))

### Check the Data Types

- DISCOVERY_DATE, CONT_DATE from float64 to date/time

The DISCOVERY_DATE and CONT_DATE field are in Julian Date format. I want to convert this field to Gregorian dates (the date format we use everyday). I can then use the Gregorian date to get some additional data fields - month and day of the week.

In [26]:
df.dtypes

NWCG_REPORTING_UNIT_NAME     object
FIRE_NAME                    object
COMPLEX_NAME                 object
FIRE_YEAR                     int64
DISCOVERY_DATE              float64
DISCOVERY_DOY                 int64
DISCOVERY_TIME              float64
STAT_CAUSE_DESCR             object
CONT_DATE                   float64
CONT_DOY                    float64
CONT_TIME                   float64
FIRE_SIZE                   float64
FIRE_SIZE_CLASS              object
LATITUDE                    float64
LONGITUDE                   float64
STATE                        object
COUNTY                       object
dtype: object

In [27]:
# Convert to Date/Time format
df['discovery_date'] = pd.to_datetime(df['DISCOVERY_DATE'] - pd.Timestamp(0).to_julian_date(), unit='D')
df['cont_date'] = pd.to_datetime(df['CONT_DATE'] - pd.Timestamp(0).to_julian_date(), unit='D')

# Drop old columns
df.drop(columns=['DISCOVERY_DATE', 'CONT_DATE'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['discovery_date'] = pd.to_datetime(df['DISCOVERY_DATE'] - pd.Timestamp(0).to_julian_date(), unit='D')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cont_date'] = pd.to_datetime(df['CONT_DATE'] - pd.Timestamp(0).to_julian_date(), unit='D')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [28]:
# Pull out the month and day of week from the date/time columns

df['discovery_month'] = pd.DatetimeIndex(df['discovery_date']).month
df['discovery_day_of_week'] = df['discovery_date'].dt.weekday

df['cont_month'] = pd.DatetimeIndex(df['cont_date']).month
df['cont_day_of_week'] = df['cont_date'].dt.weekday

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['discovery_month'] = pd.DatetimeIndex(df['discovery_date']).month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['discovery_day_of_week'] = df['discovery_date'].dt.weekday
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cont_month'] = pd.DatetimeIndex(df['cont_date']).month
A value is tryi

Unnamed: 0,NWCG_REPORTING_UNIT_NAME,FIRE_NAME,COMPLEX_NAME,FIRE_YEAR,DISCOVERY_DOY,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DOY,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,STATE,COUNTY,discovery_date,cont_date,discovery_month,discovery_day_of_week,cont_month,cont_day_of_week
0,Plumas National Forest,FOUNTAIN,,2005,33,1300.0,Miscellaneous,33.0,1730.0,0.1,A,40.036944,-121.005833,CA,63,2005-02-02,2005-02-02,2,2,2.0,2.0
1,Eldorado National Forest,PIGEON,,2004,133,845.0,Lightning,133.0,1530.0,0.25,A,38.933056,-120.404444,CA,61,2004-05-12,2004-05-12,5,2,5.0,2.0
2,Eldorado National Forest,SLACK,,2004,152,1921.0,Debris Burning,152.0,2024.0,0.1,A,38.984167,-120.735556,CA,17,2004-05-31,2004-05-31,5,0,5.0,0.0
3,Eldorado National Forest,DEER,,2004,180,1600.0,Lightning,185.0,1400.0,0.1,A,38.559167,-119.913333,CA,3,2004-06-28,2004-07-03,6,0,7.0,5.0
4,Eldorado National Forest,STEVENOT,,2004,180,1600.0,Lightning,185.0,1200.0,0.1,A,38.559167,-119.933056,CA,3,2004-06-28,2004-07-03,6,0,7.0,5.0


In [30]:
day_dict = {
            0:'monday',
            1:'tuesday',
            2:'wednesday',
            3:'thursday',
            4:'friday',
            5:'saturday',
            6:'sunday'}

df['discovery_day_of_week_word'] = df['discovery_day_of_week'].map(day_dict)
df['cont_day_of_week_word'] = df['cont_day_of_week'].map(day_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['discovery_day_of_week_word'] = df['discovery_day_of_week'].map(day_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cont_day_of_week_word'] = df['cont_day_of_week'].map(day_dict)


In [31]:
df.head()

Unnamed: 0,NWCG_REPORTING_UNIT_NAME,FIRE_NAME,COMPLEX_NAME,FIRE_YEAR,DISCOVERY_DOY,DISCOVERY_TIME,STAT_CAUSE_DESCR,CONT_DOY,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,STATE,COUNTY,discovery_date,cont_date,discovery_month,discovery_day_of_week,cont_month,cont_day_of_week,discovery_day_of_week_word,cont_day_of_week_word
0,Plumas National Forest,FOUNTAIN,,2005,33,1300.0,Miscellaneous,33.0,1730.0,0.1,A,40.036944,-121.005833,CA,63,2005-02-02,2005-02-02,2,2,2.0,2.0,wednesday,wednesday
1,Eldorado National Forest,PIGEON,,2004,133,845.0,Lightning,133.0,1530.0,0.25,A,38.933056,-120.404444,CA,61,2004-05-12,2004-05-12,5,2,5.0,2.0,wednesday,wednesday
2,Eldorado National Forest,SLACK,,2004,152,1921.0,Debris Burning,152.0,2024.0,0.1,A,38.984167,-120.735556,CA,17,2004-05-31,2004-05-31,5,0,5.0,0.0,monday,monday
3,Eldorado National Forest,DEER,,2004,180,1600.0,Lightning,185.0,1400.0,0.1,A,38.559167,-119.913333,CA,3,2004-06-28,2004-07-03,6,0,7.0,5.0,monday,saturday
4,Eldorado National Forest,STEVENOT,,2004,180,1600.0,Lightning,185.0,1200.0,0.1,A,38.559167,-119.933056,CA,3,2004-06-28,2004-07-03,6,0,7.0,5.0,monday,saturday


Lowercase the column titles

In [32]:
df.columns = df.columns.str.lower()

In [33]:
df.head()

Unnamed: 0,nwcg_reporting_unit_name,fire_name,complex_name,fire_year,discovery_doy,discovery_time,stat_cause_descr,cont_doy,cont_time,fire_size,fire_size_class,latitude,longitude,state,county,discovery_date,cont_date,discovery_month,discovery_day_of_week,cont_month,cont_day_of_week,discovery_day_of_week_word,cont_day_of_week_word
0,Plumas National Forest,FOUNTAIN,,2005,33,1300.0,Miscellaneous,33.0,1730.0,0.1,A,40.036944,-121.005833,CA,63,2005-02-02,2005-02-02,2,2,2.0,2.0,wednesday,wednesday
1,Eldorado National Forest,PIGEON,,2004,133,845.0,Lightning,133.0,1530.0,0.25,A,38.933056,-120.404444,CA,61,2004-05-12,2004-05-12,5,2,5.0,2.0,wednesday,wednesday
2,Eldorado National Forest,SLACK,,2004,152,1921.0,Debris Burning,152.0,2024.0,0.1,A,38.984167,-120.735556,CA,17,2004-05-31,2004-05-31,5,0,5.0,0.0,monday,monday
3,Eldorado National Forest,DEER,,2004,180,1600.0,Lightning,185.0,1400.0,0.1,A,38.559167,-119.913333,CA,3,2004-06-28,2004-07-03,6,0,7.0,5.0,monday,saturday
4,Eldorado National Forest,STEVENOT,,2004,180,1600.0,Lightning,185.0,1200.0,0.1,A,38.559167,-119.933056,CA,3,2004-06-28,2004-07-03,6,0,7.0,5.0,monday,saturday


In [34]:
df.dtypes

nwcg_reporting_unit_name              object
fire_name                             object
complex_name                          object
fire_year                              int64
discovery_doy                          int64
discovery_time                       float64
stat_cause_descr                      object
cont_doy                             float64
cont_time                            float64
fire_size                            float64
fire_size_class                       object
latitude                             float64
longitude                            float64
state                                 object
county                                object
discovery_date                datetime64[ns]
cont_date                     datetime64[ns]
discovery_month                        int64
discovery_day_of_week                  int64
cont_month                           float64
cont_day_of_week                     float64
discovery_day_of_week_word            object
cont_day_o

In [35]:
df.isna().sum()

nwcg_reporting_unit_name            0
fire_name                      960479
complex_name                  1875282
fire_year                           0
discovery_doy                       0
discovery_time                 882638
stat_cause_descr                    0
cont_doy                       891531
cont_time                      972553
fire_size                           0
fire_size_class                     0
latitude                            0
longitude                           0
state                               0
county                         678148
discovery_date                      0
cont_date                      891531
discovery_month                     0
discovery_day_of_week               0
cont_month                     891531
cont_day_of_week               891531
discovery_day_of_week_word          0
cont_day_of_week_word          891531
dtype: int64

# Save Dataframes to CSV Files

In [36]:
# df.to_csv('../data/clean_copy.csv')

In [38]:
df_CA = df[df['state']=='CA']
df_CA.head(3)

Unnamed: 0,nwcg_reporting_unit_name,fire_name,complex_name,fire_year,discovery_doy,discovery_time,stat_cause_descr,cont_doy,cont_time,fire_size,fire_size_class,latitude,longitude,state,county,discovery_date,cont_date,discovery_month,discovery_day_of_week,cont_month,cont_day_of_week,discovery_day_of_week_word,cont_day_of_week_word
0,Plumas National Forest,FOUNTAIN,,2005,33,1300.0,Miscellaneous,33.0,1730.0,0.1,A,40.036944,-121.005833,CA,63,2005-02-02,2005-02-02,2,2,2.0,2.0,wednesday,wednesday
1,Eldorado National Forest,PIGEON,,2004,133,845.0,Lightning,133.0,1530.0,0.25,A,38.933056,-120.404444,CA,61,2004-05-12,2004-05-12,5,2,5.0,2.0,wednesday,wednesday
2,Eldorado National Forest,SLACK,,2004,152,1921.0,Debris Burning,152.0,2024.0,0.1,A,38.984167,-120.735556,CA,17,2004-05-31,2004-05-31,5,0,5.0,0.0,monday,monday


In [39]:
# df_CA.to_csv('../data/california.csv')