# Exploratory Data Analysis: c_summary, c_daily, i_fod, and i_ref.

This notebook explores the data given to us by our partner, Lise Ann St. Denis of the Colorado-Wyoming Resilience Engine. In reviewing the datasets, we will examine the types of data and the number of records, and we will perform some initial statistical analysis.

In [None]:
# import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
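# Colab only: uncomment to mount Google Drive before loading the CSVs.
# NOTE(review): `drive` is not imported anywhere in this notebook; presumably
# `from google.colab import drive` is needed first — confirm.
# drive.mount('/content/drive', force_remount=True)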


In [1]:
from IPython.display import display, HTML

def show_df(df, height=400):
    """Render a DataFrame-like object as a scrollable HTML table.

    Parameters
    ----------
    df : pandas.DataFrame, pandas.Series, DataFrameGroupBy or SeriesGroupBy
        A DataFrame is rendered as is. A Series is first converted to a
        one-column DataFrame. A GroupBy object is summarised to one row per
        group, with the group size in a ``count`` column.
    height : int, default 400
        Height of the scrollable container, in pixels.

    Returns
    -------
    None
        The table is emitted through ``display(HTML(...))``.
    """
    groupby_types = (
        pd.core.groupby.DataFrameGroupBy,
        pd.core.groupby.SeriesGroupBy,
    )

    if isinstance(df, groupby_types):
        # GroupBy objects have no to_html(); collapse them to group sizes.
        # Swap size() for another aggregation (sum, mean, ...) if needed.
        df = df.size().reset_index(name='count')
    elif isinstance(df, pd.Series):
        # Series has no to_html(); promote it to a one-column frame.
        df = df.to_frame()

    # max_rows/max_cols=None asks to_html for the full, untruncated table;
    # the surrounding div provides the scrolling.
    display(HTML(f"""
    <div style="height:{height}px; overflow:auto">
        {df.to_html(max_rows=None, max_cols=None)}
    </div>
    """))

In [None]:
# Data locations used by the loading cell below (filepath1..filepath4).
# Whoever runs the notebook should point DATA_DIR at their own copy of the
# CSV files; only this one line needs to change.
# Sara's Google Drive folder (Drive must be mounted for this path to exist):
DATA_DIR = '/content/drive/MyDrive/Earth Analytics ICS209'

filepath1 = f'{DATA_DIR}/ics209plus-wf_complex-assoc-daily_2014to2023-draft.csv'
filepath2 = f'{DATA_DIR}/ics209plus-wf_incidents-reference_2014to2023.csv'
filepath3 = f'{DATA_DIR}/ics209plus-wf-complex-summary-draft_2014to2023.csv'
filepath4 = f'{DATA_DIR}/ics209plus-wf-incidents-fod-draft_1999to2023.csv'


In [5]:
# Read the four CSV extracts, in order, from filepath1..filepath4.
c_daily, i_ref, c_summary, i_fod = (
    pd.read_csv(csv_path)
    for csv_path in (filepath1, filepath2, filepath3, filepath4)
)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Earth Analytics ICS209/ics209plus-wf_complex-assoc-daily_2014to2023-draft.csv'

In [None]:
# Peek at the first five rows of c_daily.
# Author's note: there are multiple entries per fire, so rows need to be
# grouped by incident ID before any meaningful statistics or visualization.
c_daily.head()

In [None]:

# List the column labels available in c_daily.
c_daily.columns

In [None]:
# pandas' default descriptive-statistics summary of c_daily.
c_daily.describe()

In [None]:
# Print c_daily's column dtypes, non-null counts and memory usage.
c_daily.info()

In [None]:
# Number of distinct values in every c_daily column, printed one per line.
unique_counts = c_daily.nunique()

for col_name, n_distinct in zip(unique_counts.index, unique_counts.to_numpy()):
    print(f"{col_name}: {n_distinct} ")

In [None]:
# Convert the REPORT_TO_DATE column to datetime so it can be compared and sorted.
c_daily['REPORT_TO_DATE'] = c_daily['REPORT_TO_DATE'].pipe(pd.to_datetime)

In [None]:
# For every complex incident keep only the row with the greatest
# REPORT_TO_DATE, restricted to the id, report date and cost columns,
# and renumber the result from 0.
latest_costs = (
    c_daily
    .loc[
        c_daily.groupby('CPLX_INCIDENT_ID')['REPORT_TO_DATE'].idxmax(),
        ['CPLX_INCIDENT_ID', 'REPORT_TO_DATE', 'EST_IM_COST_TO_DATE'],
    ]
    .reset_index(drop=True)
)

# Preview the first rows.
print(latest_costs.head())


In [None]:
# Rows per complex incident, most frequent first.
incident_counts = c_daily.CPLX_INCIDENT_ID.value_counts()
print(incident_counts.iloc[:10])  # the ten incidents with the most rows

print(
    "Number of incidents with more than 1 entry:",
    incident_counts.gt(1).sum(),
)

In [None]:
# Box plot of EST_IM_COST_TO_DATE using one row per complex incident
# (`latest_costs` holds each incident's most recent report). The title makes
# this figure distinguishable from a box plot drawn over every daily row.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=latest_costs['EST_IM_COST_TO_DATE'], ax=ax)
ax.set_title('EST_IM_COST_TO_DATE at latest report, per complex incident')
ax.set_xlabel('EST_IM_COST_TO_DATE')
ax.grid(True)
plt.show()

In [None]:
# Box plot of EST_IM_COST_TO_DATE over every row of c_daily
# (so an incident with several daily rows contributes several points).
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=c_daily['EST_IM_COST_TO_DATE'], ax=ax)
ax.set_title('Box Plot of EST_IM_COST_TO_DATE')
ax.set_xlabel('EST_IM_COST_TO_DATE')
ax.grid(True)
plt.show()

In [None]:
# Most recent report row for every complex incident (the row holding the
# maximum REPORT_TO_DATE within each CPLX_INCIDENT_ID). `latest` keeps all
# columns because a later cell reuses it.
latest = c_daily.loc[c_daily.groupby('CPLX_INCIDENT_ID')['REPORT_TO_DATE'].idxmax()]

# Slim, re-indexed frame with only the columns needed for the acreage analysis.
latest_acres = (
    latest[['CPLX_INCIDENT_ID', 'REPORT_TO_DATE', 'ACRES']]
    .reset_index(drop=True)
)

# Preview the first rows.
print(latest_acres.head())

# Box plot of ACRES, one value per complex incident.
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(x=latest_acres['ACRES'], ax=ax)
ax.set_title('ACRES at latest report, per complex incident')
ax.set_xlabel('ACRES')
# No y-axis label: a horizontal box plot has no frequency axis, so the
# previous 'Frequency' label was misleading.
ax.grid(True)
plt.show()

In [None]:
# Ten largest incidents: order latest_acres by ACRES, largest first,
# and keep the first ten rows.
top_10_biggest_fires = (
    latest_acres
    .sort_values('ACRES', ascending=False)
    .iloc[:10]
)

# Print just the incident id and its acreage.
print(top_10_biggest_fires.loc[:, ['CPLX_INCIDENT_ID', 'ACRES']])

In [None]:
# Five-bin histogram of ACRES, one value per complex incident.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(latest_acres['ACRES'], bins=5, ax=ax)
ax.set_xlabel('ACRES')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Distinct values that occur in the SUPPRESSION_STRATEGY column.
c_daily.SUPPRESSION_STRATEGY.unique()

In [None]:
# Distribution of SUPPRESSION_STRATEGY over `latest`, i.e. one row per
# complex incident (its most recent report).
fig, ax = plt.subplots(figsize=(10, 4))
# NOTE(review): if SUPPRESSION_STRATEGY holds text categories, `bins=30`
# presumably has no effect on the result — confirm before removing it.
sns.histplot(latest['SUPPRESSION_STRATEGY'], bins=30, ax=ax)
ax.set_title('SUPPRESSION_STRATEGY at latest report, per complex incident')
ax.set_xlabel('SUPPRESSION_STRATEGY')
ax.set_ylabel('Count')
ax.grid(True)
plt.show()

In [None]:
# Peek at the first five rows of i_ref.
i_ref.head()

In [None]:
# Number of distinct values in every i_ref column, printed one per line.
unique_counts = i_ref.nunique()

for position, field in enumerate(unique_counts.index):
    print(f"{field}: {unique_counts.iloc[position]} ")