# Data Preparation

### In this notebook, we transform the data into the format required for our later prediction analysis.

### Install some required packages

In [None]:
!pip install s3fs
!pip install natsort
!pip install geopy

### Import packages

In [None]:
import datetime
import os
import random
import time
import warnings
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from datetime import date, timedelta

import numpy as np
import pandas as pd
from geopy.distance import geodesic
from natsort import natsorted
from scipy import stats
from sklearn.metrics import pairwise_distances

warnings.filterwarnings("ignore")

### Global variables

In [None]:
# Directory where the CSV outputs of this notebook are written.
RESULTS = "../results"
# Create it up front so the `to_csv` calls further down do not fail on a fresh checkout.
os.makedirs(RESULTS, exist_ok=True)

### Utility functions

In [None]:
def custom_distance(a, b, unit="km"):
    """
    Calculates the geodesic distance between the two points a and b.

    Arguments:
        a: a point defined by its latitude and longitude (lat_a, lon_a)
        b: a point defined by its latitude and longitude (lat_b, lon_b)
        unit: the desired unit of the calculated distance: "km", "miles" or "m" (default: km)

    return: the geodesic distance between a and b in the desired unit.

    Raises:
        ValueError: if the unit is not supported, or if the distance cannot be computed
            from the given points.
    """
    supported_units = ("km", "miles", "m")
    # Reject unknown units explicitly instead of silently returning None.
    if unit not in supported_units:
        raise ValueError(
            f"Unsupported unit {unit!r}: expected one of {supported_units}."
        )

    try:
        distance = geodesic(a, b)
    except Exception as error:
        # `Exception` (not a bare except) so that KeyboardInterrupt/SystemExit still propagate.
        raise ValueError(
            "Points must be defined as tuple or list: (latitude, longitude) or [latitude, longitude]!"
        ) from error

    # The unit names match the attributes read from the geodesic result (.km, .miles, .m).
    return getattr(distance, unit)

In [None]:
def retain_stations_per_fire(
    fires_names,
    stations_names,
    fires_stations_distances,
    proximity=10.0,
    num_stations_retained=2,
):
    """
    Selects, for each fire, the closest stations located within a given radius.

    Row i of the distance array is matched with fires_names[i] and column j with
    stations_names[j]. A fire is kept only if at least `num_stations_retained`
    stations lie within `proximity`.

    Arguments:
        fires_names: list of the fires names
        stations_names: list of the stations names
        fires_stations_distances: array containing the distances between the fires and the stations (in km)
        proximity: the maximum distance a station must be from a fire to be considered (in km)
        num_stations_retained: the number of closest stations kept for each fire

    return: the dictionary mapping each kept fire to the names of its retained
        stations, ordered from the closest to the farthest.
    """
    retained = {}

    for row_idx in range(len(fires_stations_distances)):
        row = fires_stations_distances[row_idx]

        # positions of the stations lying within the allowed radius
        nearby = np.nonzero(row <= proximity)[0]

        # not enough close stations: this fire is left out
        if len(nearby) < num_stations_retained:
            continue

        # rank the nearby stations by ascending distance and keep the closest ones
        ranking = np.argsort(row[nearby])
        closest = nearby[ranking][:num_stations_retained]

        retained[fires_names[row_idx]] = [stations_names[i] for i in closest]

    return retained

In [None]:
def get_station_data_from_fire_dates(fire, station):
    """
    Get the data of the given station for the period when the given fire happened.

    Reads the notebook-level frames `fire_data` and `data_2016` ... `data_2020`.

    Arguments:
        fire: the name of the fire to be considered
        station: the name of the station to be considered

    Return:
        A dataframe with the station rows whose day lies between the fire start day and
        the day after the fire end day (both included), sorted by datetime, without the
        first 12 rows.

    Raises:
        IndexError: if the fire is not present in `fire_data`.
        KeyError: if the fire period covers a year for which no station frame is registered.
    """

    # First matching row of the fire; dates are read by column name rather than by position.
    fire_row = fire_data[fire_data["Fire"] == fire].iloc[0]

    start_date = pd.Timestamp(fire_row["StartedDate"])
    # The end of the period is shifted by one day.
    end_date = pd.Timestamp(fire_row["EndedDate"]) + datetime.timedelta(days=1)
    # Express the end date in UTC whether it is tz-naive or tz-aware.
    if end_date.tzinfo is None:
        end_date = end_date.tz_localize("UTC")
    else:
        end_date = end_date.tz_convert("UTC")

    stations_dic = {
        "2016": data_2016,
        "2017": data_2017,
        "2018": data_2018,
        "2019": data_2019,
        "2020": data_2020,
        # NOTE(review): 2021 is mapped to the 2020 frame (no 2021 frame is loaded in the
        # visible notebook) — confirm this fallback is intended.
        "2021": data_2020,
    }

    # Collect the frame of every year covered by the fire, so that a fire spanning
    # several years keeps the rows of all of them (not only of the end year).
    first_year, last_year = sorted((start_date.year, end_date.year))
    yearly_frames = []
    for year in range(first_year, last_year + 1):
        yearly = stations_dic[str(year)]
        # the same frame can be registered under two years: use it only once
        if not any(yearly is seen for seen in yearly_frames):
            yearly_frames.append(yearly)

    # Filter on the station first: only the (small) selection is copied by concat.
    df_station = pd.concat(
        [yearly[yearly["Station"] == station] for yearly in yearly_frames]
    )
    df_station["datetime"] = pd.to_datetime(df_station["datetime"])

    # filter the data on the period when the fire had happened (day-level comparison)
    days = df_station["datetime"].dt.date
    in_fire_period = (days >= start_date.date()) & (days <= end_date.date())
    df_station = (
        df_station[in_fire_period].sort_values("datetime").reset_index(drop=True)
    )

    # we consider that the fire had not started before 12PM (we consider only natural cause,
    # so when the fire starts because of the heat): the first 12 rows are dropped
    # NOTE(review): this assumes one row per hour starting at midnight — confirm against the data.
    df_station = df_station.iloc[12:].reset_index(drop=True)

    return df_station

In [None]:
def add_fire_crticity(dataframe, fire_info, acres_burnt_threshold=10000):
    """
    Add the number of acres burnt by the fires, and their associated criticity (fire category) with regard to the considered threshold:

        - Fires that burn at least the considered threshold of acres are considered critical (class "1")
        - Fires that burn less than the considered threshold of acres are considered non-critical (class "-1")

    For each fire, the total number of acres burnt is spread evenly over the time steps
    following the first one, weighted by the wind speed (mean of the two stations divided
    by the mean wind speed of the fire) and accumulated over time. The first time step of
    each station (beginning of the fire) is dropped.

    Arguments:
        dataframe: dataframe with the information of the fires and their associated stations
            (columns "fire", "station", "datetime" and "wind_speed" are read)
        fire_info: dictionary giving, for each fire, a dictionary with its "acres_burnt"
        acres_burnt_threshold: the considered threshold of acres burnt

    Return:
        The dataframe updated with the cumulative number of acres burnt and the associated
        category of the fire at this moment, for each row. Fires with fewer than two time
        steps contribute no row.

    Raises:
        KeyError: if a fire of the dataframe is missing from fire_info.
        ValueError: if the two stations of a fire do not have the same number of time steps.
    """
    station_labels = ["station_1", "station_2"]
    cols = list(dataframe.columns) + ["acres_burnt", "category"]
    pieces = []

    # The iteration order is irrelevant: the output is sorted at the end.
    for fire in dataframe["fire"].unique():
        fire_rows = (
            dataframe[dataframe["fire"] == fire]
            .drop_duplicates(subset=["fire", "station", "datetime"])
            .sort_values("datetime")
        )
        per_station = {
            label: fire_rows[fire_rows["station"] == label] for label in station_labels
        }

        num_steps = len(per_station["station_1"])
        if len(per_station["station_2"]) != num_steps:
            raise ValueError(
                f"Stations of fire {fire!r} do not have the same number of time steps."
            )
        # with fewer than two time steps nothing is left once time step 0 is dropped
        if num_steps < 2:
            continue

        # wind speeds registered during the same period, averaged over the two stations
        # and normalized by the mean wind speed of the fire
        mean_speed = np.mean(
            [per_station[label]["wind_speed"].to_numpy() for label in station_labels],
            axis=0,
        )
        # the first value corresponds to the beginning of the fire and is removed
        weights = (mean_speed / fire_rows["wind_speed"].mean())[1:]

        # average surface burnt per time step, then weighted and accumulated over time
        mean_acres_burnt_per_hour = fire_info[fire]["acres_burnt"] / (num_steps - 1)
        cumulative_acres_burnt = np.cumsum(mean_acres_burnt_per_hour * weights)

        for label in station_labels:
            # we drop the first row as it corresponds to the beginning of the fire (time step 0)
            station_rows = per_station[label].iloc[1:].reset_index(drop=True)
            station_rows["acres_burnt"] = cumulative_acres_burnt
            # criticity of the fire at each time step
            station_rows["category"] = [
                "-1" if burnt < acres_burnt_threshold else "1"
                for burnt in cumulative_acres_burnt
            ]
            pieces.append(station_rows[cols])

    if not pieces:
        return pd.DataFrame(columns=cols)

    # single concatenation, then format the final output
    return (
        pd.concat(pieces)[cols]
        .sort_values(["fire", "station", "datetime"])
        .reset_index(drop=True)
    )

In [None]:
def add_fire_duration(dataframe):
    """
    Add the elapsed time since the beginning of the fire until the current time step for each row.

    Only the rows of "station_1" and "station_2" are kept. Duplicated
    (fire, station, datetime) rows are dropped, and the rows of each (fire, station)
    pair are numbered 1, 2, ... by ascending datetime.

    Arguments:
        dataframe: dataframe with the information of the fires and their associated stations
            (it must contain the columns "fire", "station", "datetime" and "category")

    Return:
        The dataframe sorted by fire, station and datetime, with the new column
        "duration_in_hours" placed just before "category".
    """
    station_labels = ["station_1", "station_2"]
    sort_keys = ["fire", "station", "datetime"]
    cols = [c for c in dataframe.columns if c != "category"] + [
        "duration_in_hours",
        "category",
    ]

    ordered = (
        dataframe[dataframe["station"].isin(station_labels)]
        .drop_duplicates(subset=sort_keys)
        .sort_values(sort_keys)
    )
    # rows are already ordered by datetime inside each (fire, station) group,
    # so their rank in the group is the number of elapsed time steps
    ordered["duration_in_hours"] = ordered.groupby(["fire", "station"]).cumcount() + 1

    return ordered[cols].reset_index(drop=True)

# Load and explore the data

#### Solar data

In [None]:
# 2016 NSRDB California file; its timestamps are labelled as UTC right after loading.
data_2016 = pd.read_feather(
    "s3://data.atoti.io/notebooks/ca-solar/nsrdb_2016_California_20UTC_GHI.feather"
).assign(datetime=lambda frame: frame["datetime"].dt.tz_localize("UTC"))

print(f"Data size: {len(data_2016)}\n\n")
data_2016.head(2)

In [None]:
# 2017 NSRDB California file; its timestamps are labelled as UTC right after loading.
data_2017 = pd.read_feather(
    "s3://data.atoti.io/notebooks/ca-solar/nsrdb_2017_California_20UTC_GHI.feather"
).assign(datetime=lambda frame: frame["datetime"].dt.tz_localize("UTC"))

print(f"Data size: {len(data_2017)}\n\n")
data_2017.head(2)

In [None]:
# 2018 NSRDB California file; its timestamps are labelled as UTC right after loading.
data_2018 = pd.read_feather(
    "s3://data.atoti.io/notebooks/ca-solar/nsrdb_2018_California_20UTC_GHI.feather"
).assign(datetime=lambda frame: frame["datetime"].dt.tz_localize("UTC"))

print(f"Data size: {len(data_2018)}\n\n")
data_2018.head(2)

In [None]:
# 2019 NSRDB California file; its timestamps are labelled as UTC right after loading.
data_2019 = pd.read_feather(
    "s3://data.atoti.io/notebooks/ca-solar/nsrdb_2019_California_20UTC_GHI.feather"
).assign(datetime=lambda frame: frame["datetime"].dt.tz_localize("UTC"))

print(f"Data size: {len(data_2019)}\n\n")
data_2019.head(2)

In [None]:
# 2020 NSRDB California file; unlike the previous years its timestamps are converted
# (tz_convert) to UTC instead of being localized.
data_2020 = pd.read_feather(
    "s3://data.atoti.io/notebooks/ca-solar/nsrdb_2020_California_20UTC_GHI.feather"
).assign(datetime=lambda frame: frame["datetime"].dt.tz_convert("UTC"))

print(f"Data size: {len(data_2020)}\n\n")
data_2020.head(2)

### Stations data

In [None]:
# Station reference table (the "Station", "Latitude" and "Longitude" columns are used below).
stations_path = "s3://data.atoti.io/notebooks/ca-solar/nsrdb_station_lat_lon.feather"
stations = pd.read_feather(stations_path)

print(f"Data size: {len(stations)}\n\n")
stations.head(2)

### Fires data

In [None]:
# Fire records; both date columns are parsed and then labelled as UTC.
fire_data = pd.read_feather(
    "s3://data.atoti.io/notebooks/ca-solar/fire_data.feather"
).assign(
    StartedDate=lambda frame: frame["StartedDate"]
    .apply(pd.to_datetime)
    .dt.tz_localize("UTC"),
    EndedDate=lambda frame: frame["EndedDate"]
    .apply(pd.to_datetime)
    .dt.tz_localize("UTC"),
)

print(f"Data size: {len(fire_data)}\n\n")
fire_data.head(2)

In [None]:
# Fire locations (the "Fire", "Latitude" and "Longitude" columns are used below).
fire_loc_path = "s3://data.atoti.io/notebooks/ca-solar/fire_loc.feather"
fire_loc = pd.read_feather(fire_loc_path)

print(f"Data size: {len(fire_loc)}\n\n")
fire_loc.head(2)

## Station to fire proximity

#### Running this cell could take many minutes...

In [None]:
%%time
X = fire_loc[["Latitude", "Longitude"]]
Y = stations[["Latitude", "Longitude"]]

distances = pairwise_distances(X, Y, metric=custom_distance, n_jobs=-1)

print(f"Distance matrix size: {distances.shape}")

### Save the distances

In [None]:
# Label the distance matrix with the fire names (rows) and station names (columns)
# before writing it to disk.
distances_df = pd.DataFrame(
    distances,
    index=fire_loc["Fire"].tolist(),
    columns=stations["Station"].tolist(),
)
distances_path = os.path.join(RESULTS, "fires-stations-distances.csv")
distances_df.to_csv(distances_path)

## Filter the stations
#### For each fire, we consider:
#### - Only the stations located under a certain distance (let's consider this distance to be 10 kms here);
#### - Only the two closest stations

In [None]:
# The names must follow the row order of `fire_loc` and `stations`: this is the order of
# the rows and columns of `distances`. Sorting the names here would pair each distance
# with the wrong fire/station whenever the frames are not already sorted.
fires_names = fire_loc["Fire"].tolist()
stations_names = stations["Station"].tolist()
proximity = 10
num_stations_retained = 2

fire_stations = retain_stations_per_fire(
    fires_names, stations_names, distances, proximity, num_stations_retained
)

print(
    f"Number of fires retained (with stations far from less than {proximity} km): {len(fire_stations)}"
)

## Filter the stations data

#### For each fire and each retained station (close to the fire), we consider only the data corresponding to the period when the fire happened.

#### For example, if a fire lasted 2 days, we would consider the data of the concerned stations only for these 2 days.

#### Running this cell could take many minutes...

In [None]:
%%time
stations = Parallel(n_jobs=-1, prefer="threads", temp_folder=RESULTS)(
    delayed(get_station_data_from_fire_dates)(fire, station)
    for fire in fires_samples
    for station in fire_stations[fire]
)

### Create the dataframe with the selected fires and associated stations

#### Running this cell could take many minutes...

In [None]:
%%time

fires = natsorted(list(fire_stations.keys()))
df = pd.DataFrame()
lenghts = []  # Number of rows for each fire

for fire in fires:
    for idx in range(len(fire_stations[fire])):  # for station in fire_stations[fire]:
        d = get_station_data_from_fire_dates(
            fire, fire_stations[fire][idx]
        )  # d = get_station_data_from_fire_dates(fire, station)
        d["fire"] = [fire] * len(d)
        d["station"] = ["station_" + str(idx + 1)] * len(
            d
        )  # d['station'] = [station] * len(d)
        df = pd.concat([df, d])
        lenghts.append(len(d))

first_cols = ["fire", "station"]
cols = first_cols + [c for c in df.columns if c not in first_cols and c != "Station"]
df = df[cols].reset_index(drop=True)

print(f"Size of the filtered dataset: {df.shape}")

### Save the filtered dataset

In [None]:
# Write the filtered dataset (without the index) to the results directory.
filtered_dataset_path = os.path.join(RESULTS, "fires-stations-filtered-dataset.csv")
df.to_csv(filtered_dataset_path, index=False)

### Number of stations for each fire

In [None]:
# Number of stations retained for each fire.
num_stations_per_fire = {
    fire: len(retained_stations) for fire, retained_stations in fire_stations.items()
}

In [None]:
# Deciles from 0% to 100%. `linspace` yields exactly 11 levels ending at 1.0, whereas a
# float-step `arange` depends on rounding. `quantiles` is reused by the fire-duration cell.
quantiles = list(np.linspace(0, 1, 11))
values = list(np.quantile(list(num_stations_per_fire.values()), quantiles))

for quantile, value in zip(quantiles, values):
    # round() rather than int(): a level such as 0.29 * 100 would be truncated to 28
    print(f"Quantile {int(round(quantile * 100))}%: {int(value)}\n")

In [None]:
# How many fires have each number of retained stations.
dic = dict(Counter(num_stations_per_fire.values()))
dic

So, we have retained 1215 fires, each with exactly 2 "close" stations.

### Duration of the fires

In [None]:
# Duration of each fire in whole days (end date minus start date).
elapsed_per_fire = fire_data["EndedDate"].apply(pd.to_datetime) - fire_data[
    "StartedDate"
].apply(pd.to_datetime)
fire_duration = [elapsed.days for elapsed in elapsed_per_fire]

# `quantiles` comes from the cell computing the quantiles of the number of stations
values = list(np.quantile(fire_duration, quantiles))

print("Distribution of the durations of the fires (in days):\n")
for quantile, value in zip(quantiles, values):
    print(f"Quantile {int(quantile*100)}%: {int(value)}\n")

### Reload the fires and stations data

In [None]:
# Read back the filtered dataset saved earlier in the notebook.
filtered_dataset_path = os.path.join(RESULTS, "fires-stations-filtered-dataset.csv")
df = pd.read_csv(filtered_dataset_path)

print(f"Size of the data: {df.shape}\n\n")
df.head()

In [None]:
accres_burnt = list(fire_data["AcresBurned"])
# 101 evenly spaced levels: 0%, 1%, ..., 100%
quantiles = list(np.linspace(0, 1, 101))
values = list(np.quantile(accres_burnt, quantiles))

print("Distribution of the acres burnt:\n")
for quantile, value in zip(quantiles, values):
    # round() rather than int(): in floating point 0.29 * 100 is 28.999999999999996,
    # which int() would print as 28%
    print(f"Quantile {int(round(quantile * 100))}%: {int(value)}\n")

## Hypotheses:

#### We consider:
#### - The fires with surrounding stations at most 10 km away;
#### - The fires with at least two stations within a 10 km distance;
#### - The fires that lasted at most 120 days (4 months);
#### - Exactly two stations for each retained fire. We choose the two closest ones when there are more than two stations;
#### - The threshold of 10 000 acres burnt to categorize the fires: class -1 corresponds to fires that burnt less than 10 000 acres, class 1 to the others.

### Filter the fires and add their category

In [None]:
max_duration = 120  # 4 months
acres_burnt_threshold = 10000

# Work on a copy of the fire records with both date columns parsed.
fires_retained = fire_data.assign(
    EndedDate=lambda frame: frame["EndedDate"].apply(pd.to_datetime),
    StartedDate=lambda frame: frame["StartedDate"].apply(pd.to_datetime),
)
# Duration in whole days between the start and the end of each fire.
fires_retained["duration_in_days"] = [
    elapsed.days
    for elapsed in fires_retained["EndedDate"] - fires_retained["StartedDate"]
]
# Category of each fire: "-1" below the acres threshold, "1" otherwise.
fires_retained["category"] = [
    "-1" if acres < acres_burnt_threshold else "1"
    for acres in fires_retained["AcresBurned"]
]
# Keep the fires that are short enough and that have retained stations.
is_short_enough = fires_retained["duration_in_days"] <= max_duration
has_stations = fires_retained["Fire"].isin(list(fire_stations.keys()))
fires_retained = fires_retained[is_short_enough & has_stations]

print(f"Number of fires retained: {len(fires_retained)}\n\n")
fires_retained.head()

In [None]:
# Map each retained fire to its number of acres burnt and its category.
acres_burnt_and_caterogies = dict(
    zip(
        fires_retained["Fire"].tolist(),
        (
            {"acres_burnt": acres_burnt, "category": category}
            for acres_burnt, category in zip(
                fires_retained["AcresBurned"].tolist(),
                fires_retained["category"].tolist(),
            )
        ),
    )
)
# Show five fires picked at random.
for k in random.sample(list(acres_burnt_and_caterogies), 5):
    print(f"{k}: {acres_burnt_and_caterogies[k]}")

### Add the number of acres burnt and the category of the fire for each row

In [None]:
%%time
df = df[df["fire"].isin(set(fires_retained["Fire"]))]
df = add_fire_crticity(df, acres_burnt_and_caterogies)
df = add_fire_duration(df)

print(f"Size of the data: {len(df)}\n\n")
df.head()

In [None]:
# Last rows of `df` after the acres burnt, category and duration columns were added.
df.tail()

In [None]:
# Number of rows in each category ("-1" / "1").
df["category"].value_counts()

In [None]:
# Same counts expressed as proportions of the rows.
df["category"].value_counts(normalize=True)

## Save the final dataset

In [None]:
# Write the final dataset (without the index) to the results directory.
final_dataset_path = os.path.join(RESULTS, "fires-stations-final-dataset.csv")
df.to_csv(final_dataset_path, index=False)