In [1]:
import pandas as pd
import numpy as np
from utils import check_duplicates

In [2]:
# Printable status label for the boolean result of a validation check.
checks = {
    True: "OK",
    False: "NOK",
}

#### Step 1. Read data

In [3]:
# Distance option used to match each fire to a city.
# Accepted values: "haversine" or "euclidean".
opt_dist = "haversine"

# External data, with its "Date" column parsed as datetimes.
external_data = pd.read_csv(
    "./data/2_clean/external_data.csv",
    parse_dates=["Date"],
)

# One row per distinct (name, latitude, longitude) combination.
city_names = external_data.loc[:, ["City_name", "Lat", "Lon"]].drop_duplicates()

# Fire records, with discovery and containment dates parsed as datetimes.
fires = pd.read_csv(
    "./data/2_clean/fires.csv",
    parse_dates=["DISCOVERY_DATE", "CONT_DATE"],
)

  interactivity=interactivity, compiler=compiler, result=result)


### Step 2. Compute the closest city (haversine or Euclidean distance)

##### Find the closest city, based on latitude and longitude

#### Implementations: Euclidean distance and haversine distance

In [4]:

def get_cities_location(city_names: pd.DataFrame, opt_dist: str = "haversine") -> dict:
    '''
    Return a dictionary with keys corresponding to cities' names and values corresponding to location.
    For haversine, values will be a tuple (latitude, longitude)
    For euclidean, values will be an array np.array((latitude, longitude))

    If the same city name appears on several rows, the last row wins.

    Input:
    city_names (pd.DataFrame) : DataFrame with the columns "City_name", "Lat" and "Lon".
    opt_dist (str) : The desired option to compute the distance. Options are "haversine" or "euclidean".

    Output:
    (dict) : dictionary with keys corresponding to cities' names and values corresponding to location.

    Raises:
    ValueError : if opt_dist is neither "haversine" nor "euclidean".

    '''
    # Fail loudly on a typo instead of silently returning None.
    if opt_dist not in ("haversine", "euclidean"):
        raise ValueError(f'Unknown opt_dist "{opt_dist}": expected "haversine" or "euclidean".')

    rows = zip(city_names["City_name"], city_names["Lat"], city_names["Lon"])
    if opt_dist == "haversine":
        return {name: (lat, lon) for name, lat, lon in rows}
    # euclidean: arrays so that the caller can subtract points directly
    return {name: np.array((lat, lon)) for name, lat, lon in rows}


def find_closest_city_euclidean(x: pd.Series, cities_loc: dict) -> list:
    '''
    Find the closest city from cities_loc based on the Latitude and Longitude, with the euclidean distance.

    The distance is computed directly on the (latitude, longitude) pair, so it is expressed
    in the same unit as the coordinates, not in kilometres.

    Input:
    x (pd.Series) : Raw input, with "LATITUDE" and "LONGITUDE" as attributes
    cities_loc (dict) : dictionary with keys corresponding to cities' names and values corresponding to
                        location, as np.array((latitude, longitude)).

    Output:
    (list) : [closest_city, dist], with
        closest_city (str) : name of the closest city, or None when no city is closer than the sentinel
                             (empty cities_loc, or coordinates that are NaN)
        dist (float) : distance from the city (the sentinel 9999999 when no city was selected)

    '''
    dist = 9999999 # arbitrary large value, same sentinel as the haversine variant
    # Must be bound before the loop: otherwise an empty cities_loc (or NaN distances,
    # which never compare lower than dist) makes the return statement raise UnboundLocalError.
    closest_city = None
    point = np.array((x["LATITUDE"], x["LONGITUDE"]))
    for city, target_city in cities_loc.items():
        new_dist = np.linalg.norm(point - target_city)
        if new_dist < dist:
            closest_city = city
            dist = new_dist
    return [closest_city, dist]


def haversine(lat1:float, lon1:float, lat2:float, lon2:float, earth_radius:int = 6371, p:float = 0.017453292519943295) -> float:
    '''
    Compute the Haversine (great-circle) distance between two points

    Input:
    lat1 (float) : latitude of point 1, in degrees
    lon1 (float) : longitude of point 1, in degrees
    lat2 (float) : latitude of point 2, in degrees
    lon2 (float) : longitude of point 2, in degrees
    earth_radius (int) : radius of the Earth (km); the result is in the same unit
    p (float) : value of pi / 180, the factor converting degrees to radians

    Output:
    (float) : haversine distance between the points

    '''
    hav = 0.5 - np.cos((lat2-lat1)*p)/2 + np.cos(lat1*p)*np.cos(lat2*p) * (1-np.cos((lon2-lon1)*p)) / 2
    # hav is mathematically within [0, 1]; rounding can push it marginally outside,
    # which would make sqrt/arcsin return NaN. NaN inputs still propagate as NaN.
    hav = np.clip(hav, 0.0, 1.0)
    return 2 * earth_radius * np.arcsin(np.sqrt(hav))


def find_closest_city_haversine(x: pd.Series, cities_loc: dict) -> list:
    '''
    Return the city of cities_loc that lies nearest to a record, using the haversine distance.

    Input:
    x (pd.Series) : Raw input, with "LATITUDE" and "LONGITUDE" as attributes
    cities_loc (dict) : dictionary with keys corresponding to cities' names and values corresponding to
                        location, as (latitude, longitude) pairs.

    Output:
    (list) : [closest_name, dist], with
        closest_name (str) : name of the closest city, or None if no city beat the initial sentinel
        dist (float) : haversine distance from that city (the sentinel 9999999 if none was selected)

    '''
    best_name, best_dist = None, 9999999 # sentinel: arbitrary large value
    for city, (city_lat, city_lon) in cities_loc.items():
        candidate = haversine(x["LATITUDE"], x["LONGITUDE"], city_lat, city_lon)
        # strict comparison: on ties the first city encountered is kept
        if candidate < best_dist:
            best_name, best_dist = city, candidate
    return [best_name, best_dist]


def find_closest_city(fires: pd.DataFrame, opt_dist: str, cities: pd.DataFrame = None) -> pd.DataFrame:
    '''
    Add 2 new columns to the fires DataFrame, 1 with the name of the closest city from a given location, and 1 with the distance from this city.

    The columns "CLOSEST_CITY" and "DISTANCE" are written into the DataFrame passed as `fires`
    (it is modified in place) and that same object is returned.

    Input:
    fires (pd.DataFrame) : input DataFrame with all fires records, with "LATITUDE" and "LONGITUDE" columns
    opt_dist (str) : The desired option to compute the distance. Options are "haversine" or "euclidean".
    cities (pd.DataFrame) : optional DataFrame with the columns "City_name", "Lat" and "Lon".
                            When omitted, the notebook-level `city_names` DataFrame is used.

    Output:
    (pd.DataFrame) : processed DataFrame

    Raises:
    ValueError : if opt_dist is neither "haversine" nor "euclidean".
    '''
    # Validate first: an unknown option used to return the DataFrame without the new columns.
    if opt_dist not in ("euclidean", "haversine"):
        raise ValueError(f'Unknown opt_dist "{opt_dist}": expected "haversine" or "euclidean".')

    # Backward-compatible fallback on the notebook-level variable when no cities are passed.
    if cities is None:
        cities = city_names

    # get a dictionary with cities and locations
    cities_loc = get_cities_location(cities, opt_dist)

    if opt_dist == "euclidean":
        finder = find_closest_city_euclidean
    else:
        finder = find_closest_city_haversine

    # around 11 mins for the execution for euclidean, and 23 mins for haversine
    fires[["CLOSEST_CITY", "DISTANCE"]] = fires.apply(lambda row: finder(row, cities_loc), axis=1, result_type="expand")

    return fires



In [5]:
# compute closest city: find_closest_city writes the CLOSEST_CITY and DISTANCE columns into `fires`
# and returns that same DataFrame, so `fires_city` and `fires` are the same object.
# Slow cell: the note inside find_closest_city reports ~11 mins for euclidean and ~23 mins for haversine.
fires_city = find_closest_city(fires, opt_dist)

In [6]:
# merge fires and external data: each fire keeps its row (left join) and receives the external
# columns of its closest city on its discovery date; the right-hand join keys are then dropped
# because they duplicate DISCOVERY_DATE / CLOSEST_CITY.
merge_data = (
    pd.merge(
        fires_city,
        external_data,
        how="left",
        left_on=["DISCOVERY_DATE", "CLOSEST_CITY"],
        right_on=["Date", "City_name"],
    )
    .drop(columns=["City_name", "Date"])
)

# check duplicate values on FOD_ID after the merge
# (check_duplicates comes from utils; presumably True means the check passed -- confirm there).
# Any result that is not a key of `checks` is reported as "NOK" rather than printed as a raw boolean.
c = checks.get(check_duplicates(merge_data, ["FOD_ID"]), "NOK")
print(f"Check duplicates: {c}")

Check duplicates: OK


In [7]:
# save the merged dataset without the index; the file name records which distance option produced it
merge_data.to_csv(f"./data/3_merge/merged_data_{opt_dist}.csv", index=False)