# Notebook 6: Fill missing values
- Fills null values with a distance-weighted average of nearby stations, using multiprocessing

In [1]:
# data manipulation
import pandas as pd
import numpy as np
from itertools import product
# calculate distance from GCS data (latitude and longitude)
from geopy.distance import geodesic
# multi process
from multiprocessing import Pool
# Import the function inside .py
from fill_nulls_multiprocess import process_date_subset  

In [2]:
# Load the combined dataset in which each region has already been mapped to a district.
# index_col=0: the first CSV column is used as the DataFrame index.
df = pd.read_csv("data_gcs_combine_update_district.csv", index_col=0)

In [3]:
def precompute_distances_with_geopy(stations, threshold):
    """
    Precompute the pairwise distances and weights (1/distance^2) between stations,
    filter pairs based on the distance threshold, and remove self-pairs.

    Every ordered pair of distinct stations is measured with
    ``geopy.distance.geodesic``, so both (a, b) and (b, a) appear in the result.

    Parameters:
        stations (pd.DataFrame): DataFrame containing 'region', 'latitude', and 'longitude'.
            Each 'region' value must occur exactly once.
        threshold (float): Distance threshold in kilometers (inclusive).

    Returns:
        pd.DataFrame: Filtered distance matrix with columns ['station1', 'station2', 'distance', 'weight'].
            Rows follow the order of the full station x station product and keep
            their position in that product as index label (so the index is not
            contiguous). When no pair qualifies (including fewer than two
            stations) an empty frame with the same columns is returned.

    Raises:
        ValueError: If a region name appears more than once in ``stations``.
        ZeroDivisionError: If two distinct stations within the threshold are
            exactly 0 km apart, because 1/distance^2 is undefined for them.
    """
    columns = ['station1', 'station2', 'distance', 'weight']

    # A repeated region name would make the coordinate lookup ambiguous, so
    # fail early with a message that names the offending regions.
    repeated = stations['region'].duplicated(keep=False)
    if repeated.any():
        names_repeated = sorted({str(name) for name in stations.loc[repeated, 'region']})
        raise ValueError(
            f"'region' must be unique per station; duplicated: {names_repeated}"
        )

    names = stations['region'].tolist()
    # Plain (latitude, longitude) tuples avoid a DataFrame lookup per pair.
    coords = list(zip(stations['latitude'].tolist(), stations['longitude'].tolist()))
    n_stations = len(names)

    labels = []
    records = []
    for (i, first), (j, second) in product(enumerate(names), repeat=2):
        # Remove self-pairs
        if first == second:
            continue

        km = geodesic(coords[i], coords[j]).km

        # Filter pairs by the distance threshold
        if not km <= threshold:
            continue

        if km == 0:
            raise ZeroDivisionError(
                f"stations {first!r} and {second!r} are 0 km apart; "
                "the weight 1/distance^2 is undefined"
            )

        # Position of this pair in the full product, used as the row label.
        labels.append(i * n_stations + j)
        records.append((first, second, km, 1 / (km ** 2)))

    # Building the frame in one go avoids assigning columns on filtered slices
    # and also works when there are no pairs at all.
    station_pairs = pd.DataFrame(
        records, columns=columns, index=pd.Index(labels, dtype='int64')
    )
    return station_pairs.astype({'distance': 'float64', 'weight': 'float64'})

In [4]:
def parallel_fill_nulls(df, fields, distance_matrix, n_processes=4):
    """
    Fill null values using multiprocessing by splitting the DataFrame by date.

    Each distinct value of the 'date' column becomes one task
    ``(date, rows_for_that_date, fields, distance_matrix)`` that is handed to
    ``process_date_subset`` in a worker pool. The per-date results are
    concatenated in the order in which the dates first appear in ``df``.

    Parameters:
        df (pd.DataFrame): Input data with a 'date' column. It is not modified.
        fields (list): Passed through unchanged to ``process_date_subset``.
        distance_matrix (pd.DataFrame): Passed through unchanged to
            ``process_date_subset``.
        n_processes (int): Number of worker processes in the pool.

    Returns:
        pd.DataFrame: Concatenation of the per-date results with a fresh
            0..n-1 index. Rows whose 'date' is missing are not dispatched and
            therefore do not appear in the result. If there is nothing to
            dispatch, an empty frame with the columns of ``df`` is returned.
    """
    # A single groupby pass replaces one full boolean-mask scan per date.
    # sort=False keeps the dates in order of first appearance.
    args = [
        (date, subset, fields, distance_matrix)
        for date, subset in df.groupby('date', sort=False)
    ]

    # pd.concat raises on an empty list, so return early without starting a pool.
    if not args:
        return df.iloc[:0].reset_index(drop=True)

    # Use multiprocessing
    with Pool(n_processes) as pool:
        results = pool.map(process_date_subset, args)

    # Combine results
    return pd.concat(results, ignore_index=True)

In [5]:
# Build the station list: one row per distinct (region, latitude, longitude) combination.
# This frame is the input for the pairwise distance computation.
stations = df[['region', 'latitude', 'longitude']].drop_duplicates()

In [6]:
# Series whose nulls will be filled.
# A series that is available at only one station is left out.
fields_to_fill = [
    'Mean Pressure (hPa)',
    'Total Rainfall (mm)',
    'Mean Relative Humidity (%)',
    'Maximum Temperature (°C)',
    'Minimum Temperature (°C)',
    'Mean Temperature (°C)',
    'Prevailing Wind Direction (°)',
    'Mean Wind Speed (km/h)',
]

In [7]:
# Distance threshold in kilometers: station pairs farther apart than this
# are dropped when the distance matrix is built.
threshold = 30  

In [8]:
# Helper DataFrame of station pairs within the threshold, with their
# distance and the weight 1/distance^2.
distance_matrix = precompute_distances_with_geopy(stations, threshold)

In [9]:
# Fill the nulls of the selected series, one task per date, using 4 worker processes.
filled_df = parallel_fill_nulls(df, fields_to_fill, distance_matrix, n_processes=4)

In [10]:
# Save the filled dataset as CSV.
filled_df.to_csv("data_gcs_combine_update_district_fillna.csv")