In [61]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from itertools import product

In [2]:
# Load the input CSV; its first column becomes the DataFrame's row index.
df = pd.read_csv(
    "data_gcs_combine_update_district.csv",
    index_col=0,
)

In [3]:
# Parse the 'date' column into pandas datetimes (re-running this cell is harmless).
df = df.assign(date=pd.to_datetime(df['date']))

In [41]:
def precompute_distances_and_weights_with_threshold(stations, threshold):
    """
    Precompute pairwise geodesic distances and inverse-square weights between stations.

    Every ordered pair of distinct stations is produced (so both A->B and B->A
    appear), pairs farther apart than ``threshold`` are discarded, and each
    remaining pair gets the weight ``1 / distance**2``.

    Parameters:
        stations (pd.DataFrame): DataFrame containing 'region', 'latitude', and 'longitude'.
            Exact duplicate rows are ignored, and rows with a missing region or
            coordinate are skipped because they cannot be paired.
        threshold (float): Distance threshold in kilometers (inclusive).

    Returns:
        pd.DataFrame: Filtered distance matrix with columns
            ['station1', 'station2', 'distance', 'weight']. The frame is empty
            (but still has these columns) when no pair qualifies.

    Raises:
        ValueError: If one region is listed with more than one distinct
            (latitude, longitude) pair, since its position would be ambiguous.
    """
    # Keep only rows that can actually be located, and collapse exact repeats.
    located = (
        stations[['region', 'latitude', 'longitude']]
        .dropna(subset=['region', 'latitude', 'longitude'])
        .drop_duplicates()
    )

    # After removing exact repeats, a repeated region means conflicting coordinates.
    conflicting = located.loc[located['region'].duplicated(keep=False), 'region'].unique()
    if len(conflicting) > 0:
        raise ValueError(
            "Each region must map to a single (latitude, longitude) pair; "
            f"conflicting coordinates found for: {sorted(map(str, conflicting))}"
        )

    # Plain (lat, lon) tuples are cheap to look up and are what geodesic() expects.
    coords = {
        region: (lat, lon)
        for region, lat, lon in zip(
            located['region'], located['latitude'], located['longitude']
        )
    }

    # Create all ordered station pairs, then remove self-pairs.
    # .copy() gives an independent frame so the column assignments below are
    # not made on a slice of another DataFrame.
    station_pairs = pd.DataFrame(
        list(product(located['region'], repeat=2)),
        columns=['station1', 'station2'],
    )
    station_pairs = station_pairs[
        station_pairs['station1'] != station_pairs['station2']
    ].copy()

    # The distance A->B equals B->A, so evaluate each unordered pair only once.
    distance_cache = {}

    def _distance_km(first, second):
        """Return the cached geodesic distance in km between two regions."""
        key = frozenset((first, second))
        if key not in distance_cache:
            distance_cache[key] = geodesic(coords[first], coords[second]).km
        return distance_cache[key]

    # An explicit float Series keeps the dtype numeric even when there are no pairs.
    station_pairs['distance'] = pd.Series(
        [
            _distance_km(first, second)
            for first, second in zip(station_pairs['station1'], station_pairs['station2'])
        ],
        index=station_pairs.index,
        dtype=float,
    )

    # Filter by distance threshold
    station_pairs = station_pairs[station_pairs['distance'] <= threshold].copy()

    # Calculate weights (1/distance^2). Distinct stations that share the same
    # coordinates have distance 0; flooring the distance at 1 mm (1e-6 km) avoids
    # a division by zero and gives such neighbours a dominant but finite weight.
    # The reported 'distance' column itself is left untouched.
    min_distance_km = 1e-6
    station_pairs['weight'] = 1.0 / (
        station_pairs['distance'].clip(lower=min_distance_km) ** 2
    )

    return station_pairs


def fill_nulls_with_weighted_average(df, fields, distance_matrix):
    """
    Fill null values in specific fields using a precomputed distance matrix with weights.

    For every row where a field is null, the replacement is the weighted mean
    ``sum(weight * value) / sum(weight)`` over the neighbouring stations (as
    listed in ``distance_matrix``) that have a non-null value for that field on
    the same date. Only values present in the input are used as donors; filled
    values never feed into other fills. Rows with no such neighbour stay null.

    Parameters:
        df (pd.DataFrame): Input DataFrame with 'date', 'region', and the fields to fill.
            The row index may be non-unique; rows are addressed by position.
        fields (list): List of column names of the fields with missing values to fill.
        distance_matrix (pd.DataFrame): Precomputed distance matrix with filtered pairs
            and weights; must contain 'station1', 'station2' and 'weight'.

    Returns:
        pd.DataFrame: Modified copy of the input DataFrame with null values filled.
            The input DataFrame itself is not modified.
    """
    # Make a copy of the DataFrame to avoid modifying the original
    df_filled = df.copy()

    neighbours = distance_matrix[['station1', 'station2', 'weight']]

    # Identify rows by position (0..n-1) rather than by index label: with
    # repeated labels, a label-based write would also overwrite other rows.
    keys = df_filled[['date', 'region']].reset_index(drop=True)
    # Rows without a date or region can neither receive nor donate a value.
    has_key = keys.notna().all(axis=1)

    for field in fields:
        values = df_filled[field].reset_index(drop=True)
        is_missing = values.isna()

        target_mask = is_missing & has_key
        donor_mask = ~is_missing & has_key
        if not target_mask.any() or not donor_mask.any():
            continue

        # Rows to fill, carrying their position in the '_row' column.
        targets = keys[target_mask].rename_axis('_row').reset_index()
        # Observed values, keyed so they line up with the matrix's 'station2'.
        donors = (
            keys[donor_mask]
            .assign(_value=values[donor_mask].astype(float))
            .rename(columns={'region': 'station2'})
        )

        # target row -> its neighbours -> neighbours' observed value on the same date
        candidates = targets.merge(
            neighbours, left_on='region', right_on='station1'
        ).merge(donors, on=['date', 'station2'])
        if candidates.empty:
            continue

        # Calculate weighted average using precomputed weights
        candidates['_weighted'] = candidates['weight'] * candidates['_value']
        totals = candidates.groupby('_row')[['_weighted', 'weight']].sum()
        estimates = totals['_weighted'] / totals['weight']
        # A zero or infinite total weight gives no usable estimate; leave those null.
        estimates = estimates[np.isfinite(estimates)]
        if estimates.empty:
            continue

        column_position = df_filled.columns.get_loc(field)
        df_filled.iloc[estimates.index.to_numpy(), column_position] = estimates.to_numpy()

    return df_filled

In [12]:
# One row per distinct (region, latitude, longitude) combination present in df.
stations = df.loc[:, ['region', 'latitude', 'longitude']].drop_duplicates()

In [15]:
# Columns whose missing values are to be filled.
# NOTE(review): 'Prevailing Wind Direction (°)' is an angle; a plain weighted mean of
# degrees does not wrap around 0°/360° — confirm that is acceptable for this column.
fields_to_fill = [
    'Mean Pressure (hPa)',
    'Total Rainfall (mm)',
    'Mean Relative Humidity (%)',
    'Maximum Temperature (°C)',
    'Minimum Temperature (°C)',
    'Mean Temperature (°C)',
    'Prevailing Wind Direction (°)',
    'Mean Wind Speed (km/h)',
]

In [None]:
threshold = 30  # Maximum station-to-station distance, in kilometers, for two stations to count as neighbours

In [42]:
# Build the neighbour table (pairs within `threshold` km and their 1/distance^2 weights).
distance_matrix = precompute_distances_and_weights_with_threshold(
    stations=stations,
    threshold=threshold,
)

In [45]:
# Fill the gaps in the selected columns; `df` itself is left unchanged.
filled_df = fill_nulls_with_weighted_average(
    df=df,
    fields=fields_to_fill,
    distance_matrix=distance_matrix,
)

In [46]:
# Persist the filled table (row index included) next to the input file.
filled_df.to_csv(
    path_or_buf="data_gcs_combine_update_district_fillna.csv",
)