This script processes large geospatial datasets to analyze missing values using a breadth-first search (BFS) approach. For each missing value, it calculates the distance to the nearest valid pixel. Using chunk-based processing, it then outputs a summary of the distance distribution (e.g., how many missing points are 1, 2, or more steps away from a valid pixel).

In [1]:
import pandas as pd
from collections import deque

In [2]:
# Number of CSV rows loaded per chunk (passed as `chunksize` to pd.read_csv
# in the processing cell). Each chunk is analysed on its own, so this value
# also limits which rows a nearest-pixel search can see.
chunk_size = 1000  # Adjust based on memory

# Define function to calculate distance
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the Manhattan (L1) distance between two coordinate pairs.

    The result is the sum of the absolute latitude difference and the
    absolute longitude difference, in the same units as the inputs.
    """
    lat_gap = abs(lat1 - lat2)
    lon_gap = abs(lon1 - lon2)
    return lat_gap + lon_gap

# Define function to find the nearest non-missing pixel
def find_nearest_pixel(df, feature, missing_lat, missing_lon):
    """Return the number of grid steps from a pixel to the nearest valid one.

    A breadth-first search starts at ``(missing_lat, missing_lon)`` and moves
    in steps of 0.5 along latitude or longitude. A pixel counts as valid when
    the first row of ``df`` with exactly those coordinates has a non-missing
    value in ``feature``.

    Args:
        df: DataFrame with ``'Latitude'``, ``'Longitude'`` and ``feature``
            columns.
        feature: Name of the column whose values are inspected.
        missing_lat: Latitude of the starting pixel.
        missing_lon: Longitude of the starting pixel.

    Returns:
        The step count (an int, 0 if the starting pixel itself is valid), or
        ``float('inf')`` when no valid pixel can be reached on the 0.5 grid.
    """
    # Index the frame once instead of re-scanning it for every visited node.
    # setdefault keeps the FIRST row seen for a coordinate, so duplicated
    # coordinates are judged by their first occurrence.
    first_seen = {}
    rows = zip(
        df['Latitude'].tolist(),
        df['Longitude'].tolist(),
        df[feature].notna().tolist(),
    )
    for lat, lon, has_value in rows:
        if lat != lat or lon != lon:
            # NaN coordinates can never equal a searched position.
            continue
        first_seen.setdefault((lat, lon), has_value)

    valid = {coord for coord, has_value in first_seen.items() if has_value}
    if not valid:
        # Nothing to find: without this guard the search below would have
        # no stopping condition other than its bounding box.
        return float('inf')

    # A shortest path on an open grid never needs to leave the rectangle
    # spanned by the start and the candidate targets, so restricting the
    # search to that rectangle guarantees termination without changing any
    # distance that can be found.
    lat_lo = min(min(lat for lat, _ in valid), missing_lat)
    lat_hi = max(max(lat for lat, _ in valid), missing_lat)
    lon_lo = min(min(lon for _, lon in valid), missing_lon)
    lon_hi = max(max(lon for _, lon in valid), missing_lon)

    start = (missing_lat, missing_lon)
    seen = {start}
    queue = deque([(missing_lat, missing_lon, 0)])  # Start with distance 0
    while queue:
        lat, lon, dist = queue.popleft()

        if (lat, lon) in valid:
            return dist  # Distance to the nearest non-missing pixel

        for d_lat, d_lon in ((-0.5, 0), (0.5, 0), (0, -0.5), (0, 0.5)):
            nxt_lat, nxt_lon = lat + d_lat, lon + d_lon
            if (nxt_lat, nxt_lon) in seen:
                continue
            # Comparisons with NaN are False, so a NaN start never expands.
            if not (lat_lo <= nxt_lat <= lat_hi and lon_lo <= nxt_lon <= lon_hi):
                continue
            seen.add((nxt_lat, nxt_lon))
            queue.append((nxt_lat, nxt_lon, dist + 1))

    return float('inf')  # No valid pixel is reachable on the grid

# Function to process missing values and calculate distances
def process_missing_values_with_distances(df, feature_columns):
    """Tally nearest-valid-pixel distances for every missing feature value.

    For each column in ``feature_columns``, every row of ``df`` whose value
    in that column is missing is passed to ``find_nearest_pixel``; the
    returned distances are counted across all columns together.

    Args:
        df: DataFrame with ``'Latitude'`` and ``'Longitude'`` columns plus
            the feature columns.
        feature_columns: Iterable of column names to inspect.

    Returns:
        A dict mapping each distance to the number of missing values found
        at that distance.
    """
    tally = {}

    for feature in feature_columns:
        rows_missing = df[df[feature].isna()]
        for _, record in rows_missing.iterrows():
            steps = find_nearest_pixel(
                df, feature, record['Latitude'], record['Longitude']
            )
            # Count this occurrence, starting from zero for a new distance.
            tally[steps] = tally.get(steps, 0) + 1

    return tally

In [None]:
# Stream the CSV chunk by chunk and accumulate one overall distance tally.
# Each chunk is analysed in isolation: the nearest-pixel search only sees
# the rows of the chunk it is given.
distance_summaries = {}

for chunk in pd.read_csv('C:/Users/T00701453/Downloads/combined26.csv', chunksize=chunk_size):
    # Coordinates must be floats so they compare cleanly with grid steps
    for coord_col in ('Latitude', 'Longitude'):
        chunk[coord_col] = chunk[coord_col].astype(float)

    # Every non-coordinate column is treated as a feature to inspect
    feature_columns = [
        col for col in chunk.columns if col not in ('Latitude', 'Longitude')
    ]

    # Per-chunk tally of distance -> number of missing values
    chunk_distances = process_missing_values_with_distances(chunk, feature_columns)

    # Fold the chunk's counts into the running totals
    for distance, count in chunk_distances.items():
        distance_summaries[distance] = distance_summaries.get(distance, 0) + count

# Report the totals in ascending distance order (infinity sorts last)
print("Summary of missing value distances:")
for distance, count in sorted(distance_summaries.items()):
    if distance != float('inf'):
        print(f"{count} data points have the closest pixels {distance} distance(s) away.")
    else:
        print(f"{count} data points have no nearby non-missing pixels.")