This script processes large geospatial datasets to handle missing values by replacing them with the average of up to 8 nearest valid pixels, found with a breadth-first search (BFS) over the latitude/longitude grid in steps of 0.5. The input is read in chunks for memory efficiency, so neighbors are searched only within the current chunk. Rows for which no valid neighbor is found within a search distance of 10 grid steps are removed. Outputs include a cleaned CSV file and a count of filled missing values.

In [1]:
import pandas as pd
from collections import deque
import numpy as np

In [2]:
# Number of CSV rows read per chunk; lower it if memory is tight
chunk_size = 1_000

# Define function to calculate average from neighbors
def calculate_average_from_neighbors(df, feature, missing_lat, missing_lon,
                                     max_distance=10, max_neighbors=8, step=0.5):
    """Average the nearest valid values of ``feature`` around a grid cell.

    A breadth-first search walks outwards from (``missing_lat``,
    ``missing_lon``) through the 4-connected grid (north/south/east/west),
    collecting non-missing values of ``feature`` in order of increasing
    number of steps.

    Args:
        df: DataFrame with 'Latitude', 'Longitude' and ``feature`` columns.
        feature: Name of the column whose values are averaged.
        missing_lat: Latitude of the cell the search starts from.
        missing_lon: Longitude of the cell the search starts from.
        max_distance: Largest number of grid steps from the start cell that
            is still examined.
        max_neighbors: Stop as soon as this many valid values are collected.
        step: Grid spacing, applied to both latitude and longitude.

    Returns:
        The mean of the collected values, or ``np.nan`` when no valid value
        lies within ``max_distance`` steps.
    """
    # Build the coordinate -> value lookup once, so each BFS step is a dict
    # access instead of a full scan of the DataFrame. setdefault keeps the
    # first row when the same coordinate appears more than once.
    lookup = {}
    for lat, lon, value in zip(df['Latitude'].tolist(),
                               df['Longitude'].tolist(),
                               df[feature].tolist()):
        lookup.setdefault((lat, lon), value)

    # Cells are tracked as integer offsets (in steps) from the start cell.
    # This keeps the visited set exact and avoids the drift that comes from
    # repeatedly adding and subtracting ``step`` on floats.
    visited = {(0, 0)}
    queue = deque([(0, 0, 0)])  # (lat offset, lon offset, distance in steps)
    neighbors = []

    while queue and len(neighbors) < max_neighbors:
        i, j, dist = queue.popleft()

        # Coordinates must match the DataFrame exactly to count as a hit.
        value = lookup.get((missing_lat + i * step, missing_lon + j * step), np.nan)
        if pd.notna(value):
            neighbors.append(value)

        # Do not expand past the search radius; this also bounds the search.
        if dist >= max_distance:
            continue

        for d_i, d_j in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
            cell = (i + d_i, j + d_j)
            # Mark on enqueue so a cell is never queued twice.
            if cell not in visited:
                visited.add(cell)
                queue.append((cell[0], cell[1], dist + 1))

    # Return the average of collected neighbors if any, else NaN
    if neighbors:
        return np.mean(neighbors)
    return np.nan

# Function to replace missing values with averages or mark for deletion
def replace_missing_values_with_average_or_delete(df, feature_columns):
    """Fill missing feature values from neighboring cells, dropping unfillable rows.

    For every column in ``feature_columns``, each missing value is replaced
    with the result of ``calculate_average_from_neighbors``. A row is removed
    when that result is NaN for at least one of its features.

    ``df`` is modified in place and also returned. Values are written back
    as they are computed, so a value filled earlier in the pass can serve as
    a neighbor for a cell processed later.

    Args:
        df: DataFrame with 'Latitude' and 'Longitude' columns plus the
            feature columns.
        feature_columns: Names of the columns to fill.

    Returns:
        A tuple ``(df, filled)`` where ``filled`` counts the filled values
        that are still present in the returned frame (fills made in rows
        that end up removed are not counted).
    """
    # A set, because one row can fail for several features.
    rows_to_remove = set()
    # Fills per row, so fills in rows that are later dropped can be excluded.
    fills_per_row = {}

    for feature in feature_columns:
        # Snapshot of the cells missing at the start of this feature's pass.
        missing = df.loc[df[feature].isna(), ['Latitude', 'Longitude']]
        for index, lat, lon in zip(missing.index,
                                   missing['Latitude'],
                                   missing['Longitude']):
            average_value = calculate_average_from_neighbors(df, feature, lat, lon)

            if pd.notna(average_value):  # Replace if an average is calculated
                df.at[index, feature] = average_value
                fills_per_row[index] = fills_per_row.get(index, 0) + 1
            else:
                rows_to_remove.add(index)  # Mark for removal if no neighbors found

    # Remove rows marked for deletion
    df.drop(index=list(rows_to_remove), inplace=True)

    total_missing_values_filled = sum(
        count for index, count in fills_per_row.items()
        if index not in rows_to_remove
    )
    return df, total_missing_values_filled

In [4]:
# Load and process file in chunks
input_path = 'C:/Users/T00701453/Downloads/combined.csv'
output_path = "processed_combined26.csv"
total_missing_values_filled = 0

# Each chunk is processed on its own, so neighbors are only searched among
# the rows of the same chunk.
for chunk_number, chunk in enumerate(pd.read_csv(input_path, chunksize=chunk_size)):
    # Ensure Latitude and Longitude are floats
    chunk['Latitude'] = chunk['Latitude'].astype(float)
    chunk['Longitude'] = chunk['Longitude'].astype(float)

    # List of feature columns to process for missing values
    feature_columns = [col for col in chunk.columns if col not in ['Latitude', 'Longitude']]

    # Replace missing values with averages or delete rows
    processed_chunk, missing_filled = replace_missing_values_with_average_or_delete(chunk, feature_columns)

    # Update overall counts
    total_missing_values_filled += missing_filled

    # Save the processed chunk. The first chunk truncates any file left over
    # from a previous run and writes the header row; later chunks append.
    is_first_chunk = chunk_number == 0
    processed_chunk.to_csv(
        output_path,
        mode='w' if is_first_chunk else 'a',
        index=False,
        header=is_first_chunk,
    )

# Print final summary
print(f"Total missing values filled: {total_missing_values_filled}")
print("Rows with no valid neighbors were removed.")

KeyboardInterrupt: 