This script processes a large CSV file in chunks to handle missing values for specific features. For rows with missing data, it calculates the average of the surrounding 8 coordinates based on Latitude and Longitude. The processed data is written to a new file incrementally, ensuring efficient memory usage. The script also tracks the total number of missing values filled and logs details about rows with unavailable surrounding data.

In [None]:
import pandas as pd

# Input CSV to process. This is an absolute, machine-specific path; the file
# is read in chunks by the pd.read_csv(...) loop further down.
file_path = r'C:/Users/T00701453/Downloads/combined.csv'

# Number of rows per chunk, passed as `chunksize` to pd.read_csv.
chunk_size = 1000  # Adjust based on memory

# Define function to get surrounding coordinates
def get_surrounding_coordinates(lat, lon, step=0.5):
    """Return the coordinates of the 8 grid cells surrounding (lat, lon).

    The neighbourhood is the 3x3 block centred on the point, one ``step``
    away along each axis (the four edge neighbours and the four diagonals).
    The centre point itself is not included.

    Args:
        lat: Latitude of the centre point.
        lon: Longitude of the centre point.
        step: Spacing between adjacent grid cells, in the same units as
            ``lat``/``lon``. Defaults to 0.5, the value that was previously
            hard-coded.

    Returns:
        A list of ``(latitude, longitude)`` tuples, one per neighbour.
    """
    # Including the zero offset is what yields the edge neighbours; with only
    # (-step, +step) the product would contain just the four diagonals.
    offsets = (-step, 0, step)
    return [
        (lat + d_lat, lon + d_lon)
        for d_lat in offsets
        for d_lon in offsets
        if not (d_lat == 0 and d_lon == 0)  # skip the centre cell itself
    ]

# Function to handle missing values based on closest 8 pixels for each feature
def process_missing_values(df, feature_columns):
    """Fill missing feature values with the mean of neighbouring pixels.

    For every row whose value in a feature column is NaN, the rows of ``df``
    located at the coordinates returned by ``get_surrounding_coordinates``
    are looked up by exact ``Latitude``/``Longitude`` match, and the mean of
    their non-missing values for that feature is written into the row.
    Rows for which no neighbour has a usable value are left as NaN and are
    not counted.

    ``df`` is modified in place, so a value filled earlier can act as a
    neighbour for rows processed later.

    Args:
        df: DataFrame containing ``Latitude`` and ``Longitude`` columns plus
            the feature columns.
        feature_columns: Iterable of column names to process.

    Returns:
        A tuple ``(df, total_missing_values_filled, missing_summary)`` where
        ``df`` is the same (mutated) DataFrame, the second item is the number
        of cells that were filled, and ``missing_summary`` is a list of
        messages describing filled cells for which some neighbours were
        unavailable.
    """
    total_missing_values_filled = 0
    missing_summary = []

    # Iterate through each feature column
    for feature in feature_columns:
        # Snapshot of the rows missing this feature, taken before any fill.
        rows_to_fill = df[df[feature].isna()]
        for index, row in rows_to_fill.iterrows():
            lat, lon = row['Latitude'], row['Longitude']

            surrounding_values = []
            missing_coords = []

            for s_lat, s_lon in get_surrounding_coordinates(lat, lon):
                at_neighbour = (df['Latitude'] == s_lat) & (df['Longitude'] == s_lon)
                # A neighbour row whose own value is NaN carries no
                # information: averaging it in would write NaN back into the
                # cell while still counting it as "filled". Treat it the same
                # as an absent neighbour.
                candidates = df.loc[at_neighbour, feature].dropna()
                if candidates.empty:
                    missing_coords.append((s_lat, s_lon))
                else:
                    surrounding_values.append(candidates.iloc[0])

            if not surrounding_values:
                # Nothing to average from; leave the cell missing.
                continue

            # Calculate the average from the available surrounding values
            average_value = sum(surrounding_values) / len(surrounding_values)
            df.at[index, feature] = average_value
            total_missing_values_filled += 1

            # Record which neighbours could not contribute, if any
            if missing_coords:
                missing_summary.append(f"For coordinates ({lat}, {lon}) in feature '{feature}', {len(missing_coords)} closest pixels were missing: {missing_coords}")

    return df, total_missing_values_filled, missing_summary

# Load and process file in chunks
# Running totals across all chunks.
total_missing_values_filled = 0
missing_summaries = []

output_path = "processed_largefile.csv"

# NOTE: each chunk is processed independently, so a neighbouring pixel that
# falls in a different chunk is not visible when filling a value.
for chunk_number, chunk in enumerate(pd.read_csv(file_path, chunksize=chunk_size)):
    is_first_chunk = chunk_number == 0

    # Ensure latitude and longitude are floats
    chunk['Latitude'] = chunk['Latitude'].astype(float)
    chunk['Longitude'] = chunk['Longitude'].astype(float)

    # Every column other than the coordinates is treated as a feature
    feature_columns = [col for col in chunk.columns if col not in ['Latitude', 'Longitude']]

    print("Processing a new chunk...")

    # Process missing values within this chunk for each feature column
    processed_chunk, missing_filled, summary = process_missing_values(chunk, feature_columns)

    # Update overall counts
    total_missing_values_filled += missing_filled
    missing_summaries.extend(summary)

    # Save the processed chunk. The first chunk truncates any output left over
    # from a previous run and writes the header row; later chunks append data
    # only. Appending unconditionally without a header would produce a
    # headerless file and duplicate rows when the script is re-run.
    processed_chunk.to_csv(
        output_path,
        mode='w' if is_first_chunk else 'a',
        index=False,
        header=is_first_chunk,
    )

# Print final summary
print(f"Total missing values filled: {total_missing_values_filled}")
for summary in missing_summaries:
    print(summary)