Here’s a Python script to check the ground truth coordinates against the combined CSV file.
The code will:
- Look for ground truth coordinates in the combined file.
- Analyze missing features for the found points.
- Count points not found in the combined file.
- Summarize the results in the requested format.

In [1]:
import pandas as pd
import numpy as np

In [3]:
def check_ground_truth(ground_truth_file, combined_file):
    """
    Report how many ground truth points appear in the combined CSV file and
    how many feature values are missing for the ones that do.

    A ground truth point counts as "found" when a row of the combined file has
    exactly the same 'Latitude' and 'Longitude' values (exact equality, no
    tolerance). If several combined rows share those coordinates, only the
    first one is inspected. Every column of the combined file other than
    'Latitude' and 'Longitude' is treated as a feature, wherever it sits in
    the file.

    Args:
    - ground_truth_file: Path to the CSV file containing ground truth coordinates.
    - combined_file: Path to the combined CSV file with features.

    Returns:
    - None. The summary is printed to standard output.
    """
    # Load ground truth and combined data
    ground_truth = pd.read_csv(ground_truth_file)
    combined = pd.read_csv(combined_file)

    # Strip stray whitespace from the headers so that e.g. ' Latitude' still
    # matches the column names used below.
    ground_truth.rename(columns=lambda x: x.strip(), inplace=True)
    combined.rename(columns=lambda x: x.strip(), inplace=True)

    coord_columns = ['Latitude', 'Longitude']
    combined_lat = combined['Latitude']
    combined_lon = combined['Longitude']

    # Count NaNs per combined row once, up front. The feature columns are
    # selected by name rather than by position, so the result is correct even
    # when the coordinate columns are not the first two columns of the file.
    missing_per_row = combined.drop(columns=coord_columns).isna().sum(axis=1)

    # Initialize counters
    found_no_missing = 0
    found_missing = {}  # number of missing features -> number of points
    not_found = 0

    # Iterate over ground truth coordinates
    for lat, lon in ground_truth[coord_columns].itertuples(index=False, name=None):
        hits = missing_per_row[(combined_lat == lat) & (combined_lon == lon)]

        if hits.empty:
            # Point not found
            not_found += 1
            continue

        # Only the first matching row is considered.
        num_missing = int(hits.iloc[0])
        if num_missing == 0:
            found_no_missing += 1
        else:
            found_missing[num_missing] = found_missing.get(num_missing, 0) + 1

    # Output summary
    print(f"{found_no_missing} points were found and had no missing values.")
    for num_missing, count in found_missing.items():
        print(f"{count} points were found in the combined file and had {num_missing} missing features.")
    print(f"{not_found} points were not found in the combined file.")

# Example usage: run the check against the South Thompson ground truth and
# combined feature files. Both paths are hard-coded absolute paths on drive E:;
# edit them to point at your own copies of the data before running.
ground_truth_file = r'E:/South Thompson/SA_ground_labelled.csv'
combined_file = r'E:/South Thompson/NEW/combined26.csv'
# Prints the found / missing-feature / not-found counts to standard output.
check_ground_truth(ground_truth_file, combined_file)

238 points were found and had no missing values.
6 points were found in the combined file and had 19 missing features.
4 points were not found in the combined file.


In [4]:
import pandas as pd
import numpy as np

def analyze_missing_features(ground_truth_file, combined_file):
    """
    List which features are missing for each ground truth point that is
    present in the combined CSV file, then summarize how often each feature
    is missing.

    A ground truth point is matched to combined rows by exact equality on
    'Latitude' and 'Longitude'. A feature is reported as missing for a point
    when it is NaN in any of the matching rows. Every column of the combined
    file other than 'Latitude' and 'Longitude' is treated as a feature,
    wherever it sits in the file. Points that are not found in the combined
    file are skipped silently.

    Args:
    - ground_truth_file: Path to the CSV file containing ground truth coordinates.
    - combined_file: Path to the combined CSV file with features.

    Returns:
    - None. The per-point listing and the summary are printed to standard output.
    """
    # Load ground truth and combined data
    ground_truth = pd.read_csv(ground_truth_file)
    combined = pd.read_csv(combined_file)

    # Strip stray whitespace from the headers so that e.g. ' Latitude' still
    # matches the column names used below.
    ground_truth.rename(columns=lambda x: x.strip(), inplace=True)
    combined.rename(columns=lambda x: x.strip(), inplace=True)

    coord_columns = ['Latitude', 'Longitude']
    combined_lat = combined['Latitude']
    combined_lon = combined['Longitude']

    # Build the NaN mask once, up front. The feature columns are selected by
    # name rather than by position, so a coordinate column is never mistaken
    # for a feature when it is not one of the first two columns of the file.
    is_missing = combined.drop(columns=coord_columns).isna()

    # Maps (lat, lon) -> list of feature names with at least one NaN.
    # Repeated ground truth coordinates collapse into a single entry.
    missing_features_info = {}

    # itertuples yields native Python scalars, so the coordinate tuples print
    # as plain numbers regardless of the installed NumPy version.
    for lat, lon in ground_truth[coord_columns].itertuples(index=False, name=None):
        mask = (combined_lat == lat) & (combined_lon == lon)
        if not mask.any():
            continue

        # Count NaNs per feature across all matching rows.
        missing_counts = is_missing[mask].sum(axis=0)
        missing_columns = missing_counts[missing_counts > 0].index.tolist()
        if missing_columns:
            missing_features_info[(lat, lon)] = missing_columns

    # Output results
    print("Missing Features Information:")
    for coord, features in missing_features_info.items():
        print(f"Coordinates: {coord} - Missing Features: {features}")

    if not missing_features_info:
        print("No missing features found for the points with missing data.")
    else:
        print("\nSummary:")
        all_missing_features = pd.Series(
            [feature for features in missing_features_info.values() for feature in features]
        )
        print(all_missing_features.value_counts())

# Example usage: list the missing features for the South Thompson ground truth
# points. Both paths are hard-coded absolute paths on drive E:; edit them to
# point at your own copies of the data before running.
ground_truth_file = r'E:/South Thompson/SA_ground_labelled.csv'
combined_file = r'E:/South Thompson/NEW/combined26.csv'
# Prints one line per affected point, then a per-feature count of how often
# each feature is missing.
analyze_missing_features(ground_truth_file, combined_file)

Missing Features Information:
Coordinates: (1410433.75, 645871.25) - Missing Features: ['Terrain_aspect', 'Terrain_chnl_base', 'Terrain_chnl_dist', 'Terrain_convergence', 'Terrain_dah', 'Terrain_hcurv', 'Terrain_ls_factor', 'Terrain_mrrtf', 'Terrain_mrvbf', 'Terrain_openness_neg', 'Terrain_openness_pos', 'Terrain_relative_slope_position', 'Terrain_slope', 'Terrain_tca', 'Terrain_tpi', 'Terrain_tri', 'Terrain_twi', 'Terrain_valley_depth', 'Terrain_vcurv']
Coordinates: (1410583.25, 645871.25) - Missing Features: ['Terrain_aspect', 'Terrain_chnl_base', 'Terrain_chnl_dist', 'Terrain_convergence', 'Terrain_dah', 'Terrain_hcurv', 'Terrain_ls_factor', 'Terrain_mrrtf', 'Terrain_mrvbf', 'Terrain_openness_neg', 'Terrain_openness_pos', 'Terrain_relative_slope_position', 'Terrain_slope', 'Terrain_tca', 'Terrain_tpi', 'Terrain_tri', 'Terrain_twi', 'Terrain_valley_depth', 'Terrain_vcurv']
Coordinates: (1410733.25, 645841.25) - Missing Features: ['Terrain_aspect', 'Terrain_chnl_base', 'Terrain_chnl_d