In [3]:
import os
import geopandas as gpd
import rasterio
from rasterio.mask import mask
import numpy as np
import pandas as pd
from tqdm import tqdm
import time

In [4]:
# Zonal statistics: for every GeoTIFF in `raster_folder`, compute the mean pixel
# value inside each district polygon and write one CSV with a row per district
# and a column per raster file.

# Load the district boundaries
shapefile_path = "E:/Akarsh/District boundary/old by esri/India_district_edited.shp"
gdf = gpd.read_file(shapefile_path)

# Collect the raster files; sorted so the CSV column order is reproducible
# (os.listdir returns entries in arbitrary order)
raster_folder = "E:/Akarsh/DATA/test/"
raster_files = sorted(file for file in os.listdir(raster_folder) if file.endswith('.tif'))

# Folder that receives the CSV file (created if missing)
output_folder = "E:/Akarsh/ML data/output_test"
os.makedirs(output_folder, exist_ok=True)

# 'District' is inserted first so it stays the first CSV column
results_dict = {'District': gdf['NAME_2'].tolist()}

# One progress-bar tick per (raster, district) pair
total_iterations = len(raster_files) * len(gdf)

# Measure the start time
start_time = time.time()

# `with` guarantees the progress bar and every raster handle are released even
# if an exception interrupts the run
with tqdm(total=total_iterations, desc='Processing') as pbar:
    for raster_file in raster_files:
        raster_path = os.path.join(raster_folder, raster_file)

        with rasterio.open(raster_path) as src:
            # mask() interprets geometries in the raster's CRS, so reproject the
            # districts when both CRSs are known; otherwise use them as-is
            if gdf.crs is not None and src.crs:
                districts = gdf.to_crs(src.crs)
            else:
                districts = gdf

            # Mean value of each district for the current raster
            mean_values = []

            for geom in districts.geometry:
                # NaN unless at least one valid pixel is found
                mean_value = np.nan

                if geom is not None and not geom.is_empty:
                    try:
                        # filled=False returns a masked array in which both the
                        # pixels outside the polygon and the raster's own nodata
                        # pixels are masked. This avoids comparing against
                        # src.nodata (which may be None) and avoids averaging the
                        # fill value written outside the polygon.
                        out_image, _ = mask(src, [geom], crop=True, filled=False)
                    except ValueError:
                        # Raised by rasterio when the geometry does not overlap
                        # the raster; the district simply has no data here
                        out_image = None

                    if out_image is not None:
                        # Keep only unmasked pixels (all bands pooled together)
                        valid = out_image.compressed().astype(float)
                        # Drop NaN pixels that are not flagged as nodata
                        valid = valid[~np.isnan(valid)]
                        if valid.size:
                            mean_value = valid.mean()

                mean_values.append(mean_value)

                # Update the progress bar
                pbar.update(1)

        # Exactly one value per district was appended, so every column already
        # has the same length as the 'District' column
        results_dict[raster_file] = mean_values

# Calculate and print the runtime
end_time = time.time()
runtime = end_time - start_time
print(f"Total runtime: {runtime} seconds")

# Create a DataFrame from the results dictionary
result_df = pd.DataFrame(results_dict)

# Save the DataFrame to a CSV file in the specified folder
csv_file_path = os.path.join(output_folder, 'district_mean_values.csv')
result_df.to_csv(csv_file_path, index=False)

Processing: 100%|█████████████████████████████████████████████████████████████████| 5336/5336 [00:42<00:00, 124.70it/s]

Total runtime: 42.791369915008545 seconds



