In [66]:
import os
import geopandas as gpd
import rasterio
from rasterio.mask import mask
import pandas as pd
from shapely.geometry import box
import numpy as np

import json
from shapely.geometry import shape

In [67]:
# Root of the downloaded DiCRA data. Written as a raw string so the Windows
# backslashes are taken literally instead of being parsed as escape sequences
# (the non-raw form only worked because none of them happened to be valid
# escapes, and newer Python versions warn about that).
base_folder = r"C:\Anna_Data_D_files\sem6\z_capstone\Agriculture\A_Phase_1\datasets\DiCRA\Telengana"

# Factor categories to process; each is looked up as a key of `factors`.
types_of_factors = ['Environmental']  # Add more as needed
# Years to process; used both as a folder name and as the factor-name suffix.
years = ['2023', '2022']  # Add more years as needed
# Factor names available for each category.
factors = {
    'Environmental': ['SSM', 'LST', 'TEMPERATURE', 'PRECIPITATION', 'NO2', 'PM25']  # Add more as needed
}

In [68]:

def extract_centroids_from_geojson(filepath):
    """Read a GeoJSON file and return each feature's district name and centroid.

    The values are taken from the ``district_name`` and ``centroid`` entries
    of every feature's ``properties`` object; the feature geometry is not
    inspected.

    Args:
        filepath: Path to a GeoJSON file.

    Returns:
        A DataFrame with the columns ``district_name`` and ``centroid``, one
        row per feature in file order. A missing name becomes ``None`` and a
        missing centroid becomes ``[None, None]``. The two columns are present
        even when the file has no features, so callers can always merge on
        ``district_name``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # GeoJSON is defined as UTF-8; do not depend on the platform default
    # encoding (cp1252 on many Windows setups), which breaks non-ASCII names.
    with open(filepath, 'r', encoding='utf-8') as file:
        geojson = json.load(file)

    records = []
    for feature in geojson.get('features') or []:
        # A feature may carry "properties": null, so fall back to an empty dict.
        properties = feature.get('properties') or {}
        records.append({
            'district_name': properties.get('district_name', None),
            'centroid': properties.get('centroid', [None, None]),
        })

    # Naming the columns explicitly keeps the schema stable for an empty file.
    return pd.DataFrame(records, columns=['district_name', 'centroid'])



In [69]:
# def process_vector(vector_files, comprehensive_df):
#     # Process vector files
#     for vector_file in vector_files:
#         # Only process files in the 'DISTRICT' folder
#         if 'DISTRICT' in vector_file:
#             gdf = gpd.read_file(vector_file)
#             gdf = gdf.to_crs(epsg=4326)  # Convert to WGS 84 if necessary

#             # Extract the zonal statistics from the 'zonalstat' column (which is in dictionary format)
#             zonalstats = gdf['zonalstat'].apply(pd.Series)  # Convert dictionary to separate columns

#             # Combine zonal statistics with the rest of the DataFrame
#             vector_data = gdf.drop(columns=['geometry', 'zonalstat'])  # Drop 'geometry' and 'zonalstat' for now
#             vector_data = pd.concat([vector_data, zonalstats], axis=1)  # Add extracted zonal statistics

#             vector_data['geometry'] = gdf['geometry'].apply(lambda geom: geom.bounds)  # Add geometry bounds as a feature
#             vector_data['source_file'] = os.path.basename(vector_file)  # Track source file

#             # Append to comprehensive DataFrame (update in-place)
#             comprehensive_df = pd.concat([comprehensive_df, vector_data], ignore_index=True)
    
#     return comprehensive_df  # Return updated DataFrame

def process_vector(vector_files, comprehensive_df):
    """Append district-level vector data, with centroids, to ``comprehensive_df``.

    Only paths containing ``'DISTRICT'`` are read. For each such file:

    * the layer is read with geopandas and reprojected to EPSG:4326;
    * the ``zonalstat`` column is expanded with ``pd.Series`` into separate
      columns (assumes each value is a dict -- TODO confirm against the data);
    * the remaining columns, including ``geometry``, are kept;
    * a ``source_file`` column records the file's base name;
    * for ``.geojson`` files, the ``centroid`` found in the feature
      properties is left-joined on ``district_name``.

    Centroids are joined per file, so rows from one file keep their centroid
    even if a later file does not contain that district. Other vector formats
    are not JSON and are therefore appended without a centroid.

    Args:
        vector_files: Iterable of vector file paths.
        comprehensive_df: DataFrame to extend. It is not modified in place.

    Returns:
        A DataFrame holding the rows of ``comprehensive_df`` followed by the
        rows of every processed file, with ``centroid`` (when present) as the
        last column. ``comprehensive_df`` itself is returned when no file
        qualified.
    """
    frames = []
    for vector_file in vector_files:
        # Only process files in the 'DISTRICT' folder
        if 'DISTRICT' not in vector_file:
            continue

        gdf = gpd.read_file(vector_file)
        gdf = gdf.to_crs(epsg=4326)  # Convert to WGS 84 if necessary

        # Turn the per-row zonal statistics into one column per statistic.
        zonalstats = gdf['zonalstat'].apply(pd.Series)

        # Keep every other column (geometry included) next to the statistics.
        vector_data = pd.concat([gdf.drop(columns=['zonalstat']), zonalstats], axis=1)
        vector_data['source_file'] = os.path.basename(vector_file)  # Track source file

        # The centroid reader parses the file as JSON, which only makes sense
        # for GeoJSON; shapefiles and KML would make json.load fail.
        if vector_file.lower().endswith('.geojson'):
            centroid_df = extract_centroids_from_geojson(vector_file)
            # One centroid per district; repeated keys would multiply rows in the join.
            centroid_df = centroid_df.drop_duplicates(subset='district_name')
            vector_data = vector_data.merge(centroid_df, how='left', on='district_name')

        frames.append(vector_data)

    if not frames:
        return comprehensive_df

    # Concatenate once instead of growing the frame on every iteration.
    combined = pd.concat([comprehensive_df, *frames], ignore_index=True)

    # Keep 'centroid' as the trailing column, as the previous implementation did.
    if 'centroid' in combined.columns:
        ordered = [col for col in combined.columns if col != 'centroid'] + ['centroid']
        combined = combined[ordered]
    return combined



In [70]:
def process_raster(raster_files, comprehensive_df):
    """Append one metadata row per raster file to ``comprehensive_df``.

    Each raster is opened only to read its bounds; no pixel data is read.
    The row holds ``bounds`` (the ``.bounds`` tuple of a shapely box built
    from the raster bounds) and ``source_file`` (the file's base name).

    Args:
        raster_files: Iterable of raster file paths.
        comprehensive_df: DataFrame to extend. It is not modified in place.

    Returns:
        A DataFrame with the rows of ``comprehensive_df`` followed by one row
        per raster, or ``comprehensive_df`` itself when ``raster_files`` is
        empty.
    """
    records = []
    for raster_file in raster_files:
        with rasterio.open(raster_file) as src:
            # Go through a shapely box, as before, and store its bounds tuple.
            bounds = box(*src.bounds).bounds
        records.append({
            'bounds': bounds,  # Bounding box of the raster
            'source_file': os.path.basename(raster_file),  # Track the source file
        })

    if not records:
        return comprehensive_df

    # Build the raster rows in one go and concatenate once, rather than
    # re-copying the growing frame for every file.
    raster_df = pd.DataFrame(records, columns=['bounds', 'source_file'])
    return pd.concat([comprehensive_df, raster_df], ignore_index=True)


In [71]:
def preprocess(folder_path, output_dir=None):
    """Collect the vector and raster data under ``folder_path`` into one CSV.

    The folder is walked recursively. Files ending in ``.shp``, ``.geojson``
    or ``.kml`` are handed to ``process_vector`` and files ending in ``.tif``
    or ``.tiff`` to ``process_raster`` (extensions are matched without regard
    to case). The combined frame is written, without the index, to
    ``<output_dir>/<folder name>.csv``; the CSV name and a separator line are
    printed as progress output.

    Args:
        folder_path: Folder holding the files of one factor/year. The CSV is
            named after the last component of this path.
        output_dir: Directory that receives the CSV. It is created when
            missing. Defaults to the project's preprocessed DICRA folder.

    Returns:
        None.
    """
    if output_dir is None:
        # Raw string: the backslashes are path separators, not escapes.
        output_dir = r'C:\Anna_Data_D_files\sem6\z_capstone\Agriculture\A_Phase_2\code- review2\preprocessed_files\DICRA'

    vector_extensions = ('.shp', '.geojson', '.kml')
    raster_extensions = ('.tif', '.tiff')

    # Traverse the directory to segregate vector and raster files
    vector_files = []
    raster_files = []
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            lowered = file.lower()  # so that e.g. ".TIF" is recognised too
            if lowered.endswith(vector_extensions):
                vector_files.append(os.path.join(root, file))
            elif lowered.endswith(raster_extensions):
                raster_files.append(os.path.join(root, file))

    comprehensive_df = pd.DataFrame()
    if vector_files:
        comprehensive_df = process_vector(vector_files, comprehensive_df)
    if raster_files:
        comprehensive_df = process_raster(raster_files, comprehensive_df)

    # normpath drops a trailing separator, which would otherwise leave an
    # empty last component and produce a file called just ".csv".
    factor_name = os.path.basename(os.path.normpath(folder_path)) + '.csv'
    print(factor_name)

    os.makedirs(output_dir, exist_ok=True)
    comprehensive_df.to_csv(os.path.join(output_dir, factor_name), index=False)
    print("-----------------------------------------------------")

In [72]:
def process_files_for_factors(base_folder, types_of_factors, years, factors):
    """Run ``preprocess`` on every existing factor folder below ``base_folder``.

    Folders are expected at ``<base_folder>/<type>/<year>/<factor>_<year>``.
    They are visited type by type, then year by year, then in the order the
    factors are listed for that type. A folder that does not exist is reported
    on stdout and skipped.

    Args:
        base_folder: Root directory of the dataset.
        types_of_factors: Factor categories to visit.
        years: Years to visit, as strings.
        factors: Mapping from factor category to its list of factor names;
            a category without an entry contributes no folders.
    """
    # Lazily enumerate the candidate folders in the same nesting order as the
    # type -> year -> factor traversal.
    candidate_folders = (
        os.path.join(base_folder, category, season, f"{name}_{season}")
        for category in types_of_factors
        for season in years
        for name in factors.get(category, [])
    )

    for target in candidate_folders:
        if not os.path.exists(target):
            print(f"Folder not found: {target}")
            continue
        preprocess(target)

# Kick off preprocessing for every configured factor type, year and factor.
process_files_for_factors(
    base_folder=base_folder,
    types_of_factors=types_of_factors,
    years=years,
    factors=factors,
)

SSM_2023.csv
-----------------------------------------------------
LST_2023.csv
-----------------------------------------------------
TEMPERATURE_2023.csv
-----------------------------------------------------
PRECIPITATION_2023.csv
-----------------------------------------------------
NO2_2023.csv
-----------------------------------------------------
PM25_2023.csv
-----------------------------------------------------
SSM_2022.csv
-----------------------------------------------------
LST_2022.csv
-----------------------------------------------------
TEMPERATURE_2022.csv
-----------------------------------------------------
PRECIPITATION_2022.csv
-----------------------------------------------------
NO2_2022.csv
-----------------------------------------------------
PM25_2022.csv
-----------------------------------------------------
