<a href="https://colab.research.google.com/github/aborbala/tree-canopy/blob/main/01_04_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install geopandas shapely scikit-learn rasterio opencv-python-headless



In [None]:
import os
import numpy as np
import geopandas as gpd
from shapely.geometry import Polygon, MultiPolygon
from sklearn.decomposition import PCA
import cv2
import rasterio
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the input and output directories used
# later in this notebook live under '/content/drive/My Drive'.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def get_length_width(polygon):
    """Measure a polygon's extent along the principal axes of its outline.

    The vertices of the exterior ring are centred and projected onto the two
    dominant principal axes of their scatter matrix (i.e. a PCA of the vertex
    cloud). The spans of the projected vertices along those axes are the two
    dimensions that are returned.

    Parameters
    ----------
    polygon : shapely.geometry.Polygon
        Geometry to measure. Only the exterior ring is used; interior rings
        (holes) do not influence the result.

    Returns
    -------
    tuple
        ``(length, width)`` as floats with ``length >= width``, or
        ``(None, None)`` when ``polygon`` is empty or is not a ``Polygon``.
    """
    if polygon.is_empty or polygon.geom_type != 'Polygon':
        return None, None

    coords = np.array(polygon.exterior.coords, dtype=float)

    # A closed ring lists its first vertex again at the end. Keeping that
    # duplicate would count one vertex twice, which shifts the centroid and
    # tilts the principal axes (an axis-aligned 4x2 rectangle would no longer
    # measure 4 by 2) and makes the result depend on the ring's start vertex.
    if len(coords) > 1 and np.array_equal(coords[0], coords[-1]):
        coords = coords[:-1]

    centered = coords - coords.mean(axis=0)

    # Principal axes are the eigenvectors of the scatter matrix. `eigh` returns
    # eigenvalues in ascending order, so reverse the columns to put the major
    # axis first and keep the two dominant directions.
    _, eigenvectors = np.linalg.eigh(centered.T @ centered)
    principal_axes = eigenvectors[:, ::-1][:, :2]
    projected = centered @ principal_axes

    # Span of the projected vertices along each principal axis.
    extents = projected.max(axis=0) - projected.min(axis=0)

    return float(extents.max()), float(extents.min())

In [None]:
def satisfies_ratio(polygon, max_ratio=2.4):
    """Tell whether a polygon is compact enough to keep.

    Parameters
    ----------
    polygon : shapely geometry
        Geometry to test; it is measured with ``get_length_width``.
    max_ratio : float, optional
        Largest accepted length-to-width ratio (inclusive). Defaults to 2.4.

    Returns
    -------
    bool
        ``True`` when ``length / width <= max_ratio``. ``False`` when the ratio
        is larger, when the width is zero, or when the geometry cannot be
        measured (``get_length_width`` returned ``None`` values).
    """
    length, width = get_length_width(polygon)

    # Empty or non-Polygon geometries come back as (None, None); dividing those
    # would raise a TypeError, so treat them as failing the test.
    if length is None or width is None:
        return False

    # A zero-width outline has no defined ratio.
    if width == 0:
        return False

    return length / width <= max_ratio


In [None]:
# Folder on the mounted Drive that is scanned for input .geojson files.
input_dir = '/content/drive/My Drive/data/400_5816/crowns'
# Folder that receives the filtered files, written under the same file names.
output_dir = '/content/drive/My Drive/data/400_5816/crowns_clean'

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

In [None]:
def _iter_polygons(geom):
    """Yield the non-empty Polygon parts of ``geom``.

    A ``Polygon`` yields itself and a ``MultiPolygon`` yields each of its
    member polygons. ``None``, empty geometries and every other geometry type
    yield nothing.
    """
    if geom is None or geom.is_empty:
        return
    if geom.geom_type == 'Polygon':
        yield geom
    elif geom.geom_type == 'MultiPolygon':
        for part in geom.geoms:
            # Empty parts cannot be measured and would break the ratio test.
            if not part.is_empty:
                yield part


# Filter every GeoJSON file in input_dir and write the result, under the same
# file name, to output_dir.
for filename in os.listdir(input_dir):
    if not filename.endswith('.geojson'):
        continue

    filepath = os.path.join(input_dir, filename)
    gdf = gpd.read_file(filepath)

    # Keep only polygons that pass the length/width ratio test. MultiPolygons
    # are split so that each member polygon is tested and stored on its own.
    cleaned_geometries = [
        poly
        for geom in gdf.geometry
        for poly in _iter_polygons(geom)
        if satisfies_ratio(poly)
    ]

    # Only the geometries and the CRS are carried over; attribute columns of
    # the input file are not copied to the output.
    cleaned_gdf = gpd.GeoDataFrame(geometry=cleaned_geometries, crs=gdf.crs)

    # Save the cleaned geometries to a new GeoJSON file in the output directory
    cleaned_filepath = os.path.join(output_dir, filename)
    cleaned_gdf.to_file(cleaned_filepath, driver='GeoJSON')

print("Cleaning and saving completed.")

Cleaning and saving completed.
