In [None]:
import json
import unittest
from typing import Any
from pathlib import Path
import pystac
import glob
import sys
from os import path
import pdal
from datetime import datetime, timezone, timedelta
from pystac.asset import Asset
from pystac.errors import ExtensionTypeError, RequiredPropertyMissing, STACError
from pystac.extensions.pointcloud import Schema, SchemaType, AssetPointcloudExtension, PhenomenologyType, PointcloudExtension, Statistic
from pystac.extensions.classification import ItemClassificationExtension,AssetClassificationExtension, Classification
from pystac.summaries import RangeSummary
import logging
import constants
import boto3
from botocore.exceptions import ClientError
import os
import rasterio
from rasterio.transform import from_bounds
from shapely.geometry import Polygon, mapping, shape
from tempfile import TemporaryDirectory
from pyproj import Transformer
from botocore.config import Config
import pandas as pd 
import re
import json 
import gc

# botocore settings handed to the S3 client created further down:
# requests are signed with Signature Version 4.
config = Config(signature_version='s3v4')

# Only ERROR and above are emitted by the root logger; force=True replaces any
# handlers that were already installed (e.g. by a previous run of this cell).
logging.basicConfig(level=logging.ERROR, force=True)

In [None]:
# Variables
# ---------
# Local folder that receives the saved STAC JSON files.
output_location = '/stac/output/pc/'
json_out = Path(output_location + "jsons")
# thumbnail_out=fr"{output_location}thumbnails"

# Catalog / collection naming and the public object storage location of the point clouds.
catalog_name = 'BC_STAC'
collection_name = 'PC'
public_asset_loc = 'https://nrs.objectstore.gov.bc.ca/gdwuts'

obj_type = 'pointcloud'  # the type of asset: DEM, DSM, CRH, Landcover, etc.
file_extension = '.laz'
object_store = "/mnt/r_drive"
map_area = "092"
subdivision = ['o']
# tiles=[18] #rest of test area with DEMS 19,20,28,29,30

# One search root per subdivision: <object_store>/<map_area>/<map_area><subdivision>
root_directories = [
    Path(object_store, map_area, map_area + subdiv)
    for subdiv in subdivision
]
intesity_img_dir = Path(object_store, "intensity_plots", map_area, "thumbs")
metadata_loc = object_store + "/metadata/laszy_json"
accuracy_loc = object_store + "/metadata/accuracy_rpts"

# Location of the STAC JSONs in S3; it is the base of the permanent links for
# every collection and item produced by this notebook.
collection_loc = "/".join([catalog_name, obj_type])

public_asset_intensity = "/".join([public_asset_loc, "intensity_plots", map_area, "thumbs"])


# ASPRS classification mapping: the position in the list is the class code.
asprs_classes = dict(enumerate([
    "Never Classified",
    "Unclassified",
    "Ground",
    "Low Vegetation",
    "Medium Vegetation",
    "High Vegetation",
    "Building",
    "Low Noise",
    "Model Key-point",
    "Water",
    "Rail",
    "Road Surface",
    "Overlap Points",
    "Wire - Guard (Shield)",
    "Wire - Conductor (Transmission)",
    "Transmission Tower",
    "Wire-structure Connector",
    "Bridge Deck",
    "High Noise",
]))

In [None]:

# Set up the S3 connection.

# Bucket name used when building object URLs and setting object permissions.
bucket = constants.AWS_S3_BUCKET

# S3 client pointed at the third-party object storage endpoint; the endpoint
# and credentials come from the local `constants` module.
s3_client = boto3.client(
    "s3",
    config=config,
    endpoint_url=constants.AWS_S3_ENDPOINT,
    aws_access_key_id=constants.AWS_ACCESS_KEY_ID,
    aws_secret_access_key=constants.AWS_SECRET_ACCESS_KEY,
)

In [None]:

#Functions for STAC PC data

def capture_date(pdalinfo):
    """Return the file creation date as an ISO 8601 string with a ``Z`` suffix.

    Arguments:
        pdalinfo: Mapping with ``creation_year`` and ``creation_doy`` entries;
            both values are converted with ``int``.

    Returns:
        Midnight of the creation day, e.g. ``'2020-02-01T00:00:00Z'``.
        The day of year is 1-based, so day 1 is January 1st. A day of 0 or
        less is clamped to January 1st.

    Raises:
        KeyError: if either entry is missing from ``pdalinfo``.
        ValueError: if a value is not an integer or the date is out of range.
    """
    year = int(pdalinfo['creation_year'])
    day_of_year = int(pdalinfo['creation_doy'])
    # Day N lies N-1 days after January 1st. Clamping at 0 keeps day 1 on
    # January 1st (it used to land on January 2nd) and still tolerates a 0.
    days_after_new_year = max(day_of_year - 1, 0)
    created = datetime(year, 1, 1) + timedelta(days=days_after_new_year)
    return created.isoformat() + 'Z'

def convertGeometry(geom, srs):
    """Reproject a geometry to EPSG:4326 and return it as a parsed JSON object.

    Arguments:
        geom: JSON-serialisable geometry; it is handed to OGR via ``json.dumps``.
        srs: Definition of the geometry's spatial reference, in any form
            accepted by ``SpatialReference.SetFromUserInput``.

    Returns:
        The transformed geometry, parsed from OGR's JSON export.
    """
    from osgeo import ogr, osr

    def _spatial_ref(definition):
        # Build a spatial reference object from a user-style definition string.
        ref = osr.SpatialReference()
        ref.SetFromUserInput(definition)
        return ref

    source_ref = _spatial_ref(srs)
    target_ref = _spatial_ref('EPSG:4326')

    ogr_geometry = ogr.CreateGeometryFromJson(json.dumps(geom))
    ogr_geometry.AssignSpatialReference(source_ref)
    ogr_geometry.TransformTo(target_ref)
    exported = ogr_geometry.ExportToJson()
    return json.loads(exported)


def convertBBox(obj):
    """Return ``[minx, miny, maxx, maxy]`` from ``obj`` as a list of floats.

    Arguments:
        obj: Mapping with ``minx``, ``miny``, ``maxx`` and ``maxy`` entries.

    Returns:
        A four element list; every value is converted with ``float``.
    """
    # The vertical range (minz / maxz) is deliberately left out of the bbox.
    corner_keys = ('minx', 'miny', 'maxx', 'maxy')
    return [float(obj[key]) for key in corner_keys]

#functions for s3
def create_url(bucket_name: str,
            object_name: str):
 
    """
           
    This function takes a bucket name, an object name, and an expiration time (in seconds) and generates a URL download link for the object.

    Arguments:
        bucket_name: String of name of the bucket
        object_name: Name of the object (key) that the URL will be pointed to

    Returns:
        Link of output (object download) URL
        
    Raises: 
        Exceptions raised will display an error message and be logged in the export.log file
    """
    try:
        if r':443' in constants.AWS_S3_ENDPOINT:
            endpoint=constants.AWS_S3_ENDPOINT.split(':')
            endpoint=fr"{endpoint[0]}:{endpoint[1]}"
        else:
            endpoint=constants.AWS_S3_ENDPOINT
        response=os.path.join(endpoint,bucket_name,object_name)
    except ClientError as e:
        logging.info(e)
        return None
    return response

def set_permissions(bucket_name: str,
                    object_name: str,
                    permissions='public-read'):
    """
    Set the ACL of an object through the module level ``s3_client``.

    Arguments:
        bucket_name: String of name of the bucket
        object_name: Name of the object (key) whose permissions are changed
        permissions: If not specified, the permissions will default to 'public-read'. Otherwise, permissions can be found below:
        'private'|'public-read'|'public-read-write'|'authenticated-read'|'aws-exec-read'|'bucket-owner-read'|'bucket-owner-full-control'

    Returns:
        Nothing

    Raises:
        Nothing: this is best effort. A failure is logged at ERROR level and the function returns.
    """

    try:
        s3_client.put_object_acl(ACL=permissions, Bucket=bucket_name, Key=object_name)
    except Exception as e:
        # Logged as errors: this notebook configures logging at ERROR level,
        # so INFO messages about a failed ACL change would never be shown.
        logging.error(f'Error when setting permission: double check permission: {permissions}. Refer to help(set_permissions) for documentation.')
        logging.error(e)
        return

    logging.info(f'Set permissions on {object_name} success, set to {permissions}')
    return


def extract_dates(file_path):
    """Derive a (start, end) datetime pair from the text of a file path.

    Lookup order:
        1. Exactly two ``_YYYYMMDD`` stamps: first is the start, second the end.
        2. Exactly one ``_YYYYMMDD`` stamp: used for both start and end.
        3. Otherwise a ``/YYYY/`` path component: January 1st of that year for both.
        4. Otherwise ``(None, None)``.

    Arguments:
        file_path: Path or string; it is converted with ``str``.

    Returns:
        Tuple ``(start_date, end_date)`` of naive datetimes, or ``(None, None)``.
    """
    text = str(file_path)
    stamps = re.findall(r'_(\d{8})', text)

    if 1 <= len(stamps) <= 2:
        first_day = datetime.strptime(stamps[0], "%Y%m%d")
        last_day = datetime.strptime(stamps[1], "%Y%m%d") if len(stamps) == 2 else first_day
        return first_day, last_day

    # No usable day stamps: fall back to a four digit folder name.
    year_folder = re.search(r'/(\d{4})/', text)
    if year_folder is None:
        return None, None

    new_years_day = datetime.strptime(year_folder.group(1), "%Y")
    return new_years_day, new_years_day

def find_matching_intesity_jpg(directory: str, item_name: str) -> str | None:
    for filename in os.listdir(directory):
        if filename.startswith(f"{item_name}_") and filename.endswith(("_grid.jpg", "_grid.jpeg")):
            return os.path.join(directory, filename)
    return None

In [None]:
# Base URL shared by the hrefs of every JSON written by this notebook.
base_url = create_url(bucket_name=bucket, object_name=collection_loc)
print(base_url)

In [None]:
# Create the root STAC catalog.
# https://pystac.readthedocs.io/en/stable/api/catalog.html

# Object keys always use "/" as separator, so the key is written out directly
# instead of being assembled with the platform dependent os.path.join.
url = create_url(bucket, f"{catalog_name}/{catalog_name}_catalog.json")

catalog = pystac.Catalog(
    id=catalog_name,
    description='This catalog contains Open Data from the Province of BC',
    stac_extensions=None,  # list of extension ids when needed
    extra_fields=None,  # dictionary of additional fields when needed
    href=url,
    # https://pystac.readthedocs.io/en/stable/api/pystac.html#pystac.CatalogType
    catalog_type=pystac.CatalogType.ABSOLUTE_PUBLISHED,
)
# Layout background:
# https://pystac.readthedocs.io/en/stable/api/layout.html#pystac.layout.BestPracticesLayoutStrategy
# https://github.com/radiantearth/stac-spec/blob/v1.1.0/best-practices.md#catalog-layout

print(json.dumps(catalog.to_dict(), indent=4))
print(f"Catalog URL: {url}")

In [None]:
# Collect every file with the configured extension that sits directly inside a
# folder named after the asset type, anywhere below the root directories.
search_pattern = f"{obj_type.lower()}/*{file_extension}"
laz_files = [
    found_path
    for search_root in root_directories
    for found_path in search_root.rglob(search_pattern)
]

print(f"{len(laz_files)} {file_extension} files found")

In [None]:
# Check for existing files in the output folder.
# A file counts as already processed when some directory below json_out is
# named after the file (base name without its extension).
existing_dir_names = {
    dir_name
    for _, dir_names, _ in os.walk(json_out)
    for dir_name in dir_names
}

# Filtering into a new list is safe when a directory name shows up more than
# once in the walk or when several files share a base name; removing entries
# one by one raised ValueError on the second hit and left same-named files behind.
laz_files = [
    laz_path
    for laz_path in laz_files
    if os.path.basename(laz_path).rsplit('.', 1)[0] not in existing_dir_names
]
print(len(laz_files))


In [None]:
# Set up PDAL processing to create the STAC JSON structures.
# Files are handled in batches. After every batch that produced items the
# collection extent is refreshed and the whole catalog is written to json_out,
# so an interrupted run keeps the work finished so far.
batch_size = 25
item_count = 0
total_files = len(laz_files)
size_limit = 230 * 1024 * 1024  # 230 MB in bytes (not used by the loop below)

license_href = 'https://www2.gov.bc.ca/gov/content/data/policy-standards/open-data/open-government-licence-bc'

# GPS time is counted in seconds from this epoch; the leap second offset is
# subtracted to approximate UTC.
gps_epoch = datetime(1980, 1, 6)
leap_seconds = 18

# PDAL dimension type name -> point cloud extension schema type.
# Any other type name is recorded as floating.
schema_types = {
    "floating": SchemaType.FLOATING,
    "unsigned": SchemaType.UNSIGNED,
    "signed": SchemaType.SIGNED,
}


def _as_utc(moment):
    """Return ``moment`` as an aware UTC datetime; naive values are taken to be UTC."""
    if moment.tzinfo is None:
        return moment.replace(tzinfo=timezone.utc)
    return moment.astimezone(timezone.utc)


# State carried across batches. The collection is created and attached to the
# catalog exactly once, so the catalog carries a single child link no matter
# how many batches run.
collection = None
unioned_footprint = None  # running union of all item footprints
extent_start = None       # earliest item start seen so far (UTC)
extent_end = None         # latest item end seen so far (UTC)

while item_count < total_files:
    batch = laz_files[item_count:item_count + batch_size]
    batch_items = []  # (item, start_date, end_date) for every file finished in this batch

    for filename in batch:

        try:
            item_count = item_count + 1

            # Public href of the asset: the part of the local path that follows
            # "drive" is appended to the public object storage location.
            asset_suffix = str(filename).split('drive')[-1]
            asset_suffix = asset_suffix.split(file_extension)[0]
            asset_file = fr"{public_asset_loc}{asset_suffix}"
            asset_href = fr"{asset_file}{file_extension}"

            item_name = str(filename).split('/')[-1]
            item_name = item_name.split(file_extension)[0]
            asset_name = fr"{item_name}_{file_extension.split('.')[1]}"

            item_href = f"{base_url}/{item_name}/{item_name}.json"

            print(item_name)

            # Equivalent of `pdal info --all`: hexbin boundary, stats and info filters.
            # The classification range filter is left out for the time being
            # because it kept crashing on larger files.
            reader = pdal.Reader.las(filename)  # our laz files are not Cloud Optimized Point Clouds (COPC)
            hexbin_filter = pdal.Filter.hexbin()
            stats_filter = pdal.Filter.stats()
            info_filter = pdal.Filter.info()
            gps_filter = pdal.Filter.gpstimeconvert(conversion="gt2gst")

            pipeline: pdal.Pipeline = reader | hexbin_filter | stats_filter | info_filter | gps_filter

            count = pipeline.execute()

            logging.info('PDAL pipline completed')

            stage_metadata = pipeline.metadata['metadata']
            boundary = stage_metadata[hexbin_filter.type]
            stats = stage_metadata[stats_filter.type]
            info = stage_metadata[info_filter.type]
            copc = stage_metadata[reader.type]
            # NOTE(review): this is the metadata entry of the gpstimeconvert
            # stage and is treated below as an iterable of GPS times - confirm
            # that the stage really reports times here rather than in the point data.
            gps_times = stage_metadata[gps_filter.type]

            if not gps_times:
                print("No GPS time data found in file. pulling from metadata json")
                with open(fr"{metadata_loc}/{item_name}.json", "r") as f:
                    metadata_fromfile = json.load(f)
                start_date = metadata_fromfile["point_records"].get("date_start", "N/A")
                start_date = datetime.strptime(start_date, "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
                end_date = metadata_fromfile["point_records"].get("date_end", "N/A")
                end_date = datetime.strptime(end_date, "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
            else:
                # Convert GPS time to UTC (subtract leap seconds)
                utc_times = [gps_epoch + timedelta(seconds=float(t) - leap_seconds) for t in gps_times]
                start_date = min(utc_times)
                end_date = max(utc_times)

            # Footprint: reprojected hexbin boundary when available, otherwise
            # the EPSG:4326 boundary reported by the stats filter.
            try:
                geometry = convertGeometry(
                    boundary['boundary_json'],
                    copc['comp_spatialreference']
                )
            except KeyError:
                geometry = stats['bbox']['EPSG:4326']['boundary']

            bbox = convertBBox(stats['bbox']['EPSG:4326']['bbox'])

            properties = {}
            properties['pc:schemas'] = info['schema']['dimensions']
            properties['pc:statistics'] = stats['statistic']
            properties['title'] = "LiDAR BC"
            properties['providers'] = [
                {
                    "name": "LidarBC",
                    "description": "LidarBC is an initiative to provide open public access to Light Detection and Ranging data (lidar) and associated datasets collected by the Government of British Columbia",
                    "roles": [
                    "producer",
                    ],
                    "url": "https://lidar.gov.bc.ca/"
                }
            ]
            properties['pc:type'] = 'lidar'  # eopc, lidar, radar, sonar
            properties['pc:density'] = boundary.get('avg_pt_per_sq_unit', 0)
            properties['pc:count'] = count
            # NOTE(review): the item below is also given datetime=start_date;
            # confirm which of the two values is meant to be published.
            properties['datetime'] = capture_date(copc)

            logging.info('json dumps PC info to local')
            logging.info('start STAC item creation')

            # Create the STAC item
            item = pystac.Item(
                id=item_name,
                geometry=geometry,
                bbox=bbox,
                datetime=start_date,
                properties=properties,
                start_datetime=start_date,
                end_datetime=end_date,
                href=item_href
            )

            item.common_metadata.created = datetime.now(tz=timezone.utc)
            item.common_metadata.updated = None

            logging.info('add assets')
            # Add the data asset. The files are plain LAZ (not COPC, see the
            # reader above), so the media type must not advertise COPC.
            item.add_asset(
                key=asset_name,
                asset=pystac.Asset(
                    href=str(asset_href),
                    media_type="application/vnd.laszip"
                )
            )

            # Find the intensity jpg and add it as thumbnail when present
            intesity_jpg = find_matching_intesity_jpg(intesity_img_dir, item_name)

            if intesity_jpg is not None:
                intesity_jpg = os.path.basename(intesity_jpg)
                thumbnail_href = rf"{public_asset_loc}/intensity_plots/{map_area}/thumbs/{intesity_jpg}"

                item.add_asset(
                    key=f"{item_name}_intensity_grid",
                    asset=pystac.Asset(
                        href=thumbnail_href,
                        # the matched files end in .jpg / .jpeg
                        media_type=pystac.MediaType.JPEG,
                        roles=['thumbnail', 'overview', 'visual']
                    )
                )

            # Links: public location of the item JSON and the data license
            self_link = pystac.Link(
                rel='alternate',
                target=item_href
            )
            lic_link = pystac.Link(
                rel="license",
                target=license_href
            )
            item.links.extend([self_link, lic_link])

            # Point cloud extension
            pc_ext = PointcloudExtension.ext(item, add_if_missing=True)
            pc_ext.count = count                      # number of points
            pc_ext.type = PhenomenologyType.LIDAR     # phenomenology (LiDAR, radar, etc.)
            pc_ext.encoding = "LAZ"
            pc_ext.density = properties['pc:density']  # points per square unit

            # --------------- SCHEMA ---------------
            pc_ext.schemas = [
                Schema({
                    "name": dimension["name"],
                    "size": dimension["size"],
                    "type": schema_types.get(dimension["type"], SchemaType.FLOATING).value
                })
                for dimension in info['schema']['dimensions']
            ]

            # -------------- STATISTICS --------------
            pc_ext.statistics = [
                Statistic({
                    "name": dimension_stats["name"],
                    "average": dimension_stats.get("average"),
                    "count": dimension_stats.get("count"),
                    "minimum": dimension_stats.get("minimum"),
                    "maximum": dimension_stats.get("maximum"),
                    "stddev": dimension_stats.get("stddev")
                })
                for dimension_stats in stats['statistic']
            ]

            # The classification extension is left out for the time being
            # (it depends on the disabled classification range filter).

            item.validate()

            # NOTE(review): items are linked from the root catalog here and from
            # the collection further down - confirm that both links are wanted.
            catalog.add_item(item)
            batch_items.append((item, start_date, end_date))
            print(f'items processed: {item_count}')
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            # Ends this batch early: what was finished is saved below and the
            # next batch starts with the file after the one that failed.
            break

    if not batch_items:
        # Nothing new to add, so there is no extent to compute and nothing to save.
        print(f"Processed {item_count}/{total_files} files.")
        continue

    # Grow the collection bounds and time range with the items of this batch
    for finished_item, item_start, item_end in batch_items:
        footprint_shape = shape(finished_item.geometry)
        if unioned_footprint is None:
            unioned_footprint = footprint_shape
        else:
            unioned_footprint = unioned_footprint.union(footprint_shape)

        item_start = _as_utc(item_start)
        item_end = _as_utc(item_end)
        extent_start = item_start if extent_start is None else min(extent_start, item_start)
        extent_end = item_end if extent_end is None else max(extent_end, item_end)

    collection_bbox = list(unioned_footprint.bounds)
    spatial_extent = pystac.SpatialExtent(bboxes=[collection_bbox])
    # A temporal interval is a [start, end] pair, not a list of every item datetime.
    temporal_extent = pystac.TemporalExtent(intervals=[[extent_start, extent_end]])
    collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)

    if collection is None:
        collection_href = os.path.join(base_url, f"{collection_name}.json")
        print(collection_href)
        # NOTE(review): the items link to the Open Government Licence - BC while
        # the collection declares Apache-2.0; confirm the intended license.
        collection = pystac.Collection(id=obj_type,
                                    description='LiDAR Point Clouds for British Columbia',
                                    extent=collection_extent,
                                    title=obj_type,
                                    href=collection_href,
                                    license='Apache-2.0')
        catalog.add_child(collection)
    else:
        collection.extent = collection_extent

    new_items = [entry[0] for entry in batch_items]
    collection.add_items(new_items)

    print("Catalog HREF: ", catalog.get_self_href())
    print("Collection HREF:", collection.get_self_href())
    print("Item HREF: ", new_items[-1].get_self_href())

    catalog.save(catalog_type=pystac.CatalogType.ABSOLUTE_PUBLISHED, dest_href=str(json_out))

    print(f"Processed {item_count}/{total_files} files.")

print("All files processed.")
        