In [51]:
import geopandas as gpd
!pip install fiona 


Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels


In [63]:
import os
import zipfile

# Archive to unpack, and the folder that receives its contents
zip_file_path = 'spatial-vector-lidar.zip'
extracted_dir = 'Decision Analytics'

# Make sure the destination folder is present (no error if it already is)
os.makedirs(extracted_dir, exist_ok=True)

# Unpack every member of the archive into the destination folder
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extracted_dir)

# Report where the contents ended up
print(f"Successfully extracted contents to {extracted_dir}")

Successfully extracted contents to Decision Analytics


In [96]:
# Smoke check: load the SJER plot centroids from the extracted data and
# print a summary of the resulting GeoDataFrame.
sjer_plot_locations = gpd.read_file(
    'Decision Analytics/california/neon-sjer-site/vector_data/SJER_plot_centroids.shp'
)
sjer_plot_locations.info()



<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Plot_ID    18 non-null     object  
 1   Point      18 non-null     object  
 2   northing   18 non-null     float64 
 3   easting    18 non-null     float64 
 4   plot_type  18 non-null     object  
 5   geometry   18 non-null     geometry
dtypes: float64(2), geometry(1), object(3)
memory usage: 996.0+ bytes


**Random Attribute Changes using Shapefile**

In [135]:
import warnings

# Suppress the specific warning
warnings.filterwarnings("ignore", category=UserWarning, message="Column names longer than 10 characters will be truncated when saved to ESRI Shapefile.")

In [166]:
import os
import random
import time

# Benchmark: average time to write the plot centroids to an ESRI Shapefile
# after changing one randomly chosen attribute value per iteration.

shapefile_path = 'Decision Analytics/california/neon-sjer-site/vector_data/SJER_plot_centroids.shp'
output_path = 'Simulation 1/new.shp'

gdf = gpd.read_file(shapefile_path)

# Define the number of Monte Carlo simulation iterations
simulation_iterations = 1000

# NOTE(review): this name is not one of the columns reported by `.info()` for
# this file, so the assignment in the loop adds a new column (empty for every
# other row) instead of editing an existing attribute -- confirm this is the
# intended "attribute change".
random_attribute = 'SJER_plot_centroids'

# No other cell creates the output folder; without it the write fails on a
# fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Perform Monte Carlo simulation for writing with random attribute changes
total_elapsed_time = 0.0

for _ in range(simulation_iterations):
    # Work on a copy so every iteration starts from the same source data
    simulated_gdf = gdf.copy()

    # Simulate a random attribute change on one randomly selected row
    random_row_index = random.choice(simulated_gdf.index)
    simulated_gdf.at[random_row_index, random_attribute] = random.uniform(0, 100)

    # Time only the write; perf_counter is the monotonic, high-resolution
    # clock intended for measuring short durations.
    start_time = time.perf_counter()
    simulated_gdf.to_file(output_path, driver='ESRI Shapefile')
    elapsed_time = time.perf_counter() - start_time

    # Accumulate the elapsed time for each iteration
    total_elapsed_time += elapsed_time

# Calculate the average time taken over all iterations
average_elapsed_time = total_elapsed_time / simulation_iterations

# Display the average writing time
print(f"Average Writing Time with Random Attribute Changes: {average_elapsed_time} seconds")



Average Writing Time with Random Attribute Changes: 0.047073736429214474 seconds


**Random Attribute Changes using GeoJSON**

In [164]:
# Re-export the SJER plot centroids shapefile as a GeoJSON file

# Source shapefile and destination GeoJSON
shapefile_path = 'Decision Analytics/california/neon-sjer-site/vector_data/SJER_plot_centroids.shp'
geojson_output_path = 'Simulation1geoj.geojson'

# Load the shapefile, then write the same features back out as GeoJSON
gdf = gpd.read_file(shapefile_path)
gdf.to_file(geojson_output_path, driver='GeoJSON')

In [177]:
import os
import random
import time

import fiona
import geopandas as gpd
# `mapping` is required below to turn a shapely geometry into a GeoJSON-like
# dict; it was previously used without being imported.
from shapely.geometry import Point, mapping

# Benchmark: average time to write the GeoJSON-sourced plot centroids with
# Fiona after changing one randomly chosen attribute value per iteration.
# NOTE(review): the data is read from GeoJSON, but the timed write uses the
# 'ESRI Shapefile' driver -- confirm which output format is meant to be measured.

# Provide the path to your GeoJSON file
geojson_path = 'Simulation1geoj.geojson'
output_path = 'Simulation 1/new_fileF.shp'

# Read the GeoJSON file into a GeoDataFrame
gdf = gpd.read_file(geojson_path)

# Define the number of Monte Carlo simulation iterations
simulation_iterations = 1000

# Attribute that receives the random value; only this attribute is written
random_attribute = 'SJER_plot_centroids'

# The output schema is identical for every iteration, so build it once
schema = {
    'geometry': 'Point',
    'properties': {random_attribute: 'float'}
}

# No other cell creates the output folder; without it the write fails on a
# fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Perform Monte Carlo simulation for writing with random attribute changes
total_elapsed_time = 0.0

for _ in range(simulation_iterations):
    # Create a copy of the original GeoDataFrame for simulation
    simulated_gdf = gdf.copy()

    # Simulate a random attribute change
    random_row_index = random.randint(0, len(simulated_gdf) - 1)
    simulated_gdf.at[random_row_index, random_attribute] = random.uniform(0, 100)

    # Time only the write; perf_counter is the monotonic, high-resolution
    # clock intended for measuring short durations.
    start_time = time.perf_counter()

    with fiona.open(output_path, 'w', driver='ESRI Shapefile', schema=schema, crs=gdf.crs) as output:
        for _index, row in simulated_gdf.iterrows():
            point = Point(row['geometry'].x, row['geometry'].y)
            output.write({'geometry': mapping(point), 'properties': {random_attribute: row[random_attribute]}})

    elapsed_time = time.perf_counter() - start_time

    # Accumulate the elapsed time for each iteration
    total_elapsed_time += elapsed_time

# Calculate the average time taken over all iterations
average_elapsed_time = total_elapsed_time / simulation_iterations

# Display the average writing time
print(f"Average Writing Time with Random Attribute Changes: {average_elapsed_time} seconds")


Average Writing Time with Random Attribute Changes: 0.04359104943275452 seconds


**Random Attribute Changes using GeoPackage**

In [184]:
# Converting GeoJSON to GeoPackage

# Paths to your GeoJSON and GeoPackage files
geojson_path = 'Simulation1geoj.geojson'
geopackage_path = 'Simulation 1/new_fileP.gpkg'

# No other cell creates the output folder; without it opening the
# GeoPackage for writing fails on a fresh checkout.
os.makedirs(os.path.dirname(geopackage_path), exist_ok=True)

# Open the GeoJSON file for reading
with fiona.open(geojson_path, 'r') as src:
    # Reuse the source schema (including geometry type) for the output
    schema = src.schema
    # Open the GeoPackage file for writing, specifying the schema
    with fiona.open(geopackage_path, 'w', driver='GPKG', crs=src.crs, schema=schema) as dst:
        # Copy every feature across in a single call
        dst.writerecords(src)

In [188]:
import os
import time

import numpy as np

# Benchmark: average time to write the plot centroids to a GeoPackage after
# changing one randomly chosen attribute value per iteration.

# Paths to your GeoPackage files
geopackage_input_path = 'Simulation 1/new_fileP.gpkg'
geopackage_output_path = 'Simulation 1/new_output.gpkg'

# Read the GeoPackage file into a GeoDataFrame
gdf = gpd.read_file(geopackage_input_path)

# Define the number of Monte Carlo simulation iterations
simulation_iterations = 1000

# NOTE(review): presumably not an existing column of this layer, in which
# case the assignment in the loop adds a new column -- confirm.
random_attribute = 'SJER_plot_centroids'

# Make sure the output folder exists before the first write
os.makedirs(os.path.dirname(geopackage_output_path), exist_ok=True)

# Perform Monte Carlo simulation for writing with random attribute changes
total_elapsed_time = 0.0

for _ in range(simulation_iterations):
    # Create a copy of the original GeoDataFrame for simulation
    simulated_gdf = gdf.copy()

    # Simulate a random attribute change
    random_row_index = np.random.choice(simulated_gdf.index)
    simulated_gdf.loc[random_row_index, random_attribute] = np.random.uniform(0, 100)

    # Time only the write; perf_counter is the monotonic, high-resolution
    # clock intended for measuring short durations.
    start_time = time.perf_counter()
    simulated_gdf.to_file(geopackage_output_path, driver='GPKG')
    elapsed_time = time.perf_counter() - start_time

    # Accumulate the elapsed time for each iteration
    total_elapsed_time += elapsed_time

# Calculate the average time taken over all iterations
average_elapsed_time = total_elapsed_time / simulation_iterations

# Display the average writing time
print(f"Average Writing Time with Random Attribute Changes: {average_elapsed_time} seconds")


Average Writing Time with Random Attribute Changes: 0.20252396273612977 seconds


**Buffering using Shapefile**

In [196]:
# Benchmark: average time to buffer every county geometry by a random
# distance, with the data loaded from a Shapefile.
# NOTE(review): the file is read once, outside the timed region, so only the
# in-memory buffer call is measured.

# Path to your shapefile
shapefile_path = 'Decision Analytics/california/CA_Counties/CA_Counties_TIGER2016.shp'

# Read the shapefile into a GeoDataFrame
gdf = gpd.read_file(shapefile_path)

# Define the number of Monte Carlo simulation iterations
simulation_iterations = 1000

# Perform Monte Carlo simulation for buffering with random distances
total_elapsed_time = 0.0

for _ in range(simulation_iterations):
    # Create a copy of the original GeoDataFrame for simulation
    simulated_gdf = gdf.copy()

    # One random buffer distance per feature.
    # NOTE(review): distances are in the layer's coordinate units -- confirm
    # the CRS is projected so 50-200 is a meaningful range.
    simulated_gdf['buffer_distance'] = np.random.uniform(50, 200, len(simulated_gdf))

    # Time only the buffering; perf_counter is the monotonic, high-resolution
    # clock intended for measuring durations.
    start_time = time.perf_counter()
    simulated_gdf['buffered_geometry'] = simulated_gdf.apply(lambda row: row['geometry'].buffer(row['buffer_distance']), axis=1)
    elapsed_time = time.perf_counter() - start_time

    # Accumulate the elapsed time for each iteration
    total_elapsed_time += elapsed_time

# Calculate the average time taken over all iterations
average_elapsed_time = total_elapsed_time / simulation_iterations

# Display the average buffering time
print(f"Average Buffering Time with Random Distances: {average_elapsed_time} seconds")


Average Buffering Time with Random Distances: 0.37639755868911745 seconds


**Buffering using GeoJSON**

In [199]:
# Re-export the California counties shapefile as a GeoJSON file

# Source shapefile and destination GeoJSON
shapefile_path = 'Decision Analytics/california/CA_Counties/CA_Counties_TIGER2016.shp'
geojson_output_path = 'Simulation2geoj.geojson'

# Load the shapefile, then write the same features back out as GeoJSON
gdf = gpd.read_file(shapefile_path)
gdf.to_file(geojson_output_path, driver='GeoJSON')

In [203]:

# Benchmark: average time to buffer every county geometry by a random
# distance, with the data loaded from GeoJSON.
# NOTE(review): the file is read once, outside the timed region, so only the
# in-memory buffer call is measured.
geojson_path = 'Simulation2geoj.geojson'

# Read the GeoJSON file into a GeoDataFrame
gdf = gpd.read_file(geojson_path)

# Define the number of Monte Carlo simulation iterations
simulation_iterations = 1000

# Perform Monte Carlo simulation for buffering with random distances
total_elapsed_time = 0.0

for _ in range(simulation_iterations):
    # Create a copy of the original GeoDataFrame for simulation
    simulated_gdf = gdf.copy()

    # One random buffer distance per feature (in the layer's coordinate units)
    simulated_gdf['buffer_distance'] = np.random.uniform(50, 200, len(simulated_gdf))

    # Time only the buffering; perf_counter is the monotonic, high-resolution
    # clock intended for measuring durations.
    start_time = time.perf_counter()
    simulated_gdf['buffered_geometry'] = simulated_gdf.apply(lambda row: row['geometry'].buffer(row['buffer_distance']), axis=1)
    elapsed_time = time.perf_counter() - start_time

    # Accumulate the elapsed time for each iteration
    total_elapsed_time += elapsed_time

# Calculate the average time taken over all iterations
average_elapsed_time = total_elapsed_time / simulation_iterations

# Display the average buffering time
print(f"Average Buffering Time with Random Distances: {average_elapsed_time} seconds")


Average Buffering Time with Random Distances: 0.383662682056427 seconds


**Buffering using GeoPackage**

In [206]:
# Converting to GeoPackage

geojson_path = 'Simulation2geoj.geojson'
geopackage_path = 'Simulation 2/new_fileP.gpkg'

# No other cell creates the output folder; without it opening the
# GeoPackage for writing fails on a fresh checkout.
os.makedirs(os.path.dirname(geopackage_path), exist_ok=True)

with fiona.open(geojson_path, 'r') as src:
    # Reuse the source schema (including geometry type) for the output
    schema = src.schema
    # Open the GeoPackage file for writing, specifying the schema
    with fiona.open(geopackage_path, 'w', driver='GPKG', crs=src.crs, schema=schema) as dst:
        # Copy every feature across in a single call
        dst.writerecords(src)

In [208]:

# Benchmark: average time to buffer every county geometry by a random
# distance, with the data loaded from a GeoPackage.
# NOTE(review): the file is read once, outside the timed region, so only the
# in-memory buffer call is measured.
geopackage_path = 'Simulation 2/new_fileP.gpkg'
gdf = gpd.read_file(geopackage_path)
simulation_iterations = 1000

# Perform Monte Carlo simulation for buffering with random distances
total_elapsed_time = 0.0

for _ in range(simulation_iterations):
    # Create a copy of the original GeoDataFrame for simulation
    simulated_gdf = gdf.copy()

    # One random buffer distance per feature (in the layer's coordinate units)
    simulated_gdf['buffer_distance'] = np.random.uniform(50, 200, len(simulated_gdf))

    # Time only the buffering; perf_counter is the monotonic, high-resolution
    # clock intended for measuring durations.
    start_time = time.perf_counter()
    simulated_gdf['buffered_geometry'] = simulated_gdf.apply(lambda row: row['geometry'].buffer(row['buffer_distance']), axis=1)
    elapsed_time = time.perf_counter() - start_time

    # Accumulate the elapsed time for each iteration
    total_elapsed_time += elapsed_time

# Calculate the average time taken over all iterations
average_elapsed_time = total_elapsed_time / simulation_iterations

# Display the average buffering time
print(f"Average Buffering Time with Random Distances: {average_elapsed_time} seconds")


Average Buffering Time with Random Distances: 0.3801724536418915 seconds


**Simplifying using Shapefile**

In [215]:
# Benchmark: average time to simplify every geometry with a random
# tolerance, with the data loaded from a Shapefile.
shapefile_path = 'Decision Analytics/california/neon-sjer-site/vector_data/SJER_crop.shp'
gdf = gpd.read_file(shapefile_path)
simulation_iterations = 1000

# Perform Monte Carlo simulation for geometry simplification
total_elapsed_time = 0.0

for _ in range(simulation_iterations):
    # Create a copy of the original GeoDataFrame for simulation
    simulated_gdf = gdf.copy()

    # One random simplification tolerance per feature
    simulated_gdf['simplify_tolerance'] = np.random.uniform(0.001, 0.1, len(simulated_gdf))

    # Time only the simplification. Use perf_counter rather than time.time()
    # because the measured durations are well under a millisecond.
    start_time = time.perf_counter()
    simulated_gdf['simplified_geometry'] = simulated_gdf.apply(lambda row: row['geometry'].simplify(row['simplify_tolerance']), axis=1)
    elapsed_time = time.perf_counter() - start_time

    # Accumulate the elapsed time for each iteration
    total_elapsed_time += elapsed_time

# Calculate the average time taken over all iterations
average_elapsed_time = total_elapsed_time / simulation_iterations

# Display the average simplification time
print(f"Average Simplification Time with Random Tolerances for Shapefiles: {average_elapsed_time} seconds")


Average Simplification Time with Random Tolerances for Shapefiles: 0.0007437136173248291 seconds


**Simplifying using GeoJSON**

In [220]:
# Re-export the SJER crop shapefile as a GeoJSON file

# Source shapefile and destination GeoJSON
shapefile_path = 'Decision Analytics/california/neon-sjer-site/vector_data/SJER_crop.shp'
geojson_output_path = 'Simulation3geoj.geojson'

# Load the shapefile, then write the same features back out as GeoJSON
gdf = gpd.read_file(shapefile_path)
gdf.to_file(geojson_output_path, driver='GeoJSON')

In [222]:
# Benchmark: average time to simplify every geometry with a random
# tolerance, with the data loaded from GeoJSON.
geojson_path = 'Simulation3geoj.geojson'
gdf = gpd.read_file(geojson_path)
simulation_iterations = 1000
total_elapsed_time = 0.0

for _ in range(simulation_iterations):
    # Create a copy of the original GeoDataFrame for simulation
    simulated_gdf = gdf.copy()

    # One random simplification tolerance per feature
    simulated_gdf['simplify_tolerance'] = np.random.uniform(0.001, 0.1, len(simulated_gdf))

    # Time only the simplification. Use perf_counter rather than time.time()
    # because the measured durations are well under a millisecond.
    start_time = time.perf_counter()
    simulated_gdf['simplified_geometry'] = simulated_gdf.apply(lambda row: row['geometry'].simplify(row['simplify_tolerance']), axis=1)
    elapsed_time = time.perf_counter() - start_time

    # Accumulate the elapsed time for each iteration
    total_elapsed_time += elapsed_time

# Calculate the average time taken over all iterations
average_elapsed_time = total_elapsed_time / simulation_iterations

# Display the average simplification time
print(f"Average Simplification Time with Random Tolerances using GeoJSON: {average_elapsed_time} seconds")


Average Simplification Time with Random Tolerances using GeoJSON: 0.0007203111648559571 seconds


**Simplifying using GeoPackage**

In [227]:
# Converting to GeoPackage

geojson_path = 'Simulation3geoj.geojson'
geopackage_path = 'Simulation 3/new_fileP.gpkg'

# No other cell creates the output folder; without it opening the
# GeoPackage for writing fails on a fresh checkout.
os.makedirs(os.path.dirname(geopackage_path), exist_ok=True)

with fiona.open(geojson_path, 'r') as src:
    # Reuse the source schema (including geometry type) for the output
    schema = src.schema
    # Open the GeoPackage file for writing, specifying the schema
    with fiona.open(geopackage_path, 'w', driver='GPKG', crs=src.crs, schema=schema) as dst:
        # Copy every feature across in a single call
        dst.writerecords(src)

In [231]:
# Benchmark: average time to simplify every geometry with a random
# tolerance, with the data loaded from a GeoPackage.

# Path to your GeoPackage file
geopackage_path = 'Simulation 3/new_fileP.gpkg'
gdf = gpd.read_file(geopackage_path)
simulation_iterations = 1000
total_elapsed_time = 0.0

for _ in range(simulation_iterations):
    # Create a copy of the original GeoDataFrame for simulation
    simulated_gdf = gdf.copy()

    # One random simplification tolerance per feature
    simulated_gdf['simplify_tolerance'] = np.random.uniform(0.001, 0.1, len(simulated_gdf))

    # Time only the simplification. Use perf_counter rather than time.time()
    # because the measured durations are well under a millisecond.
    start_time = time.perf_counter()
    simulated_gdf['simplified_geometry'] = simulated_gdf.apply(lambda row: row['geometry'].simplify(row['simplify_tolerance']), axis=1)
    elapsed_time = time.perf_counter() - start_time

    # Accumulate the elapsed time for each iteration
    total_elapsed_time += elapsed_time

# Calculate the average time taken over all iterations
average_elapsed_time = total_elapsed_time / simulation_iterations

# Display the average simplification time
print(f"Average Simplification Time with Random Tolerances: {average_elapsed_time} seconds")


Average Simplification Time with Random Tolerances: 0.0007249534130096436 seconds


**Spatial Indexing with Shapefile**

In [235]:
# Benchmark: bounding-box query through the spatial index versus the `.cx`
# coordinate indexer, with the data loaded from a Shapefile.
shapefile_path = 'Decision Analytics/california/neon-sjer-site/vector_data/SJER_crop2.shp'
gdf = gpd.read_file(shapefile_path)

# Build the spatial index once, before any timing starts
gdf_spatial_index = gdf.sindex

simulation_iterations = 1000

# The layer extent does not change between iterations, so compute it once
layer_bounds = gdf.total_bounds

total_elapsed_time_with_index = 0.0
total_elapsed_time_without_index = 0.0

for _ in range(simulation_iterations):
    # Jitter each of the four extent values by up to one unit to get a query box
    bounding_box = layer_bounds + np.random.uniform(-1, 1, 4)

    # Query through the spatial index. Use perf_counter rather than
    # time.time() because each query takes about a millisecond.
    start_time_with_index = time.perf_counter()
    result_with_index = gdf.iloc[list(gdf_spatial_index.intersection(bounding_box))]
    elapsed_time_with_index = time.perf_counter() - start_time_with_index
    total_elapsed_time_with_index += elapsed_time_with_index

    # Same query box through the `.cx` coordinate indexer
    start_time_without_index = time.perf_counter()
    result_without_index = gdf.cx[bounding_box[0]:bounding_box[2], bounding_box[1]:bounding_box[3]]
    elapsed_time_without_index = time.perf_counter() - start_time_without_index
    total_elapsed_time_without_index += elapsed_time_without_index

# Calculate the average time taken over all iterations for both cases
average_elapsed_time_with_index = total_elapsed_time_with_index / simulation_iterations
average_elapsed_time_without_index = total_elapsed_time_without_index / simulation_iterations

# Display the average spatial query times
print(f"Average Spatial Query Time with Spatial Indexing: {average_elapsed_time_with_index} seconds")
print(f"Average Spatial Query Time without Spatial Indexing: {average_elapsed_time_without_index} seconds")


Average Spatial Query Time with Spatial Indexing: 0.0009335739612579346 seconds
Average Spatial Query Time without Spatial Indexing: 0.0010215349197387695 seconds


**Spatial Indexing with GeoJSON**

In [238]:
# Re-export the SJER_crop2 shapefile as a GeoJSON file

# Source shapefile and destination GeoJSON
shapefile_path = 'Decision Analytics/california/neon-sjer-site/vector_data/SJER_crop2.shp'
geojson_output_path = 'Simulation4geoj.geojson'

# Load the shapefile, then write the same features back out as GeoJSON
gdf = gpd.read_file(shapefile_path)
gdf.to_file(geojson_output_path, driver='GeoJSON')

In [242]:
# Benchmark: bounding-box query through the spatial index versus the `.cx`
# coordinate indexer, with the data loaded from GeoJSON.
geojson_path = 'Simulation4geoj.geojson'
gdf = gpd.read_file(geojson_path)

# Build the index from the frame loaded in THIS cell. Previously the loop
# reused `gdf_spatial_index` left over from an earlier cell, so it queried an
# index belonging to a different GeoDataFrame (and failed on a fresh kernel
# if that cell had not been run).
gdf_spatial_index = gdf.sindex

simulation_iterations = 1000

# The layer extent does not change between iterations, so compute it once
layer_bounds = gdf.total_bounds

total_elapsed_time_with_index = 0.0
total_elapsed_time_without_index = 0.0

for _ in range(simulation_iterations):
    # Jitter each of the four extent values by up to one unit to get a query box
    bounding_box = layer_bounds + np.random.uniform(-1, 1, 4)

    # Query through the spatial index. Use perf_counter rather than
    # time.time() because each query takes about a millisecond.
    start_time_with_index = time.perf_counter()
    result_with_index = gdf.iloc[list(gdf_spatial_index.intersection(bounding_box))]
    elapsed_time_with_index = time.perf_counter() - start_time_with_index
    total_elapsed_time_with_index += elapsed_time_with_index

    # Same query box through the `.cx` coordinate indexer
    start_time_without_index = time.perf_counter()
    result_without_index = gdf.cx[bounding_box[0]:bounding_box[2], bounding_box[1]:bounding_box[3]]
    elapsed_time_without_index = time.perf_counter() - start_time_without_index
    total_elapsed_time_without_index += elapsed_time_without_index

# Calculate the average time taken over all iterations for both cases
average_elapsed_time_with_index = total_elapsed_time_with_index / simulation_iterations
average_elapsed_time_without_index = total_elapsed_time_without_index / simulation_iterations

# Display the average spatial query times
print(f"Average Spatial Query Time with Spatial Indexing: {average_elapsed_time_with_index} seconds")
print(f"Average Spatial Query Time without Spatial Indexing: {average_elapsed_time_without_index} seconds")


Average Spatial Query Time with Spatial Indexing: 0.0009503364562988281 seconds
Average Spatial Query Time without Spatial Indexing: 0.0010596432685852052 seconds


**Spatial Indexing with GeoPackage**

In [240]:
# Converting to GeoPackage

geojson_path = 'Simulation4geoj.geojson'
geopackage_path = 'Simulation 4/new_fileP.gpkg'

# No other cell creates the output folder; without it opening the
# GeoPackage for writing fails on a fresh checkout.
os.makedirs(os.path.dirname(geopackage_path), exist_ok=True)

with fiona.open(geojson_path, 'r') as src:
    # Reuse the source schema (including geometry type) for the output
    schema = src.schema
    # Open the GeoPackage file for writing, specifying the schema
    with fiona.open(geopackage_path, 'w', driver='GPKG', crs=src.crs, schema=schema) as dst:
        # Copy every feature across in a single call
        dst.writerecords(src)

In [244]:
# Benchmark: bounding-box query through the spatial index versus the `.cx`
# coordinate indexer, with the data loaded from a GeoPackage.
geopackage_path = 'Simulation 4/new_fileP.gpkg'
gdf = gpd.read_file(geopackage_path)

# Build the index from the frame loaded in THIS cell. Previously the loop
# reused `gdf_spatial_index` left over from an earlier cell, so it queried an
# index belonging to a different GeoDataFrame (and failed on a fresh kernel
# if that cell had not been run).
gdf_spatial_index = gdf.sindex

simulation_iterations = 1000

# The layer extent does not change between iterations, so compute it once
layer_bounds = gdf.total_bounds

total_elapsed_time_with_index = 0.0
total_elapsed_time_without_index = 0.0

for _ in range(simulation_iterations):
    # Jitter each of the four extent values by up to one unit to get a query box
    bounding_box = layer_bounds + np.random.uniform(-1, 1, 4)

    # Query through the spatial index. Use perf_counter rather than
    # time.time() because each query takes about a millisecond.
    start_time_with_index = time.perf_counter()
    result_with_index = gdf.iloc[list(gdf_spatial_index.intersection(bounding_box))]
    elapsed_time_with_index = time.perf_counter() - start_time_with_index
    total_elapsed_time_with_index += elapsed_time_with_index

    # Same query box through the `.cx` coordinate indexer
    start_time_without_index = time.perf_counter()
    result_without_index = gdf.cx[bounding_box[0]:bounding_box[2], bounding_box[1]:bounding_box[3]]
    elapsed_time_without_index = time.perf_counter() - start_time_without_index
    total_elapsed_time_without_index += elapsed_time_without_index

# Calculate the average time taken over all iterations for both cases
average_elapsed_time_with_index = total_elapsed_time_with_index / simulation_iterations
average_elapsed_time_without_index = total_elapsed_time_without_index / simulation_iterations

# Display the average spatial query times
print(f"Average Spatial Query Time with Spatial Indexing: {average_elapsed_time_with_index} seconds")
print(f"Average Spatial Query Time without Spatial Indexing: {average_elapsed_time_without_index} seconds")


Average Spatial Query Time with Spatial Indexing: 0.0009242053031921387 seconds
Average Spatial Query Time without Spatial Indexing: 0.0010137207508087158 seconds
