In [1]:
import os, requests, time, json, glob
from io import StringIO
from datetime import date, timedelta 
import matplotlib.pyplot as plt

import urllib.parse
from urllib.parse import urlparse

import pandas as pd
from pandas import json_normalize
import geopandas as gpd

import psycopg2
from sqlalchemy import create_engine, Column, Integer, Float, String, DateTime

from geoalchemy2 import Geometry, WKTElement

import warnings
warnings.filterwarnings("ignore")

### Set the instance PostgreSQL to Neon.Tech

In [2]:
connection_string = f''

url = urlparse(connection_string)

db_host = url.hostname
db_port = url.port
db_name = url.path[1:]
db_user = url.username
db_password = url.password

In [4]:
def connect_to_db():
    connection = None
    try:
        connection = psycopg2.connect(
            host=db_host,
            port=db_port,
            dbname=db_name,
            user=db_user,
            password=db_password
        )
        print("Connected to the PostgreSQL instance!")
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error while connecting to PostgreSQL", error)

    return connection

In [5]:
def execute_query(query):
    connection = connect_to_db()

    if connection is not None:
        try:
            cursor = connection.cursor()
            cursor.execute(query)
            result = cursor.fetchall()
            print("Query result:", result)
        except (Exception, psycopg2.DatabaseError) as error:
            print("Error while executing query", error)
        finally:
            if connection is not None:
                connection.close()
                print("Connection closed.")

In [6]:
connection = connect_to_db()

Connected to the PostgreSQL instance!


### Our connection to the database has been made

In [28]:
# Example query to get all rows from a table named "your_table_name"
example_query = "SELECT * FROM geodata;"
execute_query(example_query)

Connected to the PostgreSQL instance!
Error while executing query relation "geodata" does not exist
LINE 1: SELECT * FROM geodata;
                      ^

Connection closed.


### Configure the PostgreSQL instance to handle PostGIS + other extensions

In [8]:
def enable_postgis_extensions(connection):
    try:
        cursor = connection.cursor()
        cursor.execute("CREATE EXTENSION IF NOT EXISTS postgis;")
        cursor.execute("CREATE EXTENSION IF NOT EXISTS postgis_topology;")
        cursor.execute("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;")
        cursor.execute("CREATE EXTENSION IF NOT EXISTS postgis_tiger_geocoder;")
        connection.commit()
        print("PostGIS-related extensions enabled.")
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error while enabling PostGIS-related extensions", error)

connection = connect_to_db()
if connection is not None:
    enable_postgis_extensions(connection)
    connection.close()


Connected to the PostgreSQL instance!
PostGIS-related extensions enabled.


### Write geoDataframe to PostgreSQL

In [30]:
entity_name = "tornado_tracks"

# Read your GeoDataFrame from a file or create it programmatically
gdf = gpd.read_file(f"data/{entity_name}.shp")

In [7]:
gdf = gpd.read_file(f"data/tornado_stats_final.csv")

ValueError: GeoDataFrame does not support multiple columns using the geometry column name 'geometry'.

In [None]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 67558 entries, 0 to 67557
Data columns (total 27 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   OBJECTID    67558 non-null  float64 
 1   om          67558 non-null  float64 
 2   yr          67558 non-null  float64 
 3   mo          67558 non-null  float64 
 4   dy          67558 non-null  float64 
 5   date        67558 non-null  object  
 6   time        67558 non-null  object  
 7   tz          67558 non-null  float64 
 8   st          67558 non-null  object  
 9   stf         67558 non-null  float64 
 10  stn         67558 non-null  float64 
 11  mag         67558 non-null  float64 
 12  inj         67558 non-null  float64 
 13  fat         67558 non-null  float64 
 14  loss        67558 non-null  float64 
 15  closs       67558 non-null  float64 
 16  slat        67558 non-null  float64 
 17  slon        67558 non-null  float64 
 18  elat        67558 non-null  float64 
 

In [16]:
cursor = connection.cursor()

query = '''
SELECT *
FROM tornado_tracks
'''

cursor.execute(query)
data = cursor.fetchall()

# Get the column names from the cursor description
columns = [desc[0] for desc in cursor.description]

# Create a DataFrame from the fetched data and column names
df = pd.DataFrame(data, columns=columns)

# Get the number of rows and columns in the DataFrame
print(f"The DataFrame has {df.shape[0]} rows and {df.shape[1]} columns")

# Calculate the mean value of the 'length' column
print(f"The mean length of tornado tracks is {df['len'].mean()} miles")

# Show the first 10 rows of the DataFrame
df.head(10)


The DataFrame has 66244 rows and 27 columns
The mean length of tornado tracks is 3.4689402813849433 miles


Unnamed: 0,FID,om,yr,mo,dy,date,time,tz,st,stf,...,slon,elat,elon,len,wid,fc,Shape__Len,Month_Calc,Date_Calc,geometry
0,1.0,134.0,1951.0,6.0,9.0,1951-06-09,05:30:00,3.0,MS,28.0,...,-90.72,32.9001,-90.7199,2.0,10.0,1.0,17.311948,5.0,-585864000000.0,0102000020E610000002000000AE47E17A14AE56C03333...
1,2.0,135.0,1951.0,6.0,9.0,1951-06-09,17:00:00,3.0,NC,37.0,...,-78.33,36.03,-78.28,3.8,30.0,0.0,6930.119821,5.0,-585864000000.0,0102000020E61000000200000085EB51B81E9553C00000...
2,3.0,189.0,1951.0,7.0,13.0,1951-07-13,20:00:00,3.0,NE,31.0,...,-102.58,41.2301,-102.5799,0.1,10.0,0.0,18.520593,6.0,-582926400000.0,0102000020E61000000200000085EB51B81EA559C03D0A...
3,4.0,190.0,1951.0,7.0,15.0,1951-07-15,16:20:00,3.0,OK,40.0,...,-94.8,34.8001,-94.7999,0.1,100.0,0.0,17.541394,6.0,-582753600000.0,0102000020E6100000020000003333333333B357C06666...
4,5.0,191.0,1951.0,7.0,15.0,1951-07-15,16:20:00,3.0,OK,40.0,...,-94.8,34.8001,-94.7999,0.1,100.0,0.0,17.541394,6.0,-582753600000.0,0102000020E6100000020000003333333333B357C06666...
5,6.0,192.0,1951.0,7.0,15.0,1951-07-15,16:20:00,3.0,OK,40.0,...,-94.8,34.8001,-94.7999,0.1,100.0,0.0,17.541394,6.0,-582753600000.0,0102000020E6100000020000003333333333B357C06666...
6,7.0,193.0,1951.0,7.0,17.0,1951-07-17,17:45:00,3.0,IA,19.0,...,-95.87,43.1801,-95.8699,0.1,10.0,0.0,18.89354,6.0,-582580800000.0,0102000020E61000000200000048E17A14AEF757C0D7A3...
7,8.0,194.0,1951.0,7.0,20.0,1951-07-20,18:15:00,3.0,SD,46.0,...,-96.08,44.9001,-96.0799,1.0,10.0,0.0,19.258743,6.0,-582321600000.0,0102000020E61000000200000085EB51B81E0558C03333...
8,9.0,195.0,1951.0,7.0,20.0,1951-07-20,21:00:00,3.0,MN,27.0,...,-93.5,44.88,-93.27,11.6,10.0,0.0,26782.339413,6.0,-582321600000.0,0102000020E61000000200000000000000006057C0D7A3...
9,10.0,196.0,1951.0,7.0,21.0,1951-07-21,11:00:00,3.0,PA,42.0,...,-76.47,41.8001,-76.4699,0.1,10.0,0.0,18.625395,6.0,-582235200000.0,0102000020E610000002000000AE47E17A141E53C06666...


In [15]:
# define SQL query
query = "SELECT *,  ST_AsText(geometry) AS wkt_geom FROM tornado_tracks;"

# read data into GeoDataFrame
gdf = gpd.read_postgis(query, connection, geom_col="wkt_geom")

# convert wkt_geom column from string to geometry type
gdf["wkt_geom"] = gdf["wkt_geom"].apply(lambda x: shapely.wkt.loads(x))





WKBReadingError: Could not create geometry because of errors while reading input.

In [8]:
connection = connect_to_db()
cursor = connection.cursor()

query = '''
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public'
'''

cursor.execute(query)
tables = cursor.fetchall()

for table in tables:
    print(table[0])


Connected to the PostgreSQL instance!
spatial_ref_sys
geography_columns
geometry_columns
myspatialtable
table_name
tornado_tracks
geodata
"Table_Name"
Table_Name


### Function: Fix Invalid Geometries (pre-processing)

In [17]:
import geopandas as gpd

# Test for invalid geometries in a GeoDataFrame
invalid_geom = gdf[~gdf.is_valid]
if not invalid_geom.empty:
    print('Invalid geometries found')

Invalid geometries found


In [18]:
def fix_invalid_geometries(gdf, buffer_distance=0.0001):
    # Find invalid geometries
    invalid_geom = gdf[~gdf.is_valid]
    if invalid_geom.empty:
        return gdf

    # Fix invalid geometries by buffering with a small distance
    gdf.loc[~gdf.is_valid, 'geometry'] = invalid_geom.buffer(buffer_distance)

    # Re-test for invalid geometries and recursively fix them if necessary
    return fix_invalid_geometries(gdf)

fixed_gdf = fix_invalid_geometries(gdf)


KeyboardInterrupt: 

In [20]:
import geopandas as gpd
import dask.dataframe as dd
import dask_geopandas as dask_gpd
from shapely.geometry import shape, mapping
#from shapely.ops import make_valid


ImportError: cannot import name 'make_valid' from 'shapely.ops' (/Users/ /miniforge3/envs/awesome/lib/python3.9/site-packages/shapely/ops.py)

In [25]:
def fix_invalid_geometry(geometry):
    if geometry.is_valid:
        return geometry
    else:
        return geometry.buffer(0.0001)

In [26]:
# Read the data with GeoPandas
# Convert GeoPandas GeoDataFrame to Dask-GeoPandas GeoDataFrame
dgdf = dask_gpd.from_geopandas(gdf, npartitions=4)

In [27]:
# Apply the fix_invalid_geometry function in a distributed way using Dask
dgdf['geometry'] = dgdf['geometry'].map_partitions(lambda part: part.apply(fix_invalid_geometry), meta=('geometry', 'object'))

In [31]:
dgdf['geometry']

Dask GeoSeries Structure:
npartitions=4
0        geometry
16561         ...
33122         ...
49683         ...
66243         ...
Name: geometry, dtype: geometry
Dask Name: getitem, 5 graph layers

In [28]:
# Compute the results
fixed_geometries_gdf = dgdf.compute()

# Save the fixed geometries to a new file
#fixed_geometries_gdf.to_file("fixed_geometries.geojson", driver="GeoJSON")

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/ /miniforge3/envs/awesome/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3369, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/n4/p2lmf2l94x3gfqhs72xx2zwr0000gn/T/ipykernel_53149/1100382602.py", line 2, in <cell line: 2>
    fixed_geometries_gdf = dgdf.compute()
  File "/Users/ /miniforge3/envs/awesome/lib/python3.9/site-packages/dask/base.py", line 314, in compute
  File "/Users/ /miniforge3/envs/awesome/lib/python3.9/site-packages/dask/base.py", line 599, in compute
  File "/Users/ /miniforge3/envs/awesome/lib/python3.9/site-packages/dask/threaded.py", line 89, in get
    **kwargs,
  File "/Users/ /miniforge3/envs/awesome/lib/python3.9/site-packages/dask/local.py", line 511, in get_async
    finish_task(dsk, key, state, results, keyorder.get)
  File "/Users/ /miniforge3/envs/awesome/lib/python3.9/site-packages/dask/local.py", line 319, in reraise
    def identity(x):
  File "