In [7]:
import os, requests, time, json, glob
from io import StringIO
from datetime import date, timedelta 
import matplotlib.pyplot as plt

import urllib.parse
from urllib.parse import urlparse

import pandas as pd
from pandas import json_normalize
import geopandas as gpd

import psycopg2
from sqlalchemy import create_engine, inspect, Column, Integer, Float, String, DateTime
from postgis import LineString
from postgis.psycopg import register
import psycopg2
from geoalchemy2 import Geometry, WKTElement

import warnings
warnings.filterwarnings("ignore")

### Configure the PostgreSQL instance on Neon.tech

#### Load secrets from environment variables

In [3]:
# Read the database credentials from environment variables so they are not
# stored in the notebook. A missing variable raises KeyError on its own line.
database_user = os.environ["DATABASE_USER"]
database_password = os.environ["DATABASE_PASSWORD"]
database_host = os.environ["DATABASE_HOST"]
database_name = os.environ["DATABASE_NAME"]

In [4]:
# Assemble a postgresql:// URL from the credentials read from the environment.
# NOTE(review): the values are interpolated verbatim. If the password can contain
# URL-reserved characters (e.g. '@', ':', '/'), it likely needs percent-encoding
# before being placed in the URL -- confirm against the actual secret.
connection_string = f"postgresql://{database_user}:{database_password}@{database_host}/{database_name}"

In [6]:
# Build the SQLAlchemy engine for the PostgreSQL instance.
# NOTE(review): per the SQLAlchemy docs, create_engine() is lazy -- no connection
# is opened here, so bad credentials only surface on first use.
engine = create_engine(connection_string)

In [14]:
# Re-interpolating the string into itself yields an equal string; this line
# does not change the value of connection_string.
connection_string = f'{connection_string}'

# Split the connection URL into its components.
# Note: the name `url` is rebound to the API endpoint string in a later cell,
# so the db_* values below are the durable result of this cell.
url = urlparse(connection_string)

db_host = url.hostname
# NOTE(review): url.port is None when the URL carries no explicit port -- confirm
# whether DATABASE_HOST includes one.
db_port = url.port
# Drop the first character of the path (the leading '/') to get the database name.
db_name = url.path[1:]
db_user = url.username
db_password = url.password

### Pull from API and write to PostgreSQL using PostGIS formats

In [4]:
# Dataset name: used as the PostgreSQL table name and in the output paths.
entity = "tornado_history"

# Query endpoint of the feature layer. The trailing '?' lets a urlencoded
# query string be appended directly.
url = r'https://services2.arcgis.com/FiaPA4ga0iQKduv3/arcgis/rest/services/Tornado_Tracks_1950_2017_1/FeatureServer/0/query?'

# Base query parameters shared by the count request and every page request.
# 'resultOffset' / 'resultRecordCount' control pagination.
params = {
    'where': '1=1',
    'geometryType': 'esriGeometryPolygon',
    'returnExceededLimitFeatures': 'false',
    'inSR': '4326',
    'units': 'esriSRUnit_Meter',
    'returnGeometry': 'true',
    'outFields': '*',
    'f': 'pgeojson',
    'resultOffset': '0',
    'resultRecordCount': '2000'
}

# Upper bound on how many records the download should request.
max_records = 660000

# Ask the service only for the number of matching records. Work on a copy so
# the shared `params` dict does not pick up 'returnCountOnly'.
count_params = params.copy()
count_params['returnCountOnly'] = 'true'
count_url = url + urllib.parse.urlencode(count_params)
# A timeout keeps the notebook from hanging on an unresponsive service, and
# raise_for_status() turns an HTTP error into a clear exception instead of a
# confusing KeyError / JSON decoding error on the next line.
count_response = requests.get(count_url, timeout=60)
count_response.raise_for_status()
total_records = count_response.json()['properties']['count']

output_directory = f"/tornado/data/{entity}/"

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(output_directory, exist_ok=True)

def paginate_api(entity, max_records):
    """Download the feature layer page by page and store it in PostGIS and a GeoPackage.

    Relies on the module-level ``url``, ``params``, ``total_records`` and
    ``connection_string``. Pages that fail to download or parse are skipped
    (best effort) and reported at the end.

    Args:
        entity: Name of the target PostgreSQL table and of the ``.gpkg`` file.
        max_records: Upper bound on the number of records to request; the
            smaller of this and ``total_records`` is used.

    Returns:
        None. The result is written to the database and to disk.

    Raises:
        ValueError: If no page could be fetched, so there is nothing to write.
    """
    # Derive the page size from the request parameters so the offsets always
    # agree with what the service is actually asked to return.
    page_size = int(params['resultRecordCount'])
    wanted = min(total_records, max_records)
    num_pages = -(-wanted // page_size)  # ceiling division

    geo_dataframes = []
    failed_offsets = []

    for offset in range(0, num_pages * page_size, page_size):
        # Build per-request parameters instead of mutating the shared dict.
        page_params = {**params, 'resultOffset': str(offset)}
        url_final = url + urllib.parse.urlencode(page_params)

        try:
            response = requests.get(url_final, timeout=120)
            response.raise_for_status()
            page = gpd.read_file(response.text)
            geo_dataframes.append(page)
            time.sleep(1)  # be polite to the service between requests
            print(f'Fetched data for offset {offset}')
        except Exception as e:
            failed_offsets.append(offset)
            print(f'Error fetching data for offset {offset}: {e}')

    if not geo_dataframes:
        raise ValueError(
            f'No data could be fetched for {entity!r}; nothing to write '
            f'(failed offsets: {failed_offsets})'
        )
    if failed_offsets:
        # The table is replaced below, so make an incomplete download visible.
        print(f'WARNING: {len(failed_offsets)} page(s) failed and are missing from the result: {failed_offsets}')

    # Take the CRS from a fetched page rather than from a leaked loop variable.
    final_geo_dataframe = gpd.GeoDataFrame(
        pd.concat(geo_dataframes, ignore_index=True),
        crs=geo_dataframes[0].crs,
    )

    # Filter out missing geometries before converting to LineString.
    # .copy() gives an independent frame so the column assignment below does
    # not write into a slice of the unfiltered frame.
    final_geo_dataframe = final_geo_dataframe[final_geo_dataframe["geometry"].notnull()].copy()
    # NOTE(review): x.coords assumes single-part geometries -- confirm the layer
    # contains no multi-part features.
    final_geo_dataframe["geom"] = final_geo_dataframe["geometry"].apply(lambda x: LineString(x.coords))

    # Make sure the GeoPackage directory exists before anything is written.
    gpkg_path = f"/tornado/data/input/{entity}.gpkg"
    os.makedirs(os.path.dirname(gpkg_path), exist_ok=True)

    engine = create_engine(connection_string)
    try:
        # Write the GeoDataFrame to the database using to_postgis function
        final_geo_dataframe.to_postgis(entity, engine, if_exists="replace", index=False)
    finally:
        # Release pooled connections; a new engine is created on every call.
        engine.dispose()
    final_geo_dataframe.to_file(gpkg_path, driver="GPKG")

    print(f'Saved final GeoDataFrame with {len(final_geo_dataframe)} records to PostgreSQL')

if __name__ == '__main__':
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during the long download.
    start_time = time.perf_counter()
    # Pass the configured cap. Passing total_records here made the
    # min(total_records, max_records) inside paginate_api a no-op and left
    # the max_records setting unused.
    paginate_api(entity, max_records)
    print(f'Total time: {time.perf_counter() - start_time:.2f} seconds')


Fetched data for offset 0
Fetched data for offset 2000
Fetched data for offset 4000
Fetched data for offset 6000
Fetched data for offset 8000
Fetched data for offset 10000
Fetched data for offset 12000
Fetched data for offset 14000
Fetched data for offset 16000
Fetched data for offset 18000
Fetched data for offset 20000
Fetched data for offset 22000
Fetched data for offset 24000
Fetched data for offset 26000
Fetched data for offset 28000
Fetched data for offset 30000
Fetched data for offset 32000
Fetched data for offset 34000
Fetched data for offset 36000
Fetched data for offset 38000
Fetched data for offset 40000
Fetched data for offset 42000
Fetched data for offset 44000
Fetched data for offset 46000
Fetched data for offset 48000
Fetched data for offset 50000
Fetched data for offset 52000
Fetched data for offset 54000
Fetched data for offset 56000
Fetched data for offset 58000
Fetched data for offset 60000
Fetched data for offset 62000
Fetched data for offset 64000
Fetched data for o