In [47]:
# import some pkg
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.wkt import loads
from shapely.geometry import Point, Polygon, MultiPolygon
from geoalchemy2 import Geometry, WKTElement
from sqlalchemy import create_engine, text
import psycopg2
import psycopg2.extras
import json
import folium
from branca.colormap import linear
import requests
import time
import tqdm
from shapely.wkb import loads as load_wkb


In [48]:
# some SRID
GAD2020_SRID = 7844
GDA94_SRID = 4283
WGS84_SRID = 4326

In [49]:
# utils

def data_summary(df):
    print(df.info())
    print(df.head())

def shapely_to_WKT(geom, srid):
    if geom.geom_type == 'Polygon':
        geom = MultiPolygon([geom])
    return WKTElement(geom.wkt, srid)

def check_unique(df, key):
    return df[key].nunique() == len(df[key])

# Task 1 Import all datasets, clean if needed

In [50]:
#SA2 raw data
sa2_data = gpd.read_file("SA2_2021_AUST_GDA2020.shp")
data_summary(sa2_data)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 2473 entries, 0 to 2472
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   SA2_CODE21  2473 non-null   object  
 1   SA2_NAME21  2473 non-null   object  
 2   CHG_FLAG21  2473 non-null   object  
 3   CHG_LBL21   2473 non-null   object  
 4   SA3_CODE21  2473 non-null   object  
 5   SA3_NAME21  2473 non-null   object  
 6   SA4_CODE21  2473 non-null   object  
 7   SA4_NAME21  2473 non-null   object  
 8   GCC_CODE21  2473 non-null   object  
 9   GCC_NAME21  2473 non-null   object  
 10  STE_CODE21  2473 non-null   object  
 11  STE_NAME21  2473 non-null   object  
 12  AUS_CODE21  2473 non-null   object  
 13  AUS_NAME21  2473 non-null   object  
 14  AREASQKM21  2454 non-null   float64 
 15  LOCI_URI21  2473 non-null   object  
 16  geometry    2454 non-null   geometry
dtypes: float64(1), geometry(1), object(15)
memory usage: 328.6+ KB
None
  SA2_CODE21  

In [51]:
# cleanning SA2 dropping unnesserary columns
sa2_greater_sydney = sa2_data[sa2_data["GCC_NAME21"] == "Greater Sydney"] # filtering
sa2_greater_sydney = sa2_greater_sydney[['SA2_CODE21', 'SA2_NAME21', 'SA4_CODE21', 'SA4_NAME21', 'AREASQKM21', 'geometry']] # keeping nessary columns
# rename columns
sa2_greater_sydney.rename(columns={
    "SA2_CODE21": "sa2_id", # primary key
    "SA2_NAME21": "sa2_name",
    "SA4_CODE21": "sa4_id",
    "SA4_NAME21": "sa4_name",
    "AREASQKM21": "sa2_area_sqkm",
    'geometry': "geom"
}, inplace= True)
sa2_greater_sydney["sa2_id"] = sa2_greater_sydney["sa2_id"].astype('int64')
sa2_greater_sydney['geom'] = sa2_greater_sydney['geom'].apply(lambda x: shapely_to_WKT(geom=x,srid=GDA94_SRID))  # convert shapely to WKT
sa2_greater_sydney.dropna(inplace=True) # drop NA values
data_summary(sa2_greater_sydney)

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 373 entries, 28 to 641
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   sa2_id         373 non-null    int64  
 1   sa2_name       373 non-null    object 
 2   sa4_id         373 non-null    object 
 3   sa4_name       373 non-null    object 
 4   sa2_area_sqkm  373 non-null    float64
 5   geom           373 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 20.4+ KB
None
       sa2_id                     sa2_name sa4_id       sa4_name  \
28  102011028     Avoca Beach - Copacabana    102  Central Coast   
29  102011029  Box Head - MacMasters Beach    102  Central Coast   
30  102011030              Calga - Kulnura    102  Central Coast   
31  102011031          Erina - Green Point    102  Central Coast   
32  102011032        Gosford - Springfield    102  Central Coast   

    sa2_area_sqkm                                         

In [52]:
# businesses raw data
businesses = pd.read_csv("Businesses.csv")
data_summary(businesses)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12217 entries, 0 to 12216
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   industry_code           12217 non-null  object
 1   industry_name           12217 non-null  object
 2   sa2_code                12217 non-null  int64 
 3   sa2_name                12217 non-null  object
 4   0_to_50k_businesses     12217 non-null  int64 
 5   50k_to_200k_businesses  12217 non-null  int64 
 6   200k_to_2m_businesses   12217 non-null  int64 
 7   2m_to_5m_businesses     12217 non-null  int64 
 8   5m_to_10m_businesses    12217 non-null  int64 
 9   10m_or_more_businesses  12217 non-null  int64 
 10  total_businesses        12217 non-null  int64 
dtypes: int64(8), object(3)
memory usage: 1.0+ MB
None
  industry_code                      industry_name   sa2_code  \
0             A  Agriculture, Forestry and Fishing  101021007   
1             A  Agriculture, 

In [53]:
# cleaning businesses data
businesses = businesses[['industry_name', 'sa2_code', 'total_businesses']]
# rename columns
businesses.rename(columns={
    "sa2_code": "sa2_id", # foreign key
}, inplace= True)
businesses = businesses[businesses['sa2_id'].isin(sa2_greater_sydney['sa2_id'])] #filter down to the ”Greater Sydney” GCC
businesses.dropna(inplace=True)
data_summary(businesses)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7087 entries, 28 to 12215
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   industry_name     7087 non-null   object
 1   sa2_id            7087 non-null   int64 
 2   total_businesses  7087 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 221.5+ KB
None
                        industry_name     sa2_id  total_businesses
28  Agriculture, Forestry and Fishing  102011028                 6
29  Agriculture, Forestry and Fishing  102011029                17
30  Agriculture, Forestry and Fishing  102011030               215
31  Agriculture, Forestry and Fishing  102011031                27
32  Agriculture, Forestry and Fishing  102011032                19


In [54]:
# catchments future raw data
catchments_future = gpd.read_file("catchments/catchments_future.shp")
data_summary(catchments_future)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   USE_ID      30 non-null     object  
 1   CATCH_TYPE  30 non-null     object  
 2   USE_DESC    30 non-null     object  
 3   ADD_DATE    30 non-null     object  
 4   KINDERGART  30 non-null     int64   
 5   YEAR1       30 non-null     int64   
 6   YEAR2       30 non-null     int64   
 7   YEAR3       30 non-null     int64   
 8   YEAR4       30 non-null     int64   
 9   YEAR5       30 non-null     int64   
 10  YEAR6       30 non-null     int64   
 11  YEAR7       30 non-null     int64   
 12  YEAR8       30 non-null     int64   
 13  YEAR9       30 non-null     int64   
 14  YEAR10      30 non-null     int64   
 15  YEAR11      30 non-null     int64   
 16  YEAR12      30 non-null     int64   
 17  geometry    30 non-null     geometry
dtypes: geometry(1), int64(13), object(4)
memory 

In [55]:
# cleaning catchments future raw data
catchments_future = catchments_future[["USE_ID", "CATCH_TYPE", "geometry"]] # drop unused columns
catchments_future.rename(columns={
    "USE_ID": "use_id",
    "CATCH_TYPE": "catch_type",
    "geometry": "geom"
}, inplace=True)

In [56]:
# catchments primary raw data
catchments_primary = gpd.read_file("catchments/catchments_primary.shp")
data_summary(catchments_primary)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1662 entries, 0 to 1661
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   USE_ID      1662 non-null   object  
 1   CATCH_TYPE  1662 non-null   object  
 2   USE_DESC    1662 non-null   object  
 3   ADD_DATE    1335 non-null   object  
 4   KINDERGART  1662 non-null   object  
 5   YEAR1       1662 non-null   object  
 6   YEAR2       1662 non-null   object  
 7   YEAR3       1662 non-null   object  
 8   YEAR4       1662 non-null   object  
 9   YEAR5       1662 non-null   object  
 10  YEAR6       1662 non-null   object  
 11  YEAR7       1662 non-null   object  
 12  YEAR8       1662 non-null   object  
 13  YEAR9       1662 non-null   object  
 14  YEAR10      1662 non-null   object  
 15  YEAR11      1662 non-null   object  
 16  YEAR12      1662 non-null   object  
 17  PRIORITY    4 non-null      object  
 18  geometry    1662 non-null   geometry
dty

In [57]:
# cleaning catchments primary raw data
catchments_primary = catchments_primary[["USE_ID", "CATCH_TYPE", "geometry"]] # drop unused columns
catchments_primary.rename(columns={
    "USE_ID": "use_id",
    "CATCH_TYPE": "catch_type",
    "geometry": "geom"
}, inplace=True)

In [58]:
# catchments secondary raw data
catchments_secondary = gpd.read_file("catchments/catchments_secondary.shp")
data_summary(catchments_secondary)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 436 entries, 0 to 435
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   USE_ID      436 non-null    object  
 1   CATCH_TYPE  436 non-null    object  
 2   USE_DESC    436 non-null    object  
 3   ADD_DATE    372 non-null    object  
 4   KINDERGART  436 non-null    object  
 5   YEAR1       436 non-null    object  
 6   YEAR2       436 non-null    object  
 7   YEAR3       436 non-null    object  
 8   YEAR4       436 non-null    object  
 9   YEAR5       436 non-null    object  
 10  YEAR6       436 non-null    object  
 11  YEAR7       436 non-null    object  
 12  YEAR8       436 non-null    object  
 13  YEAR9       436 non-null    object  
 14  YEAR10      436 non-null    object  
 15  YEAR11      436 non-null    object  
 16  YEAR12      436 non-null    object  
 17  PRIORITY    7 non-null      object  
 18  geometry    436 non-null    geometry
dtype

In [59]:
# cleaning catchments secondary raw data
catchments_secondary = catchments_secondary[["USE_ID", "CATCH_TYPE", "geometry"]] # drop unused columns
catchments_secondary.rename(columns={
    "USE_ID": "use_id",
    "CATCH_TYPE": "catch_type",
    "geometry": "geom"
}, inplace=True)

In [60]:
catchments = pd.concat([catchments_future, catchments_primary, catchments_secondary], axis=0)
data_summary(catchments)
check_unique(catchments, key='use_id')

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 2128 entries, 0 to 435
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   use_id      2128 non-null   object  
 1   catch_type  2128 non-null   object  
 2   geom        2128 non-null   geometry
dtypes: geometry(1), object(2)
memory usage: 66.5+ KB
None
  use_id catch_type                                               geom
0   8416  HIGH_COED  POLYGON ((151.19849 -33.53990, 151.19945 -33.5...
1   8161  HIGH_BOYS  POLYGON ((151.27152 -33.91402, 151.27152 -33.9...
2   8539  HIGH_COED  POLYGON ((151.15292 -33.83939, 151.16144 -33.8...
3   8400  HIGH_COED  POLYGON ((151.17794 -33.69820, 151.17859 -33.6...
4   8555  HIGH_COED  POLYGON ((151.28072 -33.83287, 151.28095 -33.8...


False

In [61]:
catchments = catchments[['catch_type', 'geom']] #drop use_id as it's not unique
catchments['geom'] = catchments['geom'].apply(lambda x: shapely_to_WKT(geom=x,srid=GDA94_SRID))  # convert shapely to WKT
catchments.dropna(inplace=True)
data_summary(catchments)

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 2128 entries, 0 to 435
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   catch_type  2128 non-null   object
 1   geom        2128 non-null   object
dtypes: object(2)
memory usage: 49.9+ KB
None
  catch_type                                               geom
0  HIGH_COED  MULTIPOLYGON (((151.19848917708944 -33.5398987...
1  HIGH_BOYS  MULTIPOLYGON (((151.27151530428182 -33.9140183...
2  HIGH_COED  MULTIPOLYGON (((151.15292370935092 -33.8393921...
3  HIGH_COED  MULTIPOLYGON (((151.17793729938725 -33.6982001...
4  HIGH_COED  MULTIPOLYGON (((151.28072275958445 -33.8328728...


In [62]:
# income raw data
income = pd.read_csv("Income.csv")
data_summary(income)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 642 entries, 0 to 641
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   sa2_code21     642 non-null    int64 
 1   sa2_name       642 non-null    object
 2   earners        642 non-null    object
 3   median_age     642 non-null    object
 4   median_income  642 non-null    object
 5   mean_income    642 non-null    object
dtypes: int64(1), object(5)
memory usage: 30.2+ KB
None
   sa2_code21                         sa2_name earners median_age  \
0   101021007                        Braidwood    2467         51   
1   101021008                          Karabar    5103         42   
2   101021009                       Queanbeyan    7028         39   
3   101021010                Queanbeyan - East    3398         39   
4   101021012  Queanbeyan West - Jerrabomberra    8422         44   

  median_income mean_income  
0         46640       68904  
1         65564      

In [63]:
income = income[["sa2_code21", "median_age", "median_income", 'mean_income']] # drop unused columns
income.rename(columns={
    "sa2_code21": "sa2_id"
}, inplace=True)
income = income[income['sa2_id'].isin(sa2_greater_sydney['sa2_id'])] #filter down to the ”Greater Sydney” GCC
income['median_age'] = pd.to_numeric(income['median_age'], errors='coerce').fillna(-1).astype('int64')
income['median_income'] = pd.to_numeric(income['median_income'], errors='coerce').fillna(-1).astype('int64')
income['mean_income'] = pd.to_numeric(income['mean_income'], errors='coerce').fillna(-1).astype('int64')
income.dropna(inplace=True)

In [64]:
# population raw data
population = pd.read_csv("Population.csv")
data_summary(population)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   sa2_code            373 non-null    int64 
 1   sa2_name            373 non-null    object
 2   0-4_people          373 non-null    int64 
 3   5-9_people          373 non-null    int64 
 4   10-14_people        373 non-null    int64 
 5   15-19_people        373 non-null    int64 
 6   20-24_people        373 non-null    int64 
 7   25-29_people        373 non-null    int64 
 8   30-34_people        373 non-null    int64 
 9   35-39_people        373 non-null    int64 
 10  40-44_people        373 non-null    int64 
 11  45-49_people        373 non-null    int64 
 12  50-54_people        373 non-null    int64 
 13  55-59_people        373 non-null    int64 
 14  60-64_people        373 non-null    int64 
 15  65-69_people        373 non-null    int64 
 16  70-74_people        373 no

In [65]:
population['young_people'] = population[['0-4_people','5-9_people', '10-14_people','15-19_people']].sum(axis=1)
population = population[["sa2_code", "young_people", "total_people"]] # drop unused columns
population.rename(columns={
    "sa2_code": "sa2_id"
}, inplace=True)
population = population[population['sa2_id'].isin(sa2_greater_sydney['sa2_id'])] #filter down to the ”Greater Sydney” GCC
population.dropna(inplace=True)
data_summary(population)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 373 entries, 0 to 372
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   sa2_id        373 non-null    int64
 1   young_people  373 non-null    int64
 2   total_people  373 non-null    int64
dtypes: int64(3)
memory usage: 11.7 KB
None
      sa2_id  young_people  total_people
0  102011028          2121          7530
1  102011029          2471         11052
2  102011030           961          4748
3  102011031          3205         14803
4  102011032          4364         21346


In [66]:
# stops raw data
stops = pd.read_csv("Stops.txt")
data_summary(stops)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114718 entries, 0 to 114717
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   stop_id              114718 non-null  object 
 1   stop_code            60719 non-null   float64
 2   stop_name            114718 non-null  object 
 3   stop_lat             114718 non-null  float64
 4   stop_lon             114718 non-null  float64
 5   location_type        53991 non-null   float64
 6   parent_station       60727 non-null   object 
 7   wheelchair_boarding  114718 non-null  int64  
 8   platform_code        871 non-null     object 
dtypes: float64(4), int64(1), object(4)
memory usage: 7.9+ MB
None
  stop_id  stop_code                             stop_name   stop_lat  \
0  200039   200039.0     Central Station, Eddy Av, Stand A -33.882206   
1  200054   200054.0     Central Station, Eddy Av, Stand D -33.882042   
2  200060        NaN                       

In [67]:
stops['geom'] = gpd.points_from_xy(stops.stop_lon, stops.stop_lat) # convert to shapely object
stops['geom'] = stops['geom'].apply(lambda x: shapely_to_WKT(geom=x, srid=4326)) # convert shapely to WKT
stops = stops[["stop_id", "geom"]] # drop unused columns
stops.dropna(inplace=True)
data_summary(stops)
check_unique(stops, 'stop_id')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114718 entries, 0 to 114717
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   stop_id  114718 non-null  object
 1   geom     114718 non-null  object
dtypes: object(2)
memory usage: 1.8+ MB
None
  stop_id                                        geom
0  200039   POINT (151.20666465471 -33.8822064874687)
1  200054   POINT (151.20699145565 -33.8820421431408)
2  200060  POINT (151.206292455081 -33.8840842535493)
3  201510  POINT (151.198866071817 -33.8916900512711)
4  201646  POINT (151.198881722942 -33.8933293130144)


True

In [68]:
# connecting postgresql
def pgconnect(db_schema="public"):
    host = 'localhost'
    db_user = 'postgres'
    db_pw = "qwertyuiop"
    default_db = "DATA2001"
    try:
        db = create_engine('postgresql+psycopg2://'+db_user+':'+db_pw+'@'+host+'/'+default_db, echo=False)
        conn = db.connect()
        print('Connected successfully.')
    except Exception as e:
        print("Unable to connect to the database.")
        print(e)
        db, conn = None, None
    return db,conn
db, conn = pgconnect()

Connected successfully.


In [69]:
create_tables = [
    """DROP TABLE IF EXISTS SA2 CASCADE;""",
    """CREATE TABLE IF NOT EXISTS SA2 (
        sa2_id INT PRIMARY KEY,
        sa2_name VARCHAR(255),
        sa4_id INT,
        sa4_name VARCHAR(255),
        sa2_area_sqkm NUMERIC(10, 4),
        geom GEOMETRY(MULTIPOLYGON,4283)
    );"""

    """DROP TABLE IF EXISTS businesses CASCADE;""",
    """CREATE TABLE IF NOT EXISTS businesses (
        business_id SERIAL PRIMARY KEY,
        industry_name VARCHAR(255),
        sa2_id INT REFERENCES SA2(sa2_id),
        total_businesses INT
    );""",

    """DROP TABLE IF EXISTS catchments CASCADE;""",
    """CREATE TABLE IF NOT EXISTS catchments (
        catchment_id SERIAL PRIMARY KEY,
        catch_type VARCHAR(32),
        geom GEOMETRY(MULTIPOLYGON,4283)  
    );"""    

    """DROP TABLE IF EXISTS income CASCADE;""",
    """CREATE TABLE IF NOT EXISTS income (
        income_id SERIAL PRIMARY KEY,
        sa2_id INT REFERENCES SA2(sa2_id),    
        median_age INT,          
        median_income INT,   
        mean_income INT      
    );""",    

    """DROP TABLE IF EXISTS population CASCADE;""",
    """CREATE TABLE IF NOT EXISTS population (
        population_id SERIAL PRIMARY KEY,
        sa2_id INT REFERENCES SA2(sa2_id),
        young_people INT,
        total_people INT
    );""",    

    """DROP TABLE IF EXISTS stops CASCADE;""",
    """CREATE TABLE IF NOT EXISTS stops (
        stop_id VARCHAR(32) PRIMARY KEY,
        geom GEOMETRY(POINT,4326)
    );""",
]
for cmd in create_tables:
    with conn.begin():
        try:
            conn.execute(text(cmd))
            print(cmd, "executed")
        except Exception as e:
            conn.rollback()
            print(e)
            break

DROP TABLE IF EXISTS SA2 CASCADE; executed
CREATE TABLE IF NOT EXISTS SA2 (
        sa2_id INT PRIMARY KEY,
        sa2_name VARCHAR(255),
        sa4_id INT,
        sa4_name VARCHAR(255),
        sa2_area_sqkm NUMERIC(10, 4),
        geom GEOMETRY(MULTIPOLYGON,4283)
    );DROP TABLE IF EXISTS businesses CASCADE; executed
CREATE TABLE IF NOT EXISTS businesses (
        business_id SERIAL PRIMARY KEY,
        industry_name VARCHAR(255),
        sa2_id INT REFERENCES SA2(sa2_id),
        total_businesses INT
    ); executed
DROP TABLE IF EXISTS catchments CASCADE; executed
CREATE TABLE IF NOT EXISTS catchments (
        catchment_id SERIAL PRIMARY KEY,
        catch_type VARCHAR(32),
        geom GEOMETRY(MULTIPOLYGON,4283)  
    );DROP TABLE IF EXISTS income CASCADE; executed
CREATE TABLE IF NOT EXISTS income (
        income_id SERIAL PRIMARY KEY,
        sa2_id INT REFERENCES SA2(sa2_id),    
        median_age INT,          
        median_income INT,   
        mean_income INT     

In [70]:
# insert data to table
sa2_greater_sydney.to_sql('sa2', conn, if_exists='append', index=False, dtype={'geom': Geometry('MULTIPOLYGON', GDA94_SRID)})
businesses.to_sql('businesses', conn, if_exists='append', index=False )
catchments.to_sql('catchments', conn, if_exists='append', index=False, dtype={'geom': Geometry('MULTIPOLYGON', GDA94_SRID)})
income.to_sql('income', conn, if_exists='append', index=False)
population.to_sql('population', conn, if_exists='append', index=False)
stops.to_sql('stops', conn, if_exists='append', index=False, dtype={'geom': Geometry('POINT', WGS84_SRID)})

718

# Task 2 Utilise the NSW Points of Interest API to extract information relevant to each SA2 region and form our additional dataset

In [71]:
# a function that returns all points of interests from the API within a specified bounding box of coordinates
def POIinBbox(bbox):
    baseURL = 'https://maps.six.nsw.gov.au/arcgis/rest/services/public/NSW_POI/MapServer/0/query'
    x_min, y_min, x_max, y_max = bbox
    params = {
        'geometry': f'"xmin":{x_min},"ymin":{y_min},"xmax":{x_max},"ymax:{y_max}"',
        'outFields': '*',
        'returnGeometry': 'true',
        'f': 'json'
    }
    response = requests.get(baseURL, params)
    return json.loads(response.text)['features']

In [72]:
# get all SA2 in selected SA4
target_sa4 = "Sydney - North Sydney and Hornsby"
query = text("""
    SELECT sa2_id, geom, ST_XMin(geom) as min_x, ST_YMin(geom) as min_y, ST_XMax(geom) as max_x, ST_YMax(geom) as max_y
    FROM SA2
    WHERE sa4_name = :sa4_name
""")
try:
    r = conn.execute(query, {'sa4_name':target_sa4})
    results = r.fetchall()
except Exception as e:
    print(e)

In [73]:
# for each sa2, use the API to get POI and create a new table to store the data

poi_data = {
    'topoid': [], #primary key
    'sa2_id': [], #foreign key
    'poigroup': []
}

for row in tqdm.tqdm(results):
    sa2_id = row[0]
    geom = load_wkb(row[1], hex=True)
    pois = POIinBbox(row[2:])
    for poi in pois:
        poi_geom = Point(poi['geometry']['x'], poi['geometry']['y'])
        if geom.contains(poi_geom): #check if poi is inside the SA2(we use bounding box to get the poi)
            poi_data['topoid'].append(poi['attributes']['topoid'])
            poi_data['sa2_id'].append(sa2_id)
            poi_data['poigroup'].append(poi['attributes']['poigroup'])
        else:
            continue
    time.sleep(1)

100%|██████████| 26/26 [00:32<00:00,  1.27s/it]


In [74]:
poi_df = pd.DataFrame(poi_data)
poi_df.rename(columns={
    "topoid": "poi_id", # primary key
    "poigroup": "poi_group",
}, inplace= True)
data_summary(poi_df)
check_unique(poi_df, 'poi_id')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2367 entries, 0 to 2366
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   poi_id     2367 non-null   int64
 1   sa2_id     2367 non-null   int64
 2   poi_group  2367 non-null   int64
dtypes: int64(3)
memory usage: 55.6 KB
None
      poi_id     sa2_id  poi_group
0  500174886  121011399          1
1  500174997  121011399          3
2  500186660  121011399          3
3  500186948  121011399          3
4  500186982  121011399          1


True

In [76]:
# create a new tabel to store poi data
create_tables = [
    """DROP TABLE IF EXISTS POI CASCADE;""",
    """CREATE TABLE IF NOT EXISTS POI (
        poi_id INT PRIMARY KEY,
        poi_group INT,
        sa2_id INT REFERENCES SA2(sa2_id)
    );"""
]
for cmd in create_tables:
    try:
        conn.execute(text(cmd))
        print(cmd, "executed")
    except Exception as e:
        conn.rollback()
        print(e)
        break


DROP TABLE IF EXISTS POI CASCADE; executed
CREATE TABLE IF NOT EXISTS POI (
        poi_id INT PRIMARY KEY,
        poi_group INT,
        sa2_id INT REFERENCES SA2(sa2_id)
    ); executed


In [None]:
poi_df.to_sql('poi', conn, if_exists='append', index=False)

367