In [2]:
# import some pkg
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.wkt import loads
from shapely.geometry import Point, Polygon, MultiPolygon
from geoalchemy2 import Geometry, WKTElement
from sqlalchemy import create_engine, text
import psycopg2
import psycopg2.extras
import json
import folium
from branca.colormap import linear
import requests
import time
import tqdm
from shapely.wkb import loads as load_wkb


In [3]:
# Spatial reference identifiers (EPSG codes) used throughout the notebook.
GDA2020_SRID = 7844  # GDA2020; the CRS the SA2 shapefile reports (EPSG:7844)
GAD2020_SRID = GDA2020_SRID  # original (misspelled) name, kept as an alias for existing references
GDA94_SRID = 4283  # GDA94; the SRID used for the geometry columns stored in PostGIS
WGS84_SRID = 4326  # WGS 84

In [4]:
# utils

def data_summary(df):
    """Print a quick overview of a DataFrame: its ``info()`` and first rows.

    Args:
        df: pandas (or geopandas) DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    # DataFrame.info() prints by itself and returns None, so wrapping it in
    # print() only appends a stray "None" line to the output.
    df.info()
    print(df.head())

def shapely_to_WKT(geom, srid):
    """Convert a shapely geometry into a GeoAlchemy2 ``WKTElement``.

    A single ``Polygon`` is wrapped in a ``MultiPolygon`` first, so polygons
    and multipolygons both end up as MULTIPOLYGON values.

    Args:
        geom: shapely geometry, or ``None`` for a missing geometry.
        srid: spatial reference id attached to the returned element.

    Returns:
        A ``WKTElement`` built from the geometry's WKT and ``srid``, or
        ``None`` when ``geom`` is ``None``.
    """
    if geom is None:
        # The raw SA2 layer contains rows without a geometry; let them pass
        # through as missing values instead of failing on ``geom.geom_type``.
        return None
    if geom.geom_type == 'Polygon':
        geom = MultiPolygon([geom])
    return WKTElement(geom.wkt, srid)

def check_unique(df, key):
    """Tell whether column ``key`` of ``df`` can serve as a unique key.

    Compares the number of distinct values reported by ``nunique()`` with the
    number of rows in the column.

    Args:
        df: DataFrame holding the column.
        key: label of the column to check.

    Returns:
        True when the distinct-value count equals the row count.
    """
    column = df[key]
    distinct_count = column.nunique()
    row_count = len(column)
    return distinct_count == row_count

# Task 1 Import all datasets, clean if needed

In [5]:
# Raw SA2 boundaries: read the shapefile as-is and print an overview of it.
sa2_shapefile = "SA2_2021_AUST_GDA2020.shp"
sa2_data = gpd.read_file(sa2_shapefile)
data_summary(df=sa2_data)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 2473 entries, 0 to 2472
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   SA2_CODE21  2473 non-null   object  
 1   SA2_NAME21  2473 non-null   object  
 2   CHG_FLAG21  2473 non-null   object  
 3   CHG_LBL21   2473 non-null   object  
 4   SA3_CODE21  2473 non-null   object  
 5   SA3_NAME21  2473 non-null   object  
 6   SA4_CODE21  2473 non-null   object  
 7   SA4_NAME21  2473 non-null   object  
 8   GCC_CODE21  2473 non-null   object  
 9   GCC_NAME21  2473 non-null   object  
 10  STE_CODE21  2473 non-null   object  
 11  STE_NAME21  2473 non-null   object  
 12  AUS_CODE21  2473 non-null   object  
 13  AUS_NAME21  2473 non-null   object  
 14  AREASQKM21  2454 non-null   float64 
 15  LOCI_URI21  2473 non-null   object  
 16  geometry    2454 non-null   geometry
dtypes: float64(1), geometry(1), object(15)
memory usage: 328.6+ KB
None
  SA2_CODE21  

In [6]:
# Show the coordinate reference system attached to the raw SA2 layer.
print(sa2_data.crs)

EPSG:7844


In [7]:
# SA2 data: keep the Greater Sydney regions and shape them for the SA2 table.
sa2_data = gpd.read_file("SA2_2021_AUST_GDA2020.shp")

# Columns that are kept; everything else in the shapefile is unnecessary here.
sa2_columns = ['SA2_CODE21', 'SA2_NAME21', 'SA4_CODE21', 'SA4_NAME21', 'AREASQKM21', 'geometry']

# Built as one chain so the result is a fresh frame rather than a view of
# sa2_data that later gets modified in place.
sa2_greater_sydney = (
    sa2_data.loc[sa2_data["GCC_NAME21"] == "Greater Sydney", sa2_columns]
    # Drop incomplete rows before any conversion: a row without a geometry
    # cannot be reprojected or turned into WKT.
    .dropna()
    # The shapefile's CRS is GDA2020 (EPSG:7844) while the geometries are
    # stored with SRID 4283 (GDA94). Reproject so the coordinates really are
    # in the SRID they are labelled with below.
    .to_crs(epsg=GDA94_SRID)
    .rename(columns={
        "SA2_CODE21": "sa2_id",  # primary key
        "SA2_NAME21": "sa2_name",
        "SA4_CODE21": "sa4_id",
        "SA4_NAME21": "sa4_name",
        "AREASQKM21": "sa2_area_sqkm",
        'geometry': "geom"
    })
)
sa2_greater_sydney["sa2_id"] = sa2_greater_sydney["sa2_id"].astype('int64')
# Convert the shapely geometries to WKT elements tagged with the GDA94 SRID.
sa2_greater_sydney['geom'] = sa2_greater_sydney['geom'].apply(lambda x: shapely_to_WKT(geom=x, srid=GDA94_SRID))
data_summary(sa2_greater_sydney)

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 373 entries, 28 to 641
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   sa2_id         373 non-null    int64  
 1   sa2_name       373 non-null    object 
 2   sa4_id         373 non-null    object 
 3   sa4_name       373 non-null    object 
 4   sa2_area_sqkm  373 non-null    float64
 5   geom           373 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 20.4+ KB
None
       sa2_id                     sa2_name sa4_id       sa4_name  \
28  102011028     Avoca Beach - Copacabana    102  Central Coast   
29  102011029  Box Head - MacMasters Beach    102  Central Coast   
30  102011030              Calga - Kulnura    102  Central Coast   
31  102011031          Erina - Green Point    102  Central Coast   
32  102011032        Gosford - Springfield    102  Central Coast   

    sa2_area_sqkm                                         

In [10]:
# connecting postgresql
def pgconnect(db_schema="public"):
    """Open a SQLAlchemy engine and connection to the PostgreSQL database.

    Connection settings are read from the standard libpq environment
    variables (``PGHOST``, ``PGUSER``, ``PGPASSWORD``, ``PGDATABASE``) and
    fall back to the notebook's original local defaults when a variable is
    not set.

    Args:
        db_schema: schema name. Accepted for compatibility with existing
            callers; the body does not currently use it.

    Returns:
        ``(engine, connection)`` on success, ``(None, None)`` when the
        connection cannot be established (the error is printed).
    """
    # Local imports keep this cell self-contained.
    import os
    from urllib.parse import quote_plus

    host = os.environ.get('PGHOST', 'localhost')
    db_user = os.environ.get('PGUSER', 'postgres')
    # NOTE(review): the fallback password is still in source so the notebook
    # keeps running unchanged; set PGPASSWORD and remove the literal before
    # sharing this notebook.
    db_pw = os.environ.get('PGPASSWORD', "qwertyuiop")
    default_db = os.environ.get('PGDATABASE', "DATA2001")
    try:
        # Escape user and password so characters such as '@' or '/' cannot
        # corrupt the connection URL.
        db = create_engine(
            'postgresql+psycopg2://' + quote_plus(db_user) + ':' + quote_plus(db_pw)
            + '@' + host + '/' + default_db,
            echo=False,
        )
        conn = db.connect()
        print('Connected successfully.')
    except Exception as e:
        print("Unable to connect to the database.")
        print(e)
        db, conn = None, None
    return db,conn
db, conn = pgconnect()

Connected successfully.


In [12]:
# (Re)create the SA2 table that receives the cleaned Greater Sydney SA2 rows.
create_tables = [
    """DROP TABLE IF EXISTS SA2 CASCADE;""",
    """CREATE TABLE IF NOT EXISTS SA2 (
        sa2_id INT PRIMARY KEY,
        sa2_name VARCHAR(255),
        sa4_id INT,
        sa4_name VARCHAR(255),
        sa2_area_sqkm NUMERIC(10, 4),
        geom GEOMETRY(MULTIPOLYGON,4283)
    );"""
]
# Run both statements in ONE transaction so a failed CREATE also undoes the
# DROP. If the shared connection already has a transaction open (for example
# one started implicitly by an earlier statement), reuse it rather than
# calling begin() again, which would raise.
txn = conn.get_transaction() or conn.begin()
try:
    for cmd in create_tables:
        conn.execute(text(cmd))
        print(cmd, "executed")
    txn.commit()
except Exception as e:
    # Roll back through the transaction object that owns the work.
    txn.rollback()
    print(e)

DROP TABLE IF EXISTS SA2 CASCADE; executed
CREATE TABLE IF NOT EXISTS SA2 (
        sa2_id INT PRIMARY KEY,
        sa2_name VARCHAR(255),
        sa4_id INT,
        sa4_name VARCHAR(255),
        sa2_area_sqkm NUMERIC(10, 4),
        geom GEOMETRY(MULTIPOLYGON,4283)
    ); executed


In [13]:
# Append the cleaned SA2 rows to the existing "sa2" table; the geom column is
# written with an explicit MULTIPOLYGON geometry type in the GDA94 SRID.
sa2_greater_sydney.to_sql(
    name='sa2',
    con=conn,
    if_exists='append',
    index=False,
    dtype={'geom': Geometry('MULTIPOLYGON', GDA94_SRID)},
)

373

# Task 2 Utilise the NSW Points of Interest API to extract information relevant to each SA2 region and form our additional dataset

In [14]:
# a function that returns all points of interest from the API within a specified bounding box of coordinates
def POIinBbox(bbox):
    """Query the NSW Points of Interest service for features inside a bounding box.

    Args:
        bbox: iterable of four numbers ``(x_min, y_min, x_max, y_max)``.

    Returns:
        The list stored under ``'features'`` in the service's JSON response.

    Raises:
        requests.HTTPError: if the HTTP response has an error status.
        RuntimeError: if the response body carries an ``'error'`` object.
        KeyError: if the response has neither ``'error'`` nor ``'features'``.
    """
    baseURL = 'https://maps.six.nsw.gov.au/arcgis/rest/services/public/NSW_POI/MapServer/0/query'
    x_min, y_min, x_max, y_max = (float(value) for value in bbox)
    params = {
        # Serialise the envelope with json.dumps so it is always well-formed
        # JSON; the hand-built string lacked braces and had a misplaced quote
        # around "ymax".
        'geometry': json.dumps({'xmin': x_min, 'ymin': y_min, 'xmax': x_max, 'ymax': y_max}),
        'geometryType': 'esriGeometryEnvelope',
        'outFields': '*',
        'returnGeometry': 'true',
        'f': 'json'
    }
    # A timeout stops one stalled request from hanging the whole loop.
    response = requests.get(baseURL, params=params, timeout=30)
    response.raise_for_status()
    payload = json.loads(response.text)
    if 'error' in payload:
        # ArcGIS REST services can report a failed query inside the JSON body;
        # surface it instead of failing later with a bare KeyError.
        raise RuntimeError(f"NSW POI query failed: {payload['error']}")
    return payload['features']

In [15]:
# get all SA2 in the selected SA4, together with each region's bounding box
target_sa4 = "Sydney - North Sydney and Hornsby"
query = text("""
    SELECT sa2_id, geom, ST_XMin(geom) as min_x, ST_YMin(geom) as min_y, ST_XMax(geom) as max_x, ST_YMax(geom) as max_y
    FROM SA2
    WHERE sa4_name = :sa4_name
""")
# Reset first: if the query fails, later cells must see an empty result rather
# than an undefined name or rows left over from an earlier run of this cell.
results = []
try:
    r = conn.execute(query, {'sa4_name':target_sa4})
    results = r.fetchall()
except Exception as e:
    print(e)

In [16]:
# For every selected SA2, fetch the POIs in its bounding box and collect the
# ones that fall inside the region; the lists below become the POI table.

poi_data = {
    'topoid': [],   # primary key
    'sa2_id': [],   # foreign key
    'poitype': [],
}

for row in tqdm.tqdm(results):
    # row layout: sa2_id, geometry as hex WKB, then the four bounding-box values
    sa2_id, geom = row[0], load_wkb(row[1], hex=True)
    pois = POIinBbox(row[2:])
    for poi in pois:
        poi_geom = Point(poi['geometry']['x'], poi['geometry']['y'])
        # The API was queried by bounding box, so skip points outside the SA2 itself.
        if not geom.contains(poi_geom):
            continue
        attributes = poi['attributes']
        poi_data['topoid'].append(attributes['topoid'])
        poi_data['sa2_id'].append(sa2_id)
        poi_data['poitype'].append(attributes['poitype'])
    # Pause between regions so requests to the API are spaced out.
    time.sleep(1)

100%|██████████| 26/26 [00:32<00:00,  1.27s/it]


In [17]:
# Turn the collected lists into a frame whose column names match the POI table.
poi_column_names = {
    "topoid": "poi_id",  # primary key
    "poitype": "poi_type",
}
poi_df = pd.DataFrame(poi_data).rename(columns=poi_column_names)
data_summary(poi_df)
# Last expression of the cell: shows whether poi_id can serve as a primary key.
check_unique(poi_df, 'poi_id')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2367 entries, 0 to 2366
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   poi_id    2367 non-null   int64 
 1   sa2_id    2367 non-null   int64 
 2   poi_type  2367 non-null   object
dtypes: int64(2), object(1)
memory usage: 55.6+ KB
None
      poi_id     sa2_id          poi_type
0  500174886  121011399  Place Of Worship
1  500174997  121011399              Park
2  500186660  121011399              Park
3  500186948  121011399      Sports Field
4  500186982  121011399  Place Of Worship


True

In [21]:
# create a new table to store poi data
create_tables = [
    """DROP TABLE IF EXISTS POI CASCADE;""",
    """CREATE TABLE IF NOT EXISTS POI (
        poi_id INT PRIMARY KEY,
        poi_type VARCHAR(255),
        sa2_id INT
    );"""
]
# Run the DDL inside an explicit transaction and commit it, so the new table
# does not depend on driver autocommit and a failed CREATE also undoes the
# DROP. If the shared connection already has a transaction open (for example
# one started implicitly by an earlier SELECT), reuse it rather than calling
# begin() again, which would raise.
txn = conn.get_transaction() or conn.begin()
try:
    for cmd in create_tables:
        conn.execute(text(cmd))
        print(cmd, "executed")
    txn.commit()
except Exception as e:
    # Roll back through the transaction object that owns the work.
    txn.rollback()
    print(e)

   

DROP TABLE IF EXISTS POI CASCADE; executed
CREATE TABLE IF NOT EXISTS POI (
        poi_id INT PRIMARY KEY,
        poi_type VARCHAR(255),
        sa2_id INT
    ); executed


In [22]:
# Append the collected POI rows to the existing "poi" table.
poi_df.to_sql(
    name='poi',
    con=conn,
    if_exists='append',
    index=False,
)

367