# DATA2001 Assignment - Assessing SA2 regions
### Authors: ykim4904, unikey2, unikey3

In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
from geoalchemy2 import Geometry, WKTElement
import matplotlib.pyplot as plt

from sqlalchemy import create_engine
import psycopg2
import psycopg2.extras
import json

## Task 1 - Clean and Import datasets into SQL server

In [None]:
# Load every raw dataset the report needs; cleaning happens in later cells.

# --- Spatial layers (shapefiles, read with geopandas) ---
sa2_bounds_raw = gpd.read_file("space_data/SA2.shp")
schools_prima_raw = gpd.read_file("space_data/catchments/catchments_primary.shp")
schools_secon_raw = gpd.read_file("space_data/catchments/catchments_secondary.shp")
schools_futur_raw = gpd.read_file("space_data/catchments/catchments_future.shp")

# --- Tabular data (delimited text, read with pandas) ---
businesses_raw = pd.read_csv("other_data/Businesses.csv")
stops_raw = pd.read_csv("other_data/Stops.txt")
polls_raw = pd.read_csv("other_data/PollingPlaces2019.csv")
populations_raw = pd.read_csv("other_data/Population.csv")
incomes_raw = pd.read_csv("other_data/Income.csv")

In [None]:
# database connection, querying functions

credentials = "Credentials.json"

def pgconnect(credential_filepath, db_schema="public"):
    """Open a SQLAlchemy engine and connection from a JSON credentials file.

    The JSON file must provide the keys ``host``, ``user``, ``password`` and
    ``database``.

    Args:
        credential_filepath: Path to the JSON credentials file.
        db_schema: Accepted for backward compatibility; not used by this
            function.

    Returns:
        A ``(engine, connection)`` tuple. If the engine cannot be created or
        the connection fails, the error is printed and ``(None, None)`` is
        returned instead.

    Raises:
        OSError: If the credentials file cannot be opened.
        KeyError: If one of the required keys is missing from the file.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote_plus

    # Hold the file open only while reading it.
    with open(credential_filepath) as f:
        db_conn_dict = json.load(f)

    host       = db_conn_dict['host']
    db_user    = db_conn_dict['user']
    db_pw      = db_conn_dict['password']
    default_db = db_conn_dict['database']

    try:
        # URL-escape the user and password so characters such as '@', '/'
        # or ':' cannot be mistaken for URL delimiters.
        url = ('postgresql+psycopg2://' + quote_plus(db_user) + ':'
               + quote_plus(db_pw) + '@' + host + '/' + default_db)
        db = create_engine(url, echo=False)
        conn = db.connect()
        print('Connected successfully.')
    except Exception as e:
        # Best-effort: report the problem and let the caller check for None.
        print("Unable to connect to the database.")
        print(e)
        db, conn = None, None
    return db, conn

def query(conn, sqlcmd, args=None, df=True):
    """Run a SQL statement and return its result.

    Args:
        conn: Open database connection (as returned by ``pgconnect``).
        sqlcmd: SQL text to execute.
        args: Optional bind parameters for the statement.
        df: When True, return a pandas DataFrame; otherwise return the
            fetched rows.

    Returns:
        With ``df=True``: a DataFrame (empty if the query failed).
        With ``df=False``: the single row when exactly one row came back,
        otherwise the list of rows; ``None`` if the query failed.
        Errors are printed, not raised.
    """
    result = pd.DataFrame() if df else None
    try:
        if df:
            result = pd.read_sql_query(sqlcmd, conn, params=args)
        else:
            # Only forward args when the caller supplied them, so that a
            # parameterless statement is not executed with a spurious
            # ``None`` bind value.
            if args is None:
                cursor = conn.execute(sqlcmd)
            else:
                cursor = conn.execute(sqlcmd, args)
            rows = cursor.fetchall()
            # Unwrap single-row results for convenience.
            result = rows[0] if len(rows) == 1 else rows
    except Exception as e:
        print("Error encountered: ", e, sep='\n')
    return result

In [None]:
# Helper: convert a geometry from a (Geo)DataFrame into a PostGIS-suitable WKT element
def create_wkt_element(geom, srid):
    """Return *geom* as a ``WKTElement`` tagged with *srid*.

    A plain ``Polygon`` is first wrapped in a one-member ``MultiPolygon``;
    any other geometry type is passed through unchanged.
    """
    shape = MultiPolygon([geom]) if geom.geom_type == 'Polygon' else geom
    return WKTElement(shape.wkt, srid)


# Spatial reference ID used for every geometry written to the database.
srid = 4326

In [None]:
# Open the engine and connection used by the cells below.
# pgconnect returns (None, None) if the connection attempt failed.
db, conn = pgconnect(credentials)

In [None]:
# List the schemas that already exist in the database.
from sqlalchemy import inspect
print(inspect(db).get_schema_names())
# Create the project schema; IF NOT EXISTS lets this cell be re-run safely.
conn.execute("CREATE SCHEMA IF NOT EXISTS DATA2001_A_IEY;")
# Resolve unqualified table names in the project schema first, then in public.
conn.execute("SET search_path TO DATA2001_A_IEY, public;")
# Print the PostGIS version to confirm the PostGIS functions are reachable.
print(query(conn, "SELECT PostGIS_version();"))


### Task 1.1.1 - Cleaning dataset : SA2 Regions' Boundaries

In [None]:
# Initial filtering (SA2 regions)
# 1. We're only interested in the "Greater Sydney" GCC.
print(sa2_bounds_raw.GCC_NAME21.value_counts().head())
print("... more regions (truncated)\n")

in_greater_sydney = sa2_bounds_raw.GCC_NAME21 == "Greater Sydney"
sa2_bounds = sa2_bounds_raw[in_greater_sydney]
print(sa2_bounds.GCC_NAME21.value_counts(), "\n")

# 2. The analysis is carried out on SA2 regions only; the broader
#    encompassing regions (SA3, SA4, states) are not examined, so only
#    the SA2 columns and the geometry are kept.
# 3. The kept columns are renamed to short names in the same step.
sa2_column_names = {
    "SA2_CODE21": "code",
    "SA2_NAME21": "name",
    "AREASQKM21": "area_sq_km",
}
sa2_bounds = sa2_bounds.loc[:, [*sa2_column_names, "geometry"]]
sa2_bounds = sa2_bounds.rename(columns=sa2_column_names)

# 4. Cast each column to an appropriate type.
for column_name, column_type in (("code", int), ("name", str)):
    sa2_bounds[column_name] = sa2_bounds[column_name].astype(column_type)

# Replace the shapely geometry column with its WKT-element equivalent.
sa2_bounds["geom"] = sa2_bounds["geometry"].apply(
    lambda shape: create_wkt_element(geom=shape, srid=srid))
sa2_bounds = sa2_bounds.drop(columns=["geometry"])

### Task 1.1.2 - Cleaning datasets : Businesses, Train-Stops, and Polling-Places(2019)

In [None]:
# Keep only the industry name, SA2 code and business count columns.
# (Selecting straight from the raw frame replaces the earlier alias plus
# redundant double assignment.)
businesses = businesses_raw.loc[:, ["industry_name", "sa2_code", "total_businesses"]]

In [None]:
# Keep only the stops inside a rough Greater Sydney bounding box:
# longitude in (150.4, 151.4) and latitude in (-34.2, -33.5).
# The upper latitude bound previously read `< 33.5` (missing minus sign),
# which is true for every Southern Hemisphere stop and filtered nothing.
# NOTE(review): this box is only a coarse pre-filter -- exact region
# membership is decided later by ST_Contains against the SA2 polygons.
# Confirm the bounds cover every Greater Sydney SA2 (e.g. the northern and
# western fringe) before relying on the per-region counts.
in_sydney_box = (
    (stops_raw['stop_lon'] > 150.4) & (stops_raw['stop_lon'] < 151.4)
    & (stops_raw['stop_lat'] > -34.2) & (stops_raw['stop_lat'] < -33.5)
)
# .copy() gives an independent frame, so adding a column below neither
# touches stops_raw nor triggers pandas' chained-assignment warning.
stops = stops_raw.loc[in_sydney_box, ["stop_id", "stop_name", "stop_lon", "stop_lat"]].copy()
# Build point geometries, then convert them to WKT elements for PostGIS.
stops['stop_loc'] = gpd.points_from_xy(stops.stop_lon, stops.stop_lat)
stops["stop_loc"] = stops['stop_loc'].apply(lambda x: WKTElement(x.wkt, srid=srid))
# The raw coordinates are now redundant with stop_loc.
stops = stops.drop(columns=["stop_lon", "stop_lat"])


In [None]:
# Shorten the id/name column names, keep only the columns needed, and
# discard rows with any missing value among them.
polls = (
    polls_raw
    .rename(columns={"polling_place_id": "poll_id", "polling_place_name": "poll_name"})
    .loc[:, ["poll_id", "poll_name", "latitude", "longitude"]]
    .dropna()
)
# Build point geometries from the coordinates, then convert each to a WKT element.
polls["poll_loc"] = gpd.points_from_xy(polls.longitude, polls.latitude)
polls["poll_loc"] = polls["poll_loc"].apply(lambda point: WKTElement(point.wkt, srid=srid))
# The raw coordinates are now redundant with poll_loc.
polls = polls.drop(columns=["longitude", "latitude"])

### Task 1.1.3 - Cleaning Datasets: Population and Income

In [None]:
# Work on a copy so that adding a column does not modify populations_raw.
population = populations_raw.copy()
# Sum the columns at positions 2..5 into a single "young_people" count.
# NOTE(review): presumably these are the youngest age brackets -- this relies
# on the column order of Population.csv; confirm against the file.
population["young_people"] = population.iloc[:, 2:5+1].sum(axis=1)
population = population.loc[:, ["sa2_code", "young_people", "total_people"]]

income = incomes_raw.loc[:, ["sa2_code", "median_income"]]
# Drop rows whose median income starts with 'np' (a non-numeric placeholder).
# na=True also marks missing values for removal, matching the previous
# `== False` comparison, which dropped them as well.
is_placeholder = income["median_income"].str.startswith('np', na=True)
# .copy() so the cast below is applied to an independent frame.
income = income[~is_placeholder].copy()
income['median_income'] = income['median_income'].astype(int)

### Task 1.1.4 - Cleaning dataset: School Catchments

In [None]:
### TODO

### Task 1.1.5 - Cleaning Dataset (Task 3) : 

In [None]:
### TODO

### Task 1.1.6 - Cleaning Dataset (Task 3) :

In [None]:
### TODO

### Task 1.1.7 - Cleaning Dataset (Task 3) :

In [None]:
### TODO

### Task 1.1.8 - Verify Cleaning work

In [None]:
# print(sa2_bounds.info())
# sa2_bounds.head()
# print(businesses.info())
# businesses.head()
# print(stops.info())
# stops.head()
# print(polls.info())
# polls.head()
# print(population.info())
# population.head()
# print(income.info())
# income.head()

### Task 1.2 - Importing into PSQL server

In [None]:
# Create the tables from the schema file. The context manager closes the
# file as soon as it has been read (it was previously left open).
with open("schema_init.sql", "r") as schema_f:
    schema_definition = schema_f.read()
conn.execute(schema_definition)

# Append each cleaned frame to its table. Geometry columns need an explicit
# PostGIS type so the WKT elements are stored as geometries.
sa2_bounds.to_sql('sa2_bounds', conn, if_exists='append', index=False, dtype={'geom': Geometry('MULTIPOLYGON', srid)})
businesses.to_sql('businesses', conn, if_exists='append', index=False)
stops.to_sql('stops', conn, if_exists='append', index=False, dtype={'stop_loc': Geometry('POINT', srid)})
polls.to_sql('polls', conn, if_exists='append', index=False, dtype={'poll_loc': Geometry('POINT', srid)})
population.to_sql('population', conn, if_exists='append', index=False)
income.to_sql('income', conn, if_exists='append', index=False)

## Task 2 - Scores

In [None]:
def set_z_score(df, column):
    """Add a ``z_score`` column to *df*, computed from *column*.

    Each value is centred on the column mean and divided by the column's
    standard deviation (pandas' default ``Series.std``). The frame is
    modified in place and nothing is returned.
    """
    values = df[column]
    centre = values.mean()
    spread = values.std()
    df["z_score"] = (values - centre) / spread

### Task 2.1 Score metric 1 - Retail businesses per 1000 people

In [None]:
### TODO

### Task 2.2 Score metric 2 - Health businesses per 1000 people

In [None]:
### TODO

### Task 2.3 Score metric 3 - Public Transport Availability

In [None]:
# Number of stops inside each SA2 region.
# The LEFT JOIN keeps regions that contain no stop at all. Counting a column
# from the stops side (NULL on unmatched rows) makes those regions report 0;
# COUNT(code) counted the region's own row and reported 1 for them.
sql = '''
SELECT code AS "sa2_code", COUNT(stop_loc) AS "stops_count"
FROM sa2_bounds LEFT JOIN stops 
    ON (ST_Contains(geom, stop_loc))
GROUP BY code
'''
stops_num = query(conn, sql)


In [None]:
# Add the z-score column to the per-region stop counts (in place).
set_z_score(stops_num, 'stops_count')
# Reshape into the score-table layout: the raw count becomes "raw_score"
# and every row is labelled with the metric description.
stops_num_scr = (
    stops_num
    .rename(columns={'stops_count': 'raw_score'})
    .assign(score_desc="# of train/bus stops")
)
# Append this metric's rows to the shared score table.
stops_num_scr.to_sql('score_table', conn, if_exists='append', index=False)

### Task 2.4 Score metric 4 - Polling Places

In [31]:
# Number of polling places inside each SA2 region.
# Uses a LEFT JOIN, like the stops query, so regions without any polling
# place are kept with a count of 0 instead of being left out of the result.
# COUNT(poll_loc) ignores the NULL produced for unmatched regions.
# Output column names are unchanged ("code", "poll_count").
sql = '''
SELECT code, COUNT(poll_loc) AS "poll_count"
FROM sa2_bounds LEFT JOIN polls
    ON (ST_Contains(geom, poll_loc))
GROUP BY(code)
'''
polls_count = query(conn, sql)

In [None]:
# Ad-hoc inspection: show every (SA2 region, polling place) pair where the
# region's geometry contains the polling place, with all columns of both
# tables. The result is only displayed, not stored.
sql = '''
SELECT * FROM sa2_bounds, polls
WHERE ST_Contains(geom, poll_loc);
'''
query(conn, sql)