In [31]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
from geoalchemy2 import Geometry, WKTElement
import matplotlib.pyplot as plt

In [30]:
# Import all necessary datasets for the report + initial filtering
sa2_bounds_raw = gpd.read_file("space_data/SA2.shp")
businesses_raw = pd.read_csv("other_data/Businesses.csv")
stops_raw = pd.read_csv("other_data/Stops.txt")
polls_raw = pd.read_csv("other_data/PollingPlaces2019.csv")
schools_prima_raw = gpd.read_file("space_data/catchments/catchments_primary.shp")
schools_secon_raw = gpd.read_file("space_data/catchments/catchments_secondary.shp")
schools_futur_raw = gpd.read_file("space_data/catchments/catchments_future.shp")
populations_raw = pd.read_csv("other_data/Population.csv")
incomes_raw = pd.read_csv("other_data/Income.csv")

In [49]:
# Initial filtering 
# 1. W're only interested in "Greater Sydney" GCC)
print(sa2_bounds_raw.GCC_NAME21.value_counts().head())
print("... more regions (truncated)\n")
sa2_bounds = sa2_bounds_raw[sa2_bounds_raw.GCC_NAME21 == "Greater Sydney"]
print(sa2_bounds.GCC_NAME21.value_counts())
# 2. We'll only be conducting analysis on SA2 regions. We'll not be 
# examining ins encompassing(broader) regions such as SA3, SA4, and states. 
print("All columns: ", sa2_bounds.columns)
sa2_bounds = sa2_bounds.loc[:, ["SA2_CODE21", "SA2_NAME21", "AREASQKM21", "geometry"]]

Greater Sydney       373
Greater Melbourne    361
Rest of Qld          300
Rest of NSW          269
Greater Brisbane     246
Name: GCC_NAME21, dtype: int64
... more regions (truncated)

Greater Sydney    373
Name: GCC_NAME21, dtype: int64
Index(['SA2_CODE21', 'SA2_NAME21', 'CHG_FLAG21', 'CHG_LBL21', 'SA3_CODE21',
       'SA3_NAME21', 'SA4_CODE21', 'SA4_NAME21', 'GCC_CODE21', 'GCC_NAME21',
       'STE_CODE21', 'STE_NAME21', 'AUS_CODE21', 'AUS_NAME21', 'AREASQKM21',
       'LOCI_URI21', 'geometry'],
      dtype='object')


In [48]:
sa2_bounds.info()
''' 
description: Statistical Region 2 (SA2) digital boundaries (Greater Sydney only)
entries: 373
columns: 17 (all columns are complete with no missing values for all entries)
'''
# sa2_table = 
# '''

# '''
sa2_bounds

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 373 entries, 28 to 641
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   SA2_CODE21  373 non-null    object  
 1   SA2_NAME21  373 non-null    object  
 2   AREASQKM21  373 non-null    float64 
 3   geometry    373 non-null    geometry
dtypes: float64(1), geometry(1), object(2)
memory usage: 14.6+ KB


Unnamed: 0,SA2_CODE21,SA2_NAME21,AREASQKM21,geometry
28,102011028,Avoca Beach - Copacabana,6.4376,"POLYGON ((151.41373 -33.46558, 151.41362 -33.4..."
29,102011029,Box Head - MacMasters Beach,32.0802,"POLYGON ((151.37484 -33.50052, 151.37507 -33.5..."
30,102011030,Calga - Kulnura,767.9512,"MULTIPOLYGON (((151.20449 -33.53280, 151.20448..."
31,102011031,Erina - Green Point,33.7934,"POLYGON ((151.37194 -33.43698, 151.37288 -33.4..."
32,102011032,Gosford - Springfield,16.9123,"POLYGON ((151.32349 -33.42779, 151.32342 -33.4..."
...,...,...,...,...
637,128021537,Royal National Park,139.3336,"POLYGON ((151.07363 -34.05638, 151.07360 -34.0..."
638,128021538,Sutherland - Kirrawee,7.7550,"POLYGON ((151.05006 -34.02158, 151.05008 -34.0..."
639,128021607,Engadine,8.9538,"POLYGON ((150.99568 -34.05361, 150.99570 -34.0..."
640,128021608,Loftus - Yarrawarrah,3.8436,"POLYGON ((151.03955 -34.04175, 151.03954 -34.0..."


In [38]:
# IDA (businesses)
businesses_raw.info()
'''
description: Number of businesses by industry and SA2 region, reported by turnover size ranges
entires: 12217
columns: 11 (all columns are complete with no missing values for all entries)
'''
# businesses_raw.sa2_name.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12217 entries, 0 to 12216
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   industry_code           12217 non-null  object
 1   industry_name           12217 non-null  object
 2   sa2_code                12217 non-null  int64 
 3   sa2_name                12217 non-null  object
 4   0_to_50k_businesses     12217 non-null  int64 
 5   50k_to_200k_businesses  12217 non-null  int64 
 6   200k_to_2m_businesses   12217 non-null  int64 
 7   2m_to_5m_businesses     12217 non-null  int64 
 8   5m_to_10m_businesses    12217 non-null  int64 
 9   10m_or_more_businesses  12217 non-null  int64 
 10  total_businesses        12217 non-null  int64 
dtypes: int64(8), object(3)
memory usage: 1.0+ MB


'\ndescription: Number of businesses by industry and SA2 region, reported by turnover size ranges\nentires: 12217\ncolumns: 11 (all columns are complete with no missing values for all entries)\n'

In [35]:
stops_raw.info()
'''
description: Number of businesses by industry and SA2 region, reported by turnover size ranges
entires: 114718
columns: 9 (several columns missing location_type, parent_station, stop_code, 
        and other fields for many entires. Almost all entries have missing platform code)
'''

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114718 entries, 0 to 114717
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   stop_id              114718 non-null  object 
 1   stop_code            60719 non-null   float64
 2   stop_name            114718 non-null  object 
 3   stop_lat             114718 non-null  float64
 4   stop_lon             114718 non-null  float64
 5   location_type        53991 non-null   float64
 6   parent_station       60727 non-null   object 
 7   wheelchair_boarding  114718 non-null  int64  
 8   platform_code        871 non-null     object 
dtypes: float64(4), int64(1), object(4)
memory usage: 7.9+ MB


'\ndescription: Number of businesses by industry and SA2 region, reported by turnover size ranges\nentires: 114718\ncolumns: 9 (several columns missing location_type, parent_station, stop_code, \n        and other fields for many entires. Almost all entries have missing platform code)\n'

In [36]:
polls_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   FID                          2930 non-null   object 
 1   state                        2930 non-null   object 
 2   division_id                  2930 non-null   int64  
 3   division_name                2930 non-null   object 
 4   polling_place_id             2930 non-null   int64  
 5   polling_place_type_id        2930 non-null   int64  
 6   polling_place_name           2930 non-null   object 
 7   premises_name                2930 non-null   object 
 8   premises_address_1           2737 non-null   object 
 9   premises_address_2           114 non-null    object 
 10  premises_address_3           35 non-null     object 
 11  premises_suburb              2815 non-null   object 
 12  premises_state_abbreviation  2930 non-null   object 
 13  premises_post_code