## Importing Data Packages

In [1]:
import json
import os
from getpass import getpass

import geopandas as gpd
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, MetaData
from sqlalchemy import Table, Column, Float, String
from sqlalchemy.engine import URL
from sqlalchemy.sql import text
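# TODO: confirm whether the `Table, Column, Float, String` import above is still needed;
# nothing in the cells shown here uses those names (`create_engine`, `MetaData` and `text` are used).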

## Setting Up Database Connection

In [2]:
# Database connection parameters.
# No secret is stored in the notebook: each value is read from the standard
# libpq environment variable, falling back to the local development default.
# NOTE: a password that was ever saved in this notebook should be treated as
# compromised and rotated.
# credentials = 'Credentials.json'
username = os.environ.get('PGUSER', 'postgres')
host = os.environ.get('PGHOST', 'localhost')
port = os.environ.get('PGPORT', '5432')
database = os.environ.get('PGDATABASE', 'project')
# The password has no default: when PGPASSWORD is unset (or empty) the user is
# prompted, so the value never appears in the notebook source or its outputs.
password = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: ')

In [3]:
# Create connection URL.
# The URL is assembled from its components instead of by string interpolation:
# URL.create() escapes special characters (such as '@', '/' or ':') in the
# username and password, which would otherwise corrupt the URL.
# render_as_string() keeps `engine_url` a plain str, as it was before.
engine_url = URL.create(
    drivername='postgresql',
    username=username,
    password=password,
    host=host,
    port=int(port),  # `port` is kept as a string in the parameters cell
    database=database,
).render_as_string(hide_password=False)
# Establish database connection
engine = create_engine(engine_url)

In [4]:
# Database schema the tables are written to.
schema = 'public'

# Map each dataset label used in this notebook to its table name in the
# database; the table name is simply the lower-cased label.
table_names = {
    label: label.lower()
    for label in ('Businesses', 'Stops', 'Polls', 'Schools', 'Population', 'Income')
}

In [5]:
# MetaData container created with `schema` as its default schema.
metadata = MetaData(schema=schema)
# NOTE(review): SQLAlchemy 2.x removed bound metadata, so this assignment probably
# has no effect there; the create_all() call at the end of the notebook passes the
# engine explicitly. Confirm, then consider dropping this line.
metadata.bind = engine

## Importing Datasets

In [6]:
import os
# The data files below are opened with relative paths, so show the directory
# they will be resolved against.
# NOTE(review): the printed absolute path can reveal a local user name; clear
# this output before sharing the notebook.
cwd = os.getcwd()
print("Current working directory:", cwd)

Current working directory: /Users/abbykreutz/Desktop/DATA2001/Greater Sydney Analysis Assignment 


### Businesses.csv

In [7]:
# Load the Businesses CSV; the raw frame is kept untouched for reference.
businesses_rawdf = pd.read_csv('Businesses.csv')
# Work on an independent copy from here on.
businesses_df = businesses_rawdf.copy()
# Write the copy to the database, replacing any existing table of that name.
businesses_df.to_sql(
    name=table_names['Businesses'],
    con=engine,
    schema=schema,
    if_exists='replace',
    index=False,
)

217

In [8]:
# Disabled column subset, kept for reference: because it is commented out,
# every column read from the CSV stays in `businesses_df`.
# businesses_df = businesses_df[['industry_code', 'industry_name', 'sa2_code', 'sa2_name', '0_to_50k_businesses', '50k_to_200k_businesses', '200k_to_2m_businesses', '2m_to_5m_businesses', '5m_to_10m_businesses', '10m_or_more_businesses', 'total_businesses']]
# Row count of the working copy, as a quick sanity check.
len(businesses_df)

12217

In [9]:
# Load the 2019 polling places CSV; the raw frame is kept untouched.
polls_rawdf = pd.read_csv('PollingPlaces2019.csv')
# Independent working copy.
polls_df = polls_rawdf.copy()
# Write the copy to the database, replacing any existing table of that name.
polls_df.to_sql(
    name=table_names['Polls'],
    con=engine,
    schema=schema,
    if_exists='replace',
    index=False,
)
# Preview the first ten rows.
polls_df.head(10)

Unnamed: 0,FID,state,division_id,division_name,polling_place_id,polling_place_type_id,polling_place_name,premises_name,premises_address_1,premises_address_2,premises_address_3,premises_suburb,premises_state_abbreviation,premises_post_code,latitude,longitude,the_geom
0,aec_federal_election_polling_places_2019.fid-4...,NSW,104,Barton,33595,2,Special Hospital Team 1,Multiple sites,,,,,NSW,,,,
1,aec_federal_election_polling_places_2019.fid-4...,NSW,105,Bennelong,33596,2,Special Hospital Team 1,Multiple sites,,,,,NSW,,,,
2,aec_federal_election_polling_places_2019.fid-4...,NSW,107,Blaxland,33600,2,Special Hospital Team 1,Multiple sites,,,,,NSW,,,,
3,aec_federal_election_polling_places_2019.fid-4...,NSW,109,Calare,33603,2,Special Hospital Team 1,Multiple sites,,,,ORANGE,NSW,2800.0,,,
4,aec_federal_election_polling_places_2019.fid-4...,NSW,113,Cowper,33716,2,Special Hospital Team 2,Multiple sites,,,,,NSW,,,,
5,aec_federal_election_polling_places_2019.fid-4...,NSW,121,Grayndler,33623,2,Special Hospital Team 1,Multiple sites,,,,,NSW,,,,
6,aec_federal_election_polling_places_2019.fid-4...,NSW,126,Hunter,33627,2,Special Hospital Team 1,Multiple sites,,,,,NSW,,,,
7,aec_federal_election_polling_places_2019.fid-4...,NSW,127,Kingsford Smith,33629,2,Special Hospital Team 1,Multiple sites,,,,,NSW,,,,
8,aec_federal_election_polling_places_2019.fid-4...,NSW,249,Paterson,33628,2,Special Hospital Team 1,Multiple sites,,,,,NSW,,,,
9,aec_federal_election_polling_places_2019.fid-4...,NSW,144,Reid,33638,2,Special Hospital Team 1,Multiple sites,,,,,NSW,,,,


In [10]:
# Load the comma-separated Stops text file; the raw frame is kept untouched.
stops_rawdf = pd.read_csv('Stops.txt', sep=',')
# Independent working copy.
stops_df = stops_rawdf.copy()
# Write the copy to the database, replacing any existing table of that name.
stops_df.to_sql(
    name=table_names['Stops'],
    con=engine,
    schema=schema,
    if_exists='replace',
    index=False,
)
# Row count as a quick sanity check.
len(stops_df)

114718

In [11]:
# Load the Population CSV; the raw frame is kept untouched.
population_rawdf = pd.read_csv('Population.csv')
# Independent working copy.
population_df = population_rawdf.copy()
# Write the copy to the database, replacing any existing table of that name.
population_df.to_sql(
    name=table_names['Population'],
    con=engine,
    schema=schema,
    if_exists='replace',
    index=False,
)
# Preview the first ten rows.
population_df.head(10)

Unnamed: 0,sa2_code,sa2_name,0-4_people,5-9_people,10-14_people,15-19_people,20-24_people,25-29_people,30-34_people,35-39_people,...,45-49_people,50-54_people,55-59_people,60-64_people,65-69_people,70-74_people,75-79_people,80-84_people,85-and-over_people,total_people
0,102011028,Avoca Beach - Copacabana,424,522,623,552,386,222,306,416,...,572,602,570,520,464,369,226,142,70,7530
1,102011029,Box Head - MacMasters Beach,511,666,702,592,461,347,420,535,...,749,749,794,895,863,925,603,331,264,11052
2,102011030,Calga - Kulnura,200,225,258,278,274,227,214,286,...,325,436,422,397,327,264,190,100,75,4748
3,102011031,Erina - Green Point,683,804,880,838,661,502,587,757,...,859,882,901,930,917,1065,976,773,1028,14803
4,102011032,Gosford - Springfield,1164,1044,1084,1072,1499,1864,1750,1520,...,1330,1241,1377,1285,1166,949,664,476,537,21346
5,102011033,Kariong,433,510,512,525,407,411,356,441,...,491,481,435,365,241,157,97,90,65,6518
6,102011034,Kincumber - Picketts Valley,430,434,491,423,342,313,419,394,...,419,415,422,459,441,468,455,383,523,7628
7,102011035,Narara,472,518,535,441,373,420,457,522,...,505,474,461,457,367,322,201,95,80,7191
8,102011036,Niagara Park - Lisarow,461,589,656,592,463,400,494,514,...,581,567,572,498,392,347,212,161,189,8237
9,102011037,Point Clare - Koolewong,356,411,389,355,300,251,358,423,...,493,427,480,443,346,367,300,241,241,6575


In [12]:
# Load the Income CSV; the raw frame is kept untouched.
income_rawdf = pd.read_csv('Income.csv')
# Independent working copy.
income_df = income_rawdf.copy()
# Write the copy to the database, replacing any existing table of that name.
income_df.to_sql(
    name=table_names['Income'],
    con=engine,
    schema=schema,
    if_exists='replace',
    index=False,
)

642

In [13]:
import geopandas as gpd
from sqlalchemy import text

In [14]:
from sqlalchemy.exc import SQLAlchemyError

# Both statements use IF NOT EXISTS, so they are safe to run again when the
# extensions are already installed.
create_postgis_extension = "CREATE EXTENSION IF NOT EXISTS postgis;"
enable_postgis_extension = "CREATE EXTENSION IF NOT EXISTS postgis_topology;"

# Issue the two statements in order on one connection, then commit once.
extension_statements = (create_postgis_extension, enable_postgis_extension)
with engine.connect() as connection:
    for statement in extension_statements:
        connection.execute(text(statement))
    connection.commit()

In [15]:
# Read the SA2 2021 boundary shapefile (GDA2020, going by the file name).
sa2_regions_path = 'SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp'
sa2_regions_gdf = gpd.read_file(sa2_regions_path)

# Make the cell repeatable. On a re-run, foreign keys created by later cells
# (for example on `schools`) still reference the existing sa2_regions table, and
# the plain DROP TABLE issued by if_exists='replace' is rejected while they
# exist. Dropping with CASCADE first removes those referencing constraints (not
# the referencing tables); the later cells add them again. On a first run the
# table does not exist and this statement does nothing.
with engine.connect() as connection:
    connection.execute(text('DROP TABLE IF EXISTS sa2_regions CASCADE;'))
    connection.commit()

# Write the boundaries to PostGIS as table `sa2_regions`.
sa2_regions_gdf.to_postgis(name='sa2_regions', con=engine, if_exists='replace')

In [16]:
# Declare "SA2_CODE21" unique on sa2_regions.
# NOTE(review): a later cell also adds a PRIMARY KEY on this same column, which
# already implies uniqueness, so this constraint looks redundant. Confirm before removing.
unique_constraint_query = """
ALTER TABLE sa2_regions
ADD CONSTRAINT unique_sa2_code21 UNIQUE ("SA2_CODE21");
"""

# Run the statement and commit it.
with engine.connect() as connection:
        connection.execute(text(unique_constraint_query))
        connection.commit()

In [17]:
# Read the three school catchment shapefiles (future, primary, secondary), in that order.
catchment_paths = (
    'catchments/catchments_future.shp',
    'catchments/catchments_primary.shp',
    'catchments/catchments_secondary.shp',
)
future_df, primary_df, secondary_df = map(gpd.read_file, catchment_paths)

In [18]:
# Stack the three catchment layers into one GeoDataFrame with a fresh 0..n-1 index.
catchment_frames = [future_df, primary_df, secondary_df]
stacked_catchments = pd.concat(catchment_frames, ignore_index=True)
concatenated_df = gpd.GeoDataFrame(stacked_catchments)

# Write the result to PostGIS as `schools`, storing the index in a column named
# "Index" (a later cell uses that column as the primary key).
concatenated_df.to_postgis(
    name='schools',
    con=engine,
    if_exists='replace',
    index=True,
    index_label='Index',
)

In [19]:
# Store the SA2 code columns as TEXT so they can be matched against
# sa2_regions."SA2_CODE21" by the key constraints added in later cells.
# (presumably "SA2_CODE21" is a text column because it comes from the shapefile; TODO confirm)
alter_queries = [
    "ALTER TABLE population ALTER COLUMN sa2_code TYPE TEXT;",
    "ALTER TABLE businesses ALTER COLUMN sa2_code TYPE TEXT;",
    "ALTER TABLE income ALTER COLUMN sa2_code21 TYPE TEXT;",
]

# Run the three type changes on one connection and commit them together.
with engine.connect() as connection:
    for query in alter_queries:
        connection.execute(text(query))
    connection.commit()

In [20]:
# Single-column primary key for each table, keyed by the labels used in `table_names`.
# 'Businesses' is absent because it receives a composite key in a separate cell.
primary_key_constraints = {
    'Stops': 'stop_id',
    'Polls': 'FID',
    'Population': 'sa2_code',
    'Income': 'sa2_code21',
    'Schools': 'Index',
}

In [21]:
# Add the single-column primary key declared for each table label.
for label, pk_column in primary_key_constraints.items():
    # Schema-qualified, quoted table name, e.g. "public"."stops".
    qualified_table = f'"{schema}"."{table_names[label]}"'
    statement = f'ALTER TABLE {qualified_table} ADD PRIMARY KEY ("{pk_column}");'
    # Each constraint is applied and committed on its own connection.
    with engine.connect() as connection:
        connection.execute(text(statement))
        connection.commit()

In [22]:
# Businesses gets a composite primary key on (industry_code, sa2_code)
# (presumably one row per industry within each SA2; TODO confirm against the data).
query = "ALTER TABLE businesses ADD CONSTRAINT pk_businesses PRIMARY KEY (industry_code, sa2_code);"

# Run the statement and commit it.
with engine.connect() as connection:
    connection.execute(text(query))
    connection.commit()

In [23]:
# Make "SA2_CODE21" the primary key of sa2_regions; the foreign keys added in
# later cells reference this column.
query = 'ALTER TABLE sa2_regions ADD PRIMARY KEY ("SA2_CODE21");'

# Run the statement and commit it.
with engine.connect() as connection:
    connection.execute(text(query))
    connection.commit()

In [24]:
# Columns that receive a foreign key to sa2_regions, keyed by table label.
# The 'VARCHAR' values are never read by the loop that applies the constraints;
# only the table labels and the column names are used.
foreign_key_constraints = {
    'Population': {'sa2_code': 'VARCHAR'},
    'Income': {'sa2_code21': 'VARCHAR'},
    'Businesses': {'sa2_code': 'VARCHAR'}
}

In [25]:
# Apply Foreign Key Constraints
# Every table listed in `foreign_key_constraints` references the SA2 code column
# of sa2_regions. The former special case for 'Stops' / 'Polling' was removed:
# neither label appears in `foreign_key_constraints` ('Polling' is not even a key
# of `table_names`), so that branch could never run.
referenced_column = 'SA2_CODE21'

for table_name, columns in foreign_key_constraints.items():
    # Only the column names are needed; the data-type values in the mapping are unused.
    for column_name in columns:
        alter_query = (
            f'ALTER TABLE "{schema}"."{table_names[table_name]}" '
            f'ADD CONSTRAINT fk_{table_name}_{column_name} '
            f'FOREIGN KEY ("{column_name}") '
            f'REFERENCES "{schema}".sa2_regions ("{referenced_column}");'
        )

        # Each constraint is applied and committed on its own connection.
        with engine.connect() as connection:
            connection.execute(text(alter_query))
            connection.commit()

In [26]:
# Link each polling place to an SA2 region, in three SQL steps:
#   1. add a nullable TEXT column `sa2_code` to polls;
#   2. fill it with the "SA2_CODE21" of the region whose geometry contains the
#      point built from the row's longitude/latitude (tagged with SRID 7844);
#   3. add a foreign key from polls.sa2_code to sa2_regions."SA2_CODE21".
# NOTE(review): assumes sa2_regions.geometry is stored with SRID 7844; confirm.
# Rows without coordinates presumably keep sa2_code NULL; verify.
query = """
ALTER TABLE polls
ADD COLUMN sa2_code TEXT;

UPDATE polls
SET sa2_code = sa2_regions."SA2_CODE21"
FROM sa2_regions
WHERE ST_Contains(sa2_regions.geometry, ST_SetSRID(ST_MakePoint(polls.longitude, polls.latitude), 7844));

ALTER TABLE polls
ADD CONSTRAINT fk_sa2_code
FOREIGN KEY (sa2_code)
REFERENCES sa2_regions ("SA2_CODE21");
"""

# The three statements are sent in a single execute() call and committed together.
with engine.connect() as connection:
    connection.execute(text(query))
    connection.commit()

In [27]:
# Link each stop to an SA2 region, in three SQL steps:
#   1. add a nullable TEXT column `sa2_code` to stops;
#   2. fill it with the "SA2_CODE21" of the region whose geometry contains the
#      point built from stop_lon/stop_lat (tagged with SRID 7844);
#   3. add a foreign key from stops.sa2_code to sa2_regions."SA2_CODE21".
# NOTE(review): assumes sa2_regions.geometry is stored with SRID 7844; confirm.
query = """
ALTER TABLE stops
ADD COLUMN sa2_code TEXT;

UPDATE stops
SET sa2_code = sa2_regions."SA2_CODE21"
FROM sa2_regions
WHERE ST_Contains(sa2_regions.geometry, ST_SetSRID(ST_MakePoint(stops.stop_lon, stops.stop_lat), 7844));

ALTER TABLE stops
ADD CONSTRAINT fk_sa2_code
FOREIGN KEY (sa2_code)
REFERENCES sa2_regions ("SA2_CODE21");
"""

# The three statements are sent in a single execute() call and committed together.
with engine.connect() as connection:
    connection.execute(text(query))
    connection.commit()

In [2]:
# Link each school catchment to an SA2 region:
#   1. add `geom_transformed` and fill it with the catchment geometry
#      transformed to SRID 7844;
#   2. add a nullable TEXT column `sa2_code` and fill it with the "SA2_CODE21"
#      of a region whose geometry intersects the transformed catchment;
#   3. add a foreign key from schools.sa2_code to sa2_regions."SA2_CODE21".
# NOTE(review): a catchment polygon can intersect several SA2 regions, in which
# case UPDATE ... FROM presumably keeps a single, unspecified match per row.
# Confirm that is acceptable for the analysis.
# NOTE(review): the saved output of this cell is a NameError (execution count 2),
# i.e. it was last run on a fresh kernel before `engine` existed. Re-run the
# notebook from top to bottom.
query = """
ALTER TABLE schools
ADD COLUMN geom_transformed GEOMETRY;

UPDATE schools
SET geom_transformed = ST_Transform(geometry, 7844);

ALTER TABLE schools
ADD COLUMN sa2_code TEXT;

UPDATE schools
SET sa2_code = UPPER(sa2_regions."SA2_CODE21")
FROM sa2_regions
WHERE ST_Intersects(schools.geom_transformed, sa2_regions.geometry);

ALTER TABLE schools
ADD CONSTRAINT fk_sa2_code
FOREIGN KEY (sa2_code)
REFERENCES sa2_regions ("SA2_CODE21");
"""

# All statements are sent in a single execute() call and committed together.
with engine.connect() as connection:
    connection.execute(text(query))
    connection.commit()

NameError: name 'engine' is not defined

In [29]:
# Create any tables registered on `metadata`, using `engine`.
# NOTE(review): no Table objects are attached to `metadata` in the cells shown,
# so this is presumably a no-op. Confirm whether it is still needed.
metadata.create_all(bind=engine)

In [30]:
# Marker showing the notebook ran through to the last cell.
print("done")

done
