In [99]:
import pandas as pd
import numpy as np

In [100]:
df_path= './data/RegionalProjections_noninteractive.xls'

In [101]:
df_2018 = pd.read_excel(df_path, sheet_name=5, skiprows=1)
df_2023 = pd.read_excel(df_path, sheet_name=4, skiprows=1)

In [102]:
#clean
df_2018['Region'] = df_2018['Region'].str.strip()
df_2023['Region'] = df_2018['Region'].str.strip()
df_2018.drop(df_2018.tail(5).index,inplace=True)

# Views we want
- Given SA4 or State and ANZSIC, give the 5 year predicted growth in # and %
- Given SA4 or State, give the top growing ANZSIC in # and % 
- Given SA4 get State

In [104]:
regions_states   = df_2018[df_2018['Level']==1.0]['Region'].unique()
regions_gccsa    = df_2018[df_2018['Level']==2.0]['Region'].unique()
regions_sa4      = df_2018[df_2018['Level']==3.0]['Region'].unique()
anzsic_divisions = df_2018.columns[3:]

In [105]:
def region_and_anzsic_to_growth(region, anzsic):
    # Given region and anzsic get growth
    num_2018 = int(df_2018[df_2018['Region']==region][anzsic].values[0]*1000)
    num_2023 = int(df_2023[df_2023['Region']==region][anzsic].values[0]*1000)
    
    growth   = num_2023-num_2018
    growth_pc= 100*growth/num_2018
    
    return np.array([num_2018, num_2023, growth, growth_pc])

In [106]:
region_and_anzsic_to_growth(regions_states[2], anzsic_divisions[3])

array([3.57210000e+04, 3.77830000e+04, 2.06200000e+03, 5.77251477e+00])

In [107]:
def region_to_growth(region):
    # given region get growth for each anzsic
    num_2018 = (df_2018[df_2018['Region']==region][anzsic_divisions].values[0]*1000).astype(np.int)
    num_2023 = (df_2023[df_2023['Region']==region][anzsic_divisions].values[0]*1000).astype(np.int)
    
    
    
    data = zip(anzsic_divisions, num_2018, num_2023, num_2023-num_2018, 100*(num_2023-num_2018)/num_2018)
    
    
    return pd.DataFrame(data = data, columns = ['ANZSIC Division','Jobs (2018)', 'Jobs (2019)', 'Job Growth (num)', 'Job Growth (%)']).set_index('ANZSIC Division')

In [108]:
region_to_growth('New South Wales')

Unnamed: 0_level_0,Jobs (2018),Jobs (2019),Job Growth (num),Job Growth (%)
ANZSIC Division,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Agriculture, Forestry and Fishing",70108,64514,-5594,-7.979118
Mining,38696,38575,-121,-0.312694
Manufacturing,297318,302785,5467,1.838772
"Electricity, Gas, Water and Waste Services",38814,39868,1054,2.715515
Construction,376390,417426,41036,10.902521
Wholesale Trade,130984,128966,-2018,-1.540646
Retail Trade,425369,449565,24196,5.688238
Accommodation and Food Services,277168,297783,20615,7.437727
"Transport, Postal and Warehousing",202554,210963,8409,4.151486
Information Media and Telecommunications,96622,104639,8017,8.297282


In [120]:
data = []
current_state = None
current_gccsa = None
current_sa4   = None

for index, row in df_2018.iterrows():
    if row["Level"]==1.0:
        # New state
        current_state = row["Region"]
        current_gccsa = None
        continue
    if row["Level"]==2.0:
        # New GCCSA
        current_gccsa = row["Region"]
        continue
    if row["Level"]==3.0:
        current_sa4 = row["Region"]
        data.append([current_state, current_gccsa, current_sa4])

In [128]:
df_sa4_to_state = pd.DataFrame(data=data, columns=['State', 'GCCSA', 'SA4']).set_index('SA4')

In [129]:
df_sa4_to_state.head()

Unnamed: 0_level_0,State,GCCSA
SA4,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Coast,New South Wales,Greater Sydney
Sydney - Baulkham Hills and Hawkesbury,New South Wales,Greater Sydney
Sydney - Blacktown,New South Wales,Greater Sydney
Sydney - City and Inner South,New South Wales,Greater Sydney
Sydney - Eastern Suburbs,New South Wales,Greater Sydney


# Map from postcode to SA4
We use the correspondences from here:
https://data.gov.au/dataset/ds-dga-4b208cc1-f5de-405d-af96-0777645dfc87/distribution/dist-dga-24b6c091-aca6-4257-8a71-dfb0ce47bb3e/details?q=correspondence

In particular, we're interested in the file `CG_POSTCODE_2012_SA4_2011.xls`

In [130]:
df_path_postcodes = './data/CG_POSTCODE_2012_SA4_2011.xls'

In [144]:
df_postcodes = pd.read_excel(df_path_postcodes, sheet_name=3, skiprows=5)
df_postcodes.drop(df_postcodes.head(1).index,inplace=True)
df_postcodes.drop(df_postcodes.tail(4).index,inplace=True)
df_postcodes = df_postcodes[['POSTCODE_2012.1','SA4_CODE_2011','SA4_NAME_2011']]
df_postcodes.columns = ['Postcode', 'SA4_Code', 'SA4_Name']
df_postcodes.set_index('Postcode', inplace=True)

In [147]:
df_postcodes.sample(10)

Unnamed: 0_level_0,SA4_Code,SA4_Name
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
5043.0,403.0,Adelaide - South
4517.0,316.0,Sunshine Coast
2829.0,105.0,Far West and Orana
3181.0,206.0,Melbourne - Inner
2421.0,106.0,Hunter Valley exc Newcastle
3977.0,212.0,Melbourne - South East
3052.0,206.0,Melbourne - Inner
3712.0,204.0,Hume
5035.0,401.0,Adelaide - Central and Hills
2120.0,121.0,Sydney - North Sydney and Hornsby


In [152]:
def postcode_to_region(pc):
    sa4    = df_postcodes['SA4_Name'][pc]
    gccsa  = df_sa4_to_state['GCCSA'][sa4]
    state  = df_sa4_to_state['State'][sa4]
    
    return sa4, gccsa, state

In [153]:
postcode_to_region(3000)

('Melbourne - Inner', 'Greater Melbourne', 'Victoria')