In [2]:
import pandas as pd
import numpy as np
import pickle

In [3]:
# Location of the ANZSIC codes-and-titles Excel workbook, relative to the
# directory the notebook is run from.
anzsic_path = './data/anzsic_codes_and_titles.xls'

In [4]:
# Load the sheet at zero-based position 4 of the workbook, skipping its first
# 5 rows (presumably title/preamble rows above the table -- TODO confirm
# against the workbook).
df = pd.read_excel(anzsic_path, sheet_name=4, skiprows=5)

In [5]:
# Labels for the six columns of the raw sheet, left to right. The first label
# is the placeholder string 'None' (not the None object). Each hierarchy level
# has its code in its own column and its title in the column to its right.
new_columns = [
    'None',
    'DivisionCode', 'SubdivisionCode', 'GroupCode', 'ClassCode',
    'ClassName',
]

In [6]:
# Swap the sheet's original headers for the descriptive labels (this relabels
# `df` in place), then preview the first few rows.
df.columns = new_columns
df.head()

Unnamed: 0,None,DivisionCode,SubdivisionCode,GroupCode,ClassCode,ClassName
0,,A,"Agriculture, Forestry and Fishing",,,
1,,,01,Agriculture,,
2,,,,011,Nursery and Floriculture Production,
3,,,,,0111,Nursery Production (Under Cover)
4,,,,,0112,Nursery Production (Outdoors)


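This is an inelegant way of doing it, but let's iterate over the rows: the sheet is laid out as an indented hierarchy, so we walk it from top to bottom and remember the most recent division, subdivision and group for each class we encounter.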

In [7]:
# Flatten the indented hierarchy into one record per class.
# Each record is a list:
#   [division code, division name, subdivision code, subdivision name,
#    group code, group name, class code, class name]
class_list = []

# Most recently seen ancestor at each level; updated as the walk descends.
current_div_code = current_div_name = None
current_subdiv_code = current_subdiv_name = None
current_group_code = current_group_name = None

for _, record in df.iterrows():
    div_cell = record['DivisionCode']
    subdiv_cell = record['SubdivisionCode']
    group_cell = record['GroupCode']
    class_cell = record['ClassCode']

    if pd.notna(div_cell):
        # Division row: the title sits one column to the right of the code.
        current_div_code, current_div_name = div_cell, subdiv_cell
    elif pd.notna(subdiv_cell) and len(subdiv_cell) == 2:
        # Subdivision row: 2-character code, title in the next column.
        current_subdiv_code, current_subdiv_name = subdiv_cell, group_cell
    elif pd.notna(group_cell) and len(group_cell) == 3:
        # Group row: 3-character code, title in the next column.
        current_group_code, current_group_name = group_cell, class_cell
    elif pd.notna(class_cell) and pd.notna(record['ClassName']):
        # Class row: emit a record carrying the ancestors seen so far.
        class_list.append([
            current_div_code, current_div_name,
            current_subdiv_code, current_subdiv_name,
            current_group_code, current_group_name,
            class_cell, record['ClassName'],
        ])
    
        
    

In [8]:
# Build a dataframe with one row per ANZSIC class. The columns give the code
# and name of the class and of the division, subdivision and group it belongs
# to, in the same order as the items of each `class_list` record.
column_names = [
    'DivisionCode',
    'DivisionName',
    'SubDivisionCode',
    'SubDivisionName',
    'GroupCode',
    'GroupName',
    'ClassCode',
    'ClassName',
]
df_anzsic = pd.DataFrame(data=class_list, columns=column_names)

# Persist the flattened table next to the notebook. With to_csv's defaults the
# dataframe index is written as the first (unnamed) column of the file.
df_anzsic.to_csv('anzsic_df.csv')
df_anzsic.head()


Unnamed: 0,DivisionCode,DivisionName,SubDivisionCode,SubDivisionName,GroupCode,GroupName,ClassCode,ClassName
0,A,"Agriculture, Forestry and Fishing",1,Agriculture,11,Nursery and Floriculture Production,111,Nursery Production (Under Cover)
1,A,"Agriculture, Forestry and Fishing",1,Agriculture,11,Nursery and Floriculture Production,112,Nursery Production (Outdoors)
2,A,"Agriculture, Forestry and Fishing",1,Agriculture,11,Nursery and Floriculture Production,113,Turf Growing
3,A,"Agriculture, Forestry and Fishing",1,Agriculture,11,Nursery and Floriculture Production,114,Floriculture Production (Under Cover)
4,A,"Agriculture, Forestry and Fishing",1,Agriculture,11,Nursery and Floriculture Production,115,Floriculture Production (Outdoors)


In [9]:
# Lookup table keyed by class code (item 6 of each record, a string). Each
# value is a dict pairing every column name with that record's value, i.e. the
# division, subdivision, group and class codes and names.
dict_anzsic = {
    record[6]: dict(zip(column_names, record))
    for record in class_list
}

# Save the lookup table to disk for reuse outside this notebook.
with open('anzsic_dict.pkl', 'wb') as pickle_file:
    pickle.dump(dict_anzsic, pickle_file)

# Example lookup by class code.
dict_anzsic['1334']

{'DivisionCode': 'C',
 'DivisionName': 'Manufacturing',
 'SubDivisionCode': '13',
 'SubDivisionName': 'Textile, Leather, Clothing and Footwear Manufacturing',
 'GroupCode': '133',
 'GroupName': 'Textile Product Manufacturing',
 'ClassCode': '1334',
 'ClassName': 'Textile Finishing and Other Textile Product Manufacturing'}

In [10]:
# The dict is keyed by class-code strings, so the short key '1' is not present
# and this lookup raises KeyError.
dict_anzsic['1']

KeyError: '1'

In [12]:
# Spot-check 20 randomly chosen classes. The fixed seed makes the same rows
# appear every time the notebook is re-run from a fresh kernel.
df_anzsic.sample(20, random_state=0)

Unnamed: 0,DivisionCode,DivisionName,SubDivisionCode,SubDivisionName,GroupCode,GroupName,ClassCode,ClassName
199,C,Manufacturing,24,Machinery and Equipment Manufacturing,249,Other Machinery and Equipment Manufacturing,2491,Lifting and Material Handling Equipment Manufa...
23,A,"Agriculture, Forestry and Fishing",1,Agriculture,15,Other Crop Growing,151,Sugar Cane Growing
263,F,Wholesale Trade,35,Motor Vehicle and Motor Vehicle Parts Wholesaling,350,Motor Vehicle and Motor Vehicle Parts Wholesaling,3503,Trailer and Other Motor Vehicle Wholesaling
59,B,Mining,9,Non-Metallic Mineral Mining and Quarrying,91,Construction Material Mining,911,Gravel and Sand Quarrying
203,C,Manufacturing,25,Furniture and Other Manufacturing,251,Furniture Manufacturing,2513,Mattress Manufacturing
213,D,"Electricity, Gas, Water and Waste Services",26,Electricity Supply,264,On Selling Electricity and Electricity Market ...,2640,On Selling Electricity and Electricity Market ...
12,A,"Agriculture, Forestry and Fishing",1,Agriculture,13,Fruit and Tree Nut Growing,135,Stone Fruit Growing
252,F,Wholesale Trade,33,Basic Material Wholesaling,333,Timber and Hardware Goods Wholesaling,3332,Plumbing Goods Wholesaling
451,Q,Health Care and Social Assistance,84,Hospitals,840,Hospitals,8402,Psychiatric Hospitals
385,K,Financial and Insurance Services,64,Auxiliary Finance and Insurance Services,641,Auxiliary Finance and Investment Services,6411,Financial Asset Broking Services
