In [18]:
import pandas as pd
import re
import numpy as np
# Presumably the root folder for this notebook's data files.
# NOTE(review): the other cells spell out "data/..." paths literally and never
# read this variable - confirm whether it is still needed.
datadir = "data"

# Extract the Data from XLS

In [3]:
def extract_anzsic_and_year(filename : str, sheet_number : int, header : int):
    """Load one worksheet and tag every yearly row with its ANZSIC subdivision.

    The first column of the sheet is read as a label column:

    * a label shaped like ``"NN text"`` (two digits, a space, then text) is a
      subdivision heading and is remembered for the rows that follow;
    * a label shaped like ``"YYYY–YY"`` marks a data row for that year;
    * a non-text label (for example an empty cell read as NaN) clears the
      remembered subdivision.

    Parameters
    ----------
    filename : str
        Path of the workbook, passed to ``pandas.read_excel``.
    sheet_number : int
        Sheet to read (forwarded as ``sheet_name``).
    header : int
        Row to use for the column names (forwarded as ``header``).

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the rows whose label is a year,
        without the label column, plus three string columns:
        "ANZSIC Subdivision Code", "ANZSIC Subdivision Name" and "Year"
        (the four-digit part of the label). Year rows that appear before any
        subdivision heading carry "NULL" in the two subdivision columns.
        The original row index is kept.
    """
    df = pd.read_excel(filename, sheet_name=sheet_number, header=header, skipfooter=3)

    # Raw strings so that "\d" is a regex escape, not an (invalid) string escape
    subdivision_re = re.compile(r"(\d\d) (.+)")
    year_re = re.compile(r"(\d\d\d\d)–(\d\d)")

    label_column = df.columns[0]
    codes, names, years = [], [], []

    # Start with "no subdivision seen yet" so a sheet that opens with a year
    # row is tagged "NULL" instead of failing on an unbound variable
    current = None
    for label in df.iloc[:, 0]:
        if isinstance(label, str):
            heading = subdivision_re.match(label)
            if heading:
                current = heading
            year = year_re.match(label)
        else:
            # Non-text cell: forget the current subdivision
            current = None
            year = None

        codes.append(current.group(1) if current else "NULL")
        names.append(current.group(2) if current else "NULL")
        years.append(year.group(1) if year else "NULL")

    # Build the new columns in one go rather than cell by cell
    df["ANZSIC Subdivision Code"] = codes
    df["ANZSIC Subdivision Name"] = names
    df["Year"] = years

    # The label column has been parsed into the three columns above
    df = df.drop(columns=label_column)

    # Return a real copy so callers can modify the result without pandas
    # warning about writing into a slice of another frame
    return df[df["Year"] != "NULL"].copy()

In [37]:
# Worksheets to load; "header" is the row passed to read_excel as the
# column-name row for that sheet
sheets = [
    {"sheet_number" : 1, "header" : 4},
    {"sheet_number" : 2, "header" : 4},
    {"sheet_number" : 3, "header" : 6},
    {"sheet_number" : 4, "header" : 6},
]

# One extracted frame per worksheet, in the order listed above
dfs = [
    extract_anzsic_and_year("data/performance_by_anzsic.xls", **sheet_kwargs)
    for sheet_kwargs in sheets
]

In [39]:
# Join what we can: place every sheet except the last side by side, then keep
# only the first occurrence of each repeated column name (every extracted
# sheet carries its own ANZSIC code/name and Year columns)
df1 = pd.concat(dfs[:-1], axis=1, sort=False)
df1 = df1.loc[:, ~df1.columns.duplicated()]
# And leave the rest: the last sheet stays separate. Work on a copy so the
# fix-up below does not write into the frame still held in `dfs` and does not
# trigger pandas' SettingWithCopyWarning.
df2 = dfs[-1].copy()

# Fix the hidden data
# The three columns just before the trailing ANZSIC code / name / Year columns
# are presumably the profit / broke-even / loss percentages - TODO confirm
# against the sheet. When any of them is missing, "Broke even" is set to 0 and
# "Made a profit" to 100 minus the last of the three.
for i, row in df2.iterrows():
    p_e_l = row.iloc[-6:-3].values
    if pd.isna(p_e_l).any():
        df2.at[i, "Broke even"] = 0
        df2.at[i, "Made a profit"] = 100 - p_e_l[-1]
        

# Write to disk

In [44]:
# Pickle away: write both frames to the files that anzsic_data() reads back
for frame, pickle_path in (
    (df1, "data/pickles/performance_by_anzsic.pkl"),
    (df2, "data/pickles/performance_by_anzsic_supplementary.pkl"),
):
    frame.to_pickle(pickle_path)

# Query Function for external use

In [276]:
def anzsic_data(anzsic_subdivision : int, pickledir = "data/pickles"):
    """Return the stored rows for one ANZSIC subdivision, indexed by year.

    Parameters
    ----------
    anzsic_subdivision : int
        Subdivision number; it is zero-padded to two digits before being
        compared with the "ANZSIC Subdivision Code" column.
    pickledir : str
        Folder holding ``performance_by_anzsic.pkl`` and
        ``performance_by_anzsic_supplementary.pkl``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The matching rows of the main table and of the supplementary table,
        each with "Year" as its index.
    """
    code = "{:02d}".format(anzsic_subdivision)
    path_templates = (
        "{}/performance_by_anzsic.pkl",
        "{}/performance_by_anzsic_supplementary.pkl",
    )

    matches = []
    for template in path_templates:
        table = pd.read_pickle(template.format(pickledir))
        matches.append(table[table["ANZSIC Subdivision Code"] == code])

    main_slice, supplementary_slice = matches
    return main_slice.set_index("Year"), supplementary_slice.set_index("Year")

