In [152]:
import pandas as pd
import json

All CSV files used here were generated from Andrew Hill's spreadsheets (his email is andrew.hill@tpsgc-pwgsc.gc.ca). His construction spreadsheet contains two sheets that appear to be identical; I used the first one.

## Services

In [153]:
# Load the services GSIN export. header=1 takes the column names from row 1
# (0-based) of the file, so the row above it is not read as data.
# NOTE(review): no dtype is passed, so pandas infers every column's type --
# confirm that code-like columns are not being parsed as numbers.
services = pd.read_csv("../gsin/GSIN_Categories-Services.csv", header=1)

In [154]:
# Short positional names for the services sheet's five columns.
service_columns = [
    "L1",              # raw level-1 label (code and name combined)
    "L2",              # raw level-2 label (code and name combined)
    "meaningless",     # column the notebook never reads
    "gsin",            # full GSIN code
    "description_en",  # English description of the GSIN code
]

In [155]:
# Rename the columns positionally to the short names in service_columns.
services.columns = service_columns

In [156]:
# Forward-fill missing values: every blank cell inherits the nearest non-missing
# value above it in the same column. DataFrame.ffill() is the supported
# equivalent of fillna(method='ffill'), whose `method` argument is deprecated.
services = services.ffill()

In [157]:
def get_L1_name(L1):
    """Return the name portion of a raw L1 label such as ``"A. Research"``.

    The label is split at its first "." only, so a name that itself contains
    periods is returned whole instead of being cut at the second ".".

    Args:
        L1: raw level-1 label string.

    Returns:
        The text after the first ".", stripped of surrounding whitespace, or
        ``L1`` unchanged when it contains no ".".
    """
    _code, dot, name = L1.partition(".")
    if not dot:
        # No separator present: the whole label is treated as the name.
        return L1
    return name.strip()

def get_L1_code(L1):
    """Extract the code portion of a raw L1 label.

    The code is whatever precedes the first "." in ``L1``, with surrounding
    whitespace removed. A label that contains no "." is returned unchanged.
    """
    head, dot, _rest = L1.partition(".")
    return head.strip() if dot else L1

def get_L2_name(L2):
    """Extract the name portion of a raw L2 label.

    The name is everything after the first single space in ``L2``, with
    surrounding whitespace removed. A label without a space is returned
    unchanged.
    """
    _code, space, remainder = L2.partition(" ")
    if not space:
        return L2
    return remainder.strip()

def get_L2_code(L2):
    """Extract the code portion of a raw L2 label.

    The code is whatever precedes the first single space in ``L2``, with
    surrounding whitespace removed. A label without a space is returned
    unchanged.
    """
    head, space, _name = L2.partition(" ")
    return head.strip() if space else L2

In [158]:
# Split each raw label column into separate name and code columns.
raw_l1, raw_l2 = services["L1"], services["L2"]
services["L1_name"] = raw_l1.map(get_L1_name)
services["L1_code"] = raw_l1.map(get_L1_code)
services["L2_name"] = raw_l2.map(get_L2_name)
services["L2_code"] = raw_l2.map(get_L2_code)

In [159]:
# Level 3 is the full GSIN entry: its code and its English description.
services = services.assign(
    L3_code=services["gsin"],
    L3_name=services["description_en"],
)

In [160]:
# Every row of this frame belongs to the top-level "Services" category
# (a scalar is broadcast to all rows).
services["L0_name"] = "Services"

## Goods

In [161]:
# Load the goods GSIN export. header=1 takes the column names from row 1
# (0-based) of the file, so the row above it is not read as data.
# NOTE(review): no dtype is passed, so pandas infers every column's type --
# confirm that code-like columns are not being parsed as numbers.
goods = pd.read_csv("../gsin/GSIN_Categories-Goods.csv", header=1)

In [162]:
# Short positional names for a four-column category sheet.
goods_cols = [
    "L1",              # raw level-1 label (code and name combined)
    "L2",              # raw level-2 label (code and name combined)
    "gsin",            # full GSIN code
    "description_en",  # English description of the GSIN code
]

In [163]:
# Rename the columns positionally to the short names in goods_cols.
goods.columns = goods_cols

In [164]:
# Forward-fill missing values: every blank cell inherits the nearest non-missing
# value above it in the same column. DataFrame.ffill() is the supported
# equivalent of fillna(method='ffill'), whose `method` argument is deprecated.
goods = goods.ffill()

In [165]:
# The text before the first space of the raw L1 label is the code (prefixed
# with "N"); everything after that space is the name.
goods["L1_code"] = goods["L1"].map(lambda label: "N" + label.partition(" ")[0])
goods["L1_name"] = goods["L1"].map(lambda label: label.partition(" ")[2])

In [166]:
# The text before the first space of the raw L2 label is the code (prefixed
# with "N"); everything after that space is the name.
goods["L2_code"] = goods["L2"].map(lambda label: "N" + label.partition(" ")[0])
goods["L2_name"] = goods["L2"].map(lambda label: label.partition(" ")[2])

In [167]:
# Level 3 is the full GSIN entry: its code and its English description.
goods = goods.assign(
    L3_code=goods["gsin"],
    L3_name=goods["description_en"],
)

In [168]:
# Every row of this frame belongs to the top-level "Goods" category
# (a scalar is broadcast to all rows).
goods["L0_name"] = "Goods"

## Services related to goods

In [169]:
# Load the services-related-to-goods GSIN export. header=1 takes the column
# names from row 1 (0-based) of the file, so the row above it is not read as data.
# NOTE(review): no dtype is passed, so pandas infers every column's type --
# confirm that code-like columns are not being parsed as numbers.
srg = pd.read_csv("../gsin/GSIN_Categories-Services_Related_to_Goods.csv", header=1)

In [170]:
# Rename the columns positionally, reusing the column names defined for the
# goods sheet (goods_cols).
srg.columns = goods_cols

In [171]:
# Forward-fill missing values: every blank cell inherits the nearest non-missing
# value above it in the same column. DataFrame.ffill() is the supported
# equivalent of fillna(method='ffill'), whose `method` argument is deprecated.
srg = srg.ffill()

In [172]:
# srg.drop(index=srg.query("gsin == '-'").index, inplace=True)

In [173]:
# Derive the L1 display name from each raw L1 label using get_L1_name.
srg["L1_name"] = srg["L1"].map(get_L1_name)

In [174]:
# Derive the L1 code from each raw L1 label using get_L1_code.
srg["L1_code"] = srg["L1"].map(get_L1_code)

In [175]:
# Split the raw L2 label column into separate name and code columns.
raw_srg_l2 = srg["L2"]
srg["L2_name"] = raw_srg_l2.map(get_L2_name)
srg["L2_code"] = raw_srg_l2.map(get_L2_code)

In [176]:
# Services related to goods are filed under the top-level "Services" category
# (a scalar is broadcast to all rows).
srg["L0_name"] = "Services"

In [177]:
# Level 3 is the full GSIN entry: its code and its English description.
srg = srg.assign(
    L3_code=srg["gsin"],
    L3_name=srg["description_en"],
)

In [178]:
# There are 2 L1 categories and only 2 L2 categories, so the L2 categories appear to be the same thing as the L1 ones.
# However, there are ~900 L3 (full GSIN) categories, so it is probably worth going down to L3 eventually.

## Construction

In [191]:
# Load the construction codes export. header=1 takes the column names from
# row 1 (0-based) of the file, so the row above it is not read as data.
# NOTE(review): no dtype is passed, so pandas infers every column's type --
# confirm that code-like columns are not being parsed as numbers.
construction = pd.read_csv("../gsin/Construction_Codes_CPC.csv", header=1)

In [192]:
# Names for the nine columns of the construction sheet. The trailing
# single-letter names are placeholders for columns that are deleted again
# further down.
construction_columns = [
    "L1",
    "L2",
    "L3_code",
    "L3_name",
    "L3_name_fr",
] + list("abcd")

In [193]:
# Rename the columns positionally to the names in construction_columns.
construction.columns = construction_columns

In [194]:
# Discard the four placeholder columns; only L1, L2 and the L3 columns remain.
construction = construction.drop(columns=["a", "b", "c", "d"])

In [195]:
# Forward-fill missing values: every blank cell inherits the nearest non-missing
# value above it in the same column. DataFrame.ffill() is the supported
# equivalent of fillna(method='ffill'), whose `method` argument is deprecated.
construction = construction.ffill()

In [196]:
# Remove rows that are entirely empty, then the rows labelled 159 and 160:
# the last 2 rows don't contain data.
# NOTE(review): this runs after the forward fill, so only rows with nothing
# above them to inherit can still be entirely empty here -- confirm that this
# ordering is intended.
construction = construction.dropna(how="all").drop(index=[159, 160])

In [197]:
# The text before the first space of the raw L1 label is the code; everything
# after that space is the name.
construction["L1_code"] = construction["L1"].map(lambda label: label.partition(" ")[0])
construction["L1_name"] = construction["L1"].map(lambda label: label.partition(" ")[2])

In [198]:
# The text before the first space of the raw L2 label is the code; everything
# after that space is the name.
construction["L2_code"] = construction["L2"].map(lambda label: label.partition(" ")[0])
construction["L2_name"] = construction["L2"].map(lambda label: label.partition(" ")[2])

In [199]:
# Every row of this frame belongs to the top-level "Construction" category
# (a scalar is broadcast to all rows).
construction["L0_name"] = "Construction"

## Save dicts

In [200]:
# Columns shared by all four frames: the top-level name, then a name/code pair
# for each of the three hierarchy levels.
cols_of_interest = ["L0_name"] + [
    level + suffix
    for level in ("L1", "L2", "L3")
    for suffix in ("_name", "_code")
]

In [201]:
# Stack the shared columns of all four category frames into one table
# (row labels are kept as they were in each source frame).
df = pd.concat(
    [frame[cols_of_interest] for frame in (services, goods, srg, construction)]
)

In [202]:
def _drop_line_breaks(text):
    """Return ``text`` with every newline character removed (nothing is inserted)."""
    return text.replace("\n", "")


# NOTE(review): words separated only by a line break end up fused together --
# confirm that no space is wanted in their place.
df["L3_name"] = df["L3_name"].map(_drop_line_breaks)

In [203]:
# Renumber the rows 0..n-1 so that each row label is unique after the concat.
df = df.reset_index(drop=True)

In [204]:
# For each hierarchy level keep two lookups: full display name -> code, and
# code -> full display name (the "_reverse" dicts).
L1_dict, L1_dict_reverse = {}, {}
L2_dict, L2_dict_reverse = {}, {}
L3_dict, L3_dict_reverse = {}, {}

for i in df.index:
    L1 = df.at[i, "L1_code"]
    L2 = df.at[i, "L2_code"]
    L3 = df.at[i, "L3_code"]

    # A full name is the " > "-joined path from the top-level category down to
    # the level in question, so each level extends its parent's path.
    L1_fullname = df.at[i, "L0_name"] + " > " + df.at[i, "L1_name"]
    L2_fullname = L1_fullname + " > " + df.at[i, "L2_name"]
    L3_fullname = L2_fullname + " > " + df.at[i, "L3_name"]

    # Later rows overwrite earlier ones whenever a name or a code repeats.
    L1_dict[L1_fullname], L1_dict_reverse[L1] = L1, L1_fullname
    L2_dict[L2_fullname], L2_dict_reverse[L2] = L2, L2_fullname
    L3_dict[L3_fullname], L3_dict_reverse[L3] = L3, L3_fullname

In [205]:
# One code -> full-name lookup spanning all three levels. On a key clash the
# deeper level wins, because it is merged in later.
full_dict_reverse = dict(L1_dict_reverse)
full_dict_reverse.update(L2_dict_reverse)
full_dict_reverse.update(L3_dict_reverse)

In [206]:
# Write the L1 name -> code lookup to both output locations. Each file is
# opened in a `with` block so that it is flushed and closed deterministically
# instead of being left to garbage collection.
for out_path in (
    "../gsin/L1_dict.json",
    "../../single-point-of-access-prototype/data/L1_dict.json",
):
    with open(out_path, "w") as out_file:
        json.dump(L1_dict, out_file)

In [207]:
# Write the L1 code -> name lookup to both output locations. Each file is
# opened in a `with` block so that it is flushed and closed deterministically
# instead of being left to garbage collection.
for out_path in (
    "../gsin/L1_dict_reverse.json",
    "../../single-point-of-access-prototype/data/L1_dict_reverse.json",
):
    with open(out_path, "w") as out_file:
        json.dump(L1_dict_reverse, out_file)

In [208]:
# Write the L2 name -> code lookup to both output locations. Each file is
# opened in a `with` block so that it is flushed and closed deterministically
# instead of being left to garbage collection.
for out_path in (
    "../gsin/L2_dict.json",
    "../../single-point-of-access-prototype/data/L2_dict.json",
):
    with open(out_path, "w") as out_file:
        json.dump(L2_dict, out_file)

In [211]:
# Write the L2 code -> name lookup to both output locations. Each file is
# opened in a `with` block so that it is flushed and closed deterministically
# instead of being left to garbage collection.
for out_path in (
    "../gsin/L2_dict_reverse.json",
    "../../single-point-of-access-prototype/data/L2_dict_reverse.json",
):
    with open(out_path, "w") as out_file:
        json.dump(L2_dict_reverse, out_file)

In [212]:
# Write the combined code -> name lookup to both output locations. Each file
# is opened in a `with` block so that it is flushed and closed
# deterministically instead of being left to garbage collection.
for out_path in (
    "../gsin/full_dict_reverse.json",
    "../../single-point-of-access-prototype/data/full_dict_reverse.json",
):
    with open(out_path, "w") as out_file:
        json.dump(full_dict_reverse, out_file)