In [149]:
import pandas as pd
import json

In [86]:
# Load the three GSIN category tables. header=1 takes the column names from
# the second line of each file, so the first line is skipped.
services, goods, srg = [
    pd.read_csv(csv_path, header=1)
    for csv_path in (
        "../gsin/GSIN_Categories-Services.csv",
        "../gsin/GSIN_Categories-Goods.csv",
        "../gsin/GSIN_Categories-Services_Related_to_Goods.csv",
    )
]

## Services

In [89]:
# Short column names for the services table, in file order. The third column
# is labelled "meaningless"; no later visible cell reads it.
service_columns = [
    "L1",
    "L2",
    "meaningless",
    "gsin",
    "description_en",
]

In [90]:
# Rename the services columns to the short names listed in service_columns.
services.columns = service_columns

In [91]:
# Forward-fill missing values: each empty cell takes the last non-missing
# value above it in the same column.
# NOTE(review): presumably the CSV only writes the L1/L2 label on the first
# row of each group - confirm against the raw file.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in
# pandas 2.1+.
services = services.ffill()

In [106]:
def get_L1_name(L1):
    """Return the name part of an L1 label of the form "<code>. <name>".

    The label is split at the FIRST period only, so a name that itself
    contains periods is returned in full instead of being cut off at its
    first period.

    Parameters
    ----------
    L1 : str
        Raw L1 label, e.g. "A. Research and Development".

    Returns
    -------
    str
        The text after the first period with surrounding whitespace removed.
        If the label contains no period it is returned unchanged.
    """
    _code, dot, name = L1.partition(".")
    if not dot:
        # No "<code>." prefix: the whole label doubles as the name.
        return L1
    return name.strip()

def get_L1_code(L1):
    """Return the code part of an L1 label of the form "<code>. <name>".

    The code is the text before the first period, with surrounding
    whitespace removed. A label that contains no period is returned
    unchanged (not stripped).
    """
    head, dot, _rest = L1.partition(".")
    return head.strip() if dot else head

def get_L2_name(L2):
    """Return the name part of an L2 label of the form "<code> <name>".

    The label is split at its first run of whitespace after ignoring any
    leading whitespace. A label with leading spaces, or with a tab between
    code and name, therefore no longer leaks the code into the returned name.

    Parameters
    ----------
    L2 : str
        Raw L2 label, e.g. "AA Agriculture".

    Returns
    -------
    str
        Everything after the code, stripped of surrounding whitespace.
        A label consisting of a single token is returned as that token, and
        an empty or whitespace-only label gives "".
    """
    # split(None, 1) skips leading whitespace and splits once on any whitespace.
    tokens = L2.split(None, 1)
    if not tokens:
        return ""
    if len(tokens) == 1:
        # Only one token: it serves as both code and name.
        return tokens[0]
    return tokens[1].strip()

def get_L2_code(L2):
    """Return the code part of an L2 label of the form "<code> <name>".

    The code is the first whitespace-delimited token. Leading whitespace is
    ignored, so " AA Agriculture" yields "AA" rather than an empty string,
    and a tab between code and name is treated as a separator.

    Parameters
    ----------
    L2 : str
        Raw L2 label, e.g. "AA Agriculture".

    Returns
    -------
    str
        The first token of the label, or "" when the label is empty or
        whitespace-only.
    """
    tokens = L2.split(None, 1)
    return tokens[0] if tokens else ""

In [95]:
# Derive the L1 category name from each raw L1 label via get_L1_name.
services["L1_name"] = services["L1"].map(get_L1_name)

In [97]:
# Derive the L1 category code from each raw L1 label via get_L1_code.
services["L1_code"] = services["L1"].map(get_L1_code)

In [104]:
# Derive the L2 category name from each raw L2 label via get_L2_name.
services["L2_name"] = services["L2"].map(get_L2_name)

In [107]:
# Derive the L2 category code from each raw L2 label via get_L2_code.
services["L2_code"] = services["L2"].map(get_L2_code)

## Goods

In [136]:
# Short column names for the goods table, in file order. The same list is
# reused for the services-related-to-goods table further down.
goods_cols = [
    "L1",
    "L2",
    "gsin",
    "description_en",
]

In [137]:
# Rename the goods columns to the short names listed in goods_cols.
goods.columns = goods_cols

In [138]:
# Forward-fill missing values: each empty cell takes the last non-missing
# value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in
# pandas 2.1+.
goods = goods.ffill()

In [139]:
# Split each goods L1 label at its first space: the text before it, prefixed
# with "N", becomes the code, and the text after it becomes the name.
# NOTE(review): the "N" prefix presumably keeps goods codes distinct from the
# service codes - confirm.
goods["L1_code"] = goods["L1"].map(lambda label: "N" + label.partition(" ")[0])
goods["L1_name"] = goods["L1"].map(lambda label: label.partition(" ")[2])

In [140]:
# Same split for the goods L2 label: the text before the first space,
# prefixed with "N", is the code; everything after that space is the name.
goods["L2_code"] = goods["L2"].map(lambda label: "N" + label.partition(" ")[0])
goods["L2_name"] = goods["L2"].map(lambda label: label.partition(" ")[2])

## Services related to goods

In [113]:
# The services-related-to-goods frame gets the same four column names as goods.
srg.columns = goods_cols

In [116]:
# Forward-fill missing values: each empty cell takes the last non-missing
# value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in
# pandas 2.1+.
srg = srg.ffill()

In [118]:
# Derive the L1 category name from each raw L1 label via get_L1_name
# (the same helper used for the services frame).
srg["L1_name"] = srg["L1"].map(get_L1_name)

In [121]:
# Derive the L1 category code from each raw L1 label via get_L1_code
# (the same helper used for the services frame).
srg["L1_code"] = srg["L1"].map(get_L1_code)

In [130]:
# Derive the L2 category name and code from each raw L2 label, using the same
# get_L2_name / get_L2_code helpers as the services frame.
srg["L2_name"] = srg["L2"].map(get_L2_name)
srg["L2_code"] = srg["L2"].map(get_L2_code)

In [131]:
# There are 2 L1 categories and only 2 L2 categories, so the L2 categories appear to be the same thing.
# However, there are ~900 L3/full GSIN categories, so it is probably worth going to L3 eventually.

## Save dicts

In [132]:
# Derived columns that all three frames share; only these are carried into
# the combined frame.
cols_of_interest = [
    "L1_name",
    "L1_code",
    "L2_name",
    "L2_code",
]

In [142]:
# Stack the shared columns of the three frames (services, then goods, then
# services related to goods) into one frame. Each frame keeps its own row
# labels here, so labels repeat until the index is rebuilt.
df = pd.concat(
    [frame[cols_of_interest] for frame in (services, goods, srg)]
)

In [144]:
# Replace the row labels with a fresh 0..len(df)-1 index so every row has a
# unique label.
df.index = pd.RangeIndex(len(df))

In [146]:
# Build two lookups from the combined frame:
#   L1_dict: L1 code -> L1 name
#   L2_dict: L2 code -> "<L1 name> > <L2 name>"
# When a code occurs on several rows, the last row wins.
# The columns are iterated in lockstep with zip rather than looked up per row
# by label. That avoids one label lookup per cell and does not rely on the
# frame's index labels being unique.
L1_dict = {}
L2_dict = {}

for L1, L1_name, L2, L2_name in zip(
    df["L1_code"], df["L1_name"], df["L2_code"], df["L2_name"]
):
    L1_dict[L1] = L1_name
    L2_dict[L2] = L1_name + " > " + L2_name

In [153]:
# Write the L1 lookup as JSON to both output locations. Each file is opened
# in a `with` block so the handle is flushed and closed as soon as the dump
# finishes, instead of being left for the garbage collector.
for out_path in (
    "../gsin/L1_dict.json",
    "../../single-point-of-access-prototype/data/L1_dict.json",
):
    with open(out_path, "w") as out_file:
        json.dump(L1_dict, out_file)

In [154]:
# Write the L2 lookup as JSON to both output locations. Each file is opened
# in a `with` block so the handle is flushed and closed as soon as the dump
# finishes, instead of being left for the garbage collector.
for out_path in (
    "../gsin/L2_dict.json",
    "../../single-point-of-access-prototype/data/L2_dict.json",
):
    with open(out_path, "w") as out_file:
        json.dump(L2_dict, out_file)