# Create OGSL table
Create a DataFrame that associates every sign value that is recognized in [ORACC](http://oracc.org) with a Unicode code point (or a sequence of Unicode code points). The table may be used to translate a text in transliteration into a sequence of Unicode code points for use in `fasttext`. The data are derived from the ORACC Global Sign List ([OGSL](http://build-oracc.museum.upenn.edu)).


In [1]:
import pandas as pd
import zipfile
import json
import os
import sys
util_dir = os.path.abspath('../utils')
sys.path.append(util_dir)
from utils import *
import pickle

## 0 Create Directories, if Necessary
The two directories needed for this script are `jsonzip` and `output`. The directories are created with the function `make_dirs()` from the `utils` module. 

In [2]:
# jsonzip holds the downloaded ZIP archive; output receives the pickled table.
directories = ["jsonzip", "output"]
make_dirs(directories)

## 1 Download the ZIP file

In [3]:
# oracc_download() expects a list, hence the one-element list.
project = ["ogsl"]
oracc_download(project)

Downloading http://build-oracc.museum.upenn.edu/json/ogsl.zip saving as jsonzip/ogsl.zip


12it [00:00, 31.66it/s]


## <a name="head21"></a>2 The `parsejson()` function

In [4]:
def parsejson(data_json, records=None):
    """Flatten the sign list into one record per (sign name, value) pair.

    Parameters
    ----------
    data_json : dict
        Parsed sign-list JSON. ``data_json["signs"]`` maps each sign name to a
        dict that may contain ``"values"`` (an iterable of sign values),
        ``"utf8"`` and ``"hex"``. Signs without ``"values"`` are skipped.
    records : list, optional
        List that receives the new records. When omitted, the module-level
        list ``s_l`` is used, so a plain ``parsejson(data_json)`` call keeps
        filling that list.

    Returns
    -------
    list of dict
        The list that was appended to. Every appended record has the keys
        ``"value"``, ``"name"``, ``"utf8"`` and ``"hex"``; the last two are
        empty strings when the sign has no ``"utf8"`` entry.
    """
    if records is None:
        # Fall back to the notebook-level accumulator for existing callers.
        records = s_l
    for sign_name, sign_data in data_json["signs"].items():
        if "values" not in sign_data:
            continue
        if "utf8" in sign_data:
            utf8 = sign_data["utf8"]
            # A sign with "utf8" but without "hex" gets an empty hex field
            # instead of raising KeyError.
            hex_code = sign_data.get("hex", "")
        else:
            utf8 = ""
            hex_code = ""
        for sign_value in sign_data["values"]:
            # A fresh dict per record: no shared scratch dictionary is needed.
            records.append({
                "value": sign_value,
                "name": sign_name,
                "utf8": utf8,
                "hex": hex_code,
            })
    return records

## 3 Main Process

In [5]:
s = {}      # module-level scratch dict; kept because parsejson() may reference it as a global
s_l = []    # accumulator: parsejson() appends one dict per (sign name, value) pair
file = "jsonzip/ogsl.zip"
filename = "ogsl/ogsl-sl.json"
# The context manager closes the archive even if reading or decoding fails.
with zipfile.ZipFile(file) as z:
    signlist = z.read(filename).decode('utf-8')
data_json = json.loads(signlist)                # parse the JSON text into nested dicts and lists
parsejson(data_json)

## 4 Make Dataframe

In [6]:
# Build the table from the records collected in s_l (one row per record).
df = pd.DataFrame(s_l)
# Preview the first rows only instead of rendering the whole table.
df.head(10)

Unnamed: 0,hex,name,utf8,value
0,x12000,A,𒀀,ʾu₄
1,x12000,A,𒀀,a
2,x12000,A,𒀀,aia₂
3,x12000,A,𒀀,aya₂
4,x12000,A,𒀀,barₓ
5,x12000,A,𒀀,buniŋₓ
6,x12000,A,𒀀,burₓ
7,x12000,A,𒀀,dur₅
8,x12000,A,𒀀,duru₅
9,x12000,A,𒀀,e₄


In [7]:
# Write the DataFrame to disk as a pickle in the output directory.
with open("output/ogsl.p", "wb") as pickle_file:
    pickle.dump(df, pickle_file)