## Create data for fuzzy search

In [None]:
import pathlib
import pandas as pd
import json

## Set A

The Set A data was compiled manually from <https://www.jcs.mil/portals/36/documents/doctrine/other_pubs/ms_2525d.pdf>.

In [None]:
# from 001-kba notebook
# Flatten the Set A JSON into a single-level mapping and store it next to the
# other search data.
# Both files are opened explicitly as UTF-8: the output is dumped with
# ensure_ascii=False, so relying on the platform default encoding could
# produce a JSON file that is not valid UTF-8 (or fail to write at all).
with open('set_a_raw.json', 'r', encoding='utf-8') as fp:
    set_a_raw = json.load(fp)

# json_normalize joins nested keys with '.'; for a single JSON object it yields
# a one-row frame, and .T.to_dict()[0] turns that row into {flat_key: value}.
set_a_flat = pd.json_normalize(set_a_raw).T.to_dict()[0]

with open('../json/set_a.json', 'w', encoding='utf-8') as fp:
    json.dump(set_a_flat, fp, ensure_ascii=False, indent=2)

## Set B 2525D

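You need to clone <https://github.com/banderlog/mil-std-2525> first, so that its tables are available at `../mil-std-2525/tsv-tables/2525d/` relative to this notebook.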

### Get entities

In [None]:
def build_entity_tree(table: pd.DataFrame) -> dict:
    """Build a nested ``{name: code}`` dict from one sparse Entity Collection table.

    The first non-empty column among 'Entity', 'Entity Type' and
    'Entity Subtype' decides the level of a row; lower-level rows belong to
    the most recent row of the level above:

        A1 - - 1
        A2 - - 2  => {A1: 1, A2: {"": 2, B2: {"": 3, C2: 4}}}
        - B2 - 3
        - - C2 4

    A node that gets children is turned into a dict whose own code is kept
    under the empty-string key. Entity types named '{Reserved for future use}'
    are not stored.

    Args:
        table: frame with 'Entity', 'Entity Type', 'Entity Subtype' and 'Code'
            columns, where empty cells are empty strings.

    Returns:
        The nested dict for this table (empty for an empty table).

    Raises:
        KeyError, TypeError: when a row refers to a parent that is missing,
            e.g. a type row before any entity row, or a subtype row whose
            parent type was never stored.
    """
    tree = {}
    # Parent pointers are local to one table, so nothing can leak in from a
    # previously processed file or an earlier run of the cell.
    entity = None
    entity_type = None
    for _, row in table.iterrows():
        code = row['Code']
        # level A
        if row['Entity']:
            entity = row['Entity']
            entity_type = None
            tree[entity] = code
        # level B
        elif row['Entity Type']:
            entity_type = row['Entity Type']
            # the entity now has children: keep its own code under ""
            if not isinstance(tree[entity], dict):
                tree[entity] = {"": tree[entity]}
            # drop those
            if entity_type != '{Reserved for future use}':
                tree[entity][entity_type] = code
        # level C
        elif row['Entity Subtype']:
            types = tree[entity]
            # the entity type now has children: keep its own code under ""
            if not isinstance(types[entity_type], dict):
                types[entity_type] = {"": types[entity_type]}
            types[entity_type][row['Entity Subtype']] = code
    return tree


result = dict()
# sorted() makes the collection order (and so the exported JSON) independent
# of the order in which the filesystem happens to list the files
for tsv_path in sorted(pathlib.Path('../mil-std-2525/tsv-tables/2525d/').glob('*.tsv')):
    # get only entities: files with 'sector' in their name are skipped here
    if 'sector' in tsv_path.stem:
        continue
    try:
        table = pd.read_csv(tsv_path, delimiter='\t', keep_default_na=False, dtype='str')
        result[tsv_path.stem] = build_entity_tree(table)
    # best effort: a table that cannot be read or parsed is reported and
    # skipped as a whole (3 tab errors in Control Measures.tsv)
    except Exception as e:
        print(e)
        print(tsv_path)

### Get modifiers

In [None]:
# get only files with modifiers (sorted so the processing order is stable)
mods = sorted(
    p for p in pathlib.Path('../mil-std-2525/tsv-tables/2525d/').glob('*.tsv')
    if 'sector' in p.stem
)

# modifier labels that are not stored
skipped_labels = ['{Reserved for future use}', 'Version Extension Flag']

# add 'modifier_1' and 'modifier_2' dicts to some Entity Collections
#   find which EC have modifiers
for collection, entities in result.items():
    for mod_path in mods:
        # NOTE(review): this is a substring test, so a collection whose name is
        # the tail of another collection's name would match that one's files
        # too -- confirm against the actual file names.
        if collection + ' sector' not in mod_path.stem:
            continue
        # read modifier file
        mod_table = pd.read_csv(mod_path, delimiter='\t', keep_default_na=False, dtype='str')
        # detect whether it is modifier_1 or modifier_2: any '1' in the file
        # name selects the first modifier, everything else the second
        n = '1' if '1' in mod_path.stem else '2'
        modname = 'First Modifier' if n == '1' else 'Second Modifier'
        try:
            # {modifier label: code}; a label that occurs twice keeps its last code
            modifiers = {
                label: code
                for label, code in zip(mod_table[modname], mod_table['Code'])
                if label not in skipped_labels
            }
            entities[f'modifier_{n}'] = modifiers
        # best effort: a table that fails here (e.g. the expected column is
        # missing) is reported and left out
        except Exception as e:
            print(e)
            print(mod_path.stem)

In [None]:
# Flatten the nested collections into one level of '.'-joined keys:
# {EC: {A1: 1, A2: {"": 2, B2: {"": 3, C2: 4}}}} =>
#   EC.A1: 1
#   EC.A2: 2
#   EC.A2.B2: 3
#   EC.A2.B2.C2: 4
# NOTE(review): json_normalize joins every key with '.', so the "" entries
# presumably come out with a trailing dot ('EC.A2.', 'EC.A2.B2.') rather than
# as shown above -- confirm against the generated file.
# The flattening runs before the output is opened, so a failure here cannot
# leave a truncated JSON file behind.
set_b_flat = pd.json_normalize(result).T.to_dict()[0]

# explicit UTF-8: ensure_ascii=False writes non-ASCII characters verbatim
with open('../json/set_b_2525d.json', 'w', encoding='utf-8') as fp:
    json.dump(set_b_flat, fp, ensure_ascii=False, indent=2)

## Set B APP6D

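You need to clone <https://github.com/spatialillusions/stanag-app6> first, so that its tables are available at `../stanag-app6/tsv-tables/app6d/` relative to this notebook.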

### Get entities

In [None]:
def build_entity_tree(table: pd.DataFrame) -> dict:
    """Build a nested ``{name: code}`` dict from one sparse Entity Collection table.

    The first non-empty column among 'Entity', 'Entity Type' and
    'Entity Subtype' decides the level of a row; lower-level rows belong to
    the most recent row of the level above:

        A1 - - 1
        A2 - - 2  => {A1: 1, A2: {"": 2, B2: {"": 3, C2: 4}}}
        - B2 - 3
        - - C2 4

    A node that gets children is turned into a dict whose own code is kept
    under the empty-string key. Entity types named '{Reserved for future use}'
    are not stored.

    Args:
        table: frame with 'Entity', 'Entity Type', 'Entity Subtype' and 'Code'
            columns, where empty cells are empty strings.

    Returns:
        The nested dict for this table (empty for an empty table).

    Raises:
        KeyError, TypeError: when a row refers to a parent that is missing,
            e.g. a type row before any entity row, or a subtype row whose
            parent type was never stored.
    """
    tree = {}
    # Parent pointers are local to one table, so nothing can leak in from a
    # previously processed file or an earlier run of the cell.
    entity = None
    entity_type = None
    for _, row in table.iterrows():
        code = row['Code']
        # level A
        if row['Entity']:
            entity = row['Entity']
            entity_type = None
            tree[entity] = code
        # level B
        elif row['Entity Type']:
            entity_type = row['Entity Type']
            # the entity now has children: keep its own code under ""
            if not isinstance(tree[entity], dict):
                tree[entity] = {"": tree[entity]}
            # drop those
            if entity_type != '{Reserved for future use}':
                tree[entity][entity_type] = code
        # level C
        elif row['Entity Subtype']:
            types = tree[entity]
            # the entity type now has children: keep its own code under ""
            if not isinstance(types[entity_type], dict):
                types[entity_type] = {"": types[entity_type]}
            types[entity_type][row['Entity Subtype']] = code
    return tree


result = dict()
# sorted() makes the collection order (and so the exported JSON) independent
# of the order in which the filesystem happens to list the files
for tsv_path in sorted(pathlib.Path('../stanag-app6/tsv-tables/app6d/').glob('*.tsv')):
    # get only entities: files with 'sector' in their name are skipped here
    if 'sector' in tsv_path.stem:
        continue
    try:
        table = pd.read_csv(tsv_path, delimiter='\t', keep_default_na=False, dtype='str')
        result[tsv_path.stem] = build_entity_tree(table)
    # best effort: a table that cannot be read or parsed is reported and
    # skipped as a whole (3 tab errors in Control Measures.tsv)
    except Exception as e:
        print(e)
        print(tsv_path)

### Get modifiers

In [None]:
# get only files with modifiers (sorted so the processing order is stable)
mods = sorted(
    p for p in pathlib.Path('../stanag-app6/tsv-tables/app6d/').glob('*.tsv')
    if 'sector' in p.stem
)

# modifier labels that are not stored
skipped_labels = ['{Reserved for future use}', 'Version Extension Flag']

# add 'modifier_1' and 'modifier_2' dicts to some Entity Collections
#   find which EC have modifiers
for collection, entities in result.items():
    for mod_path in mods:
        # NOTE(review): this is a substring test, so a collection whose name is
        # the tail of another collection's name would match that one's files
        # too -- confirm against the actual file names.
        if collection + ' sector' not in mod_path.stem:
            continue
        # read modifier file
        mod_table = pd.read_csv(mod_path, delimiter='\t', keep_default_na=False, dtype='str')
        # detect whether it is modifier_1 or modifier_2: any '1' in the file
        # name selects the first modifier, everything else the second
        n = '1' if '1' in mod_path.stem else '2'
        modname = 'First Modifier' if n == '1' else 'Second Modifier'
        try:
            # {modifier label: code}; a label that occurs twice keeps its last code
            modifiers = {
                label: code
                for label, code in zip(mod_table[modname], mod_table['Code'])
                if label not in skipped_labels
            }
            entities[f'modifier_{n}'] = modifiers
        # best effort: a table that fails here (e.g. the expected column is
        # missing) is reported and left out
        except Exception as e:
            print(e)
            print(mod_path.stem)

In [None]:
# Flatten the nested collections into one level of '.'-joined keys:
# {EC: {A1: 1, A2: {"": 2, B2: {"": 3, C2: 4}}}} =>
#   EC.A1: 1
#   EC.A2: 2
#   EC.A2.B2: 3
#   EC.A2.B2.C2: 4
# NOTE(review): json_normalize joins every key with '.', so the "" entries
# presumably come out with a trailing dot ('EC.A2.', 'EC.A2.B2.') rather than
# as shown above -- confirm against the generated file.
# The flattening runs before the output is opened, so a failure here cannot
# leave a truncated JSON file behind.
set_b_flat = pd.json_normalize(result).T.to_dict()[0]

# explicit UTF-8: ensure_ascii=False writes non-ASCII characters verbatim
with open('../json/set_b_app6d.json', 'w', encoding='utf-8') as fp:
    json.dump(set_b_flat, fp, ensure_ascii=False, indent=2)