# Building Knowledge Graph 

We'll be using the 211 Human Services Taxonomy and the corresponding Taxonomy codes from the IN 211 data to construct an initial KG of the data.  
Afterwards, we'll look for relations and connections between services

## 211 Taxonomy Refactor

The original schema of the taxonomy is very annoying to parse and has a lot of info we're not interested.  
We'll refactor it into a form that's easier to work with and save
several helper data files.  

We'll end with:
- taxo_nodes.json: a list of node dicts with their properties (except child nodes) 
- taxo_edge_list.json: a list of 3-tuple edges: 
    (node_num1, <Relation\>, node_num2)
- node_num_to_code.json: translates a unique node_num to it's taxonomy code
- code_to_node_num.json: same as above, but the inverse
- taxo_refs.csv: a list of synonyms or alternate names for the different taxonomy entities/terms. **Similar to user search queries so could be used for recommendation training**
- taxo_facets.csv: a list of intended use of the taxonomy entities/terms.  
    *Allowable values are: Service, Target, Organization/Facility Type, Modality, and Named Program*. Could be used as Main Categories


In [167]:
import xml.etree.ElementTree as ET 
import xmlschema
from os import path
import networkx as nx
import matplotlib.pyplot as plt
from sys import exit
import json
import csv

In [168]:
def save_to_json(taxo_dict, dst):
    taxo_json = json.dumps(taxo_dict)
    with open(dst, 'w+') as taxonomy_json:
        taxonomy_json.write(taxo_json)

def save_to_csv(taxo_list, dst):
    with open(dst, 'w+') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows(taxo_list)

In [11]:
taxonomy_path = path.abspath('./data/currentTaxonomy_211.xml')
taxo_schema_path = path.abspath('./data/211_taxonomy_xml_schema.xsd')

# validate that taxonomy follows their own schema

if not path.isfile(taxo_schema_path):
    print("Fatal error! Taxonomy schema file {} does not exist".format(taxo_schema_path))
    exit(1)

elif not path.isfile(taxonomy_path):
    print("Fatal error! Taxonomy file {} does not exist".format(taxonomy_path))
    exit(1)
else:

    # describes taxonomy's structure, attributes, etc
    taxo_schema = xmlschema.XMLSchema(taxo_schema_path, converter=xmlschema.AbderaConverter)

    if taxo_schema.is_valid(taxonomy_path):
        
        # load the 211 Human Services taxonomy xml and as a json
        services_taxonomy = ET.parse(taxonomy_path)
        taxonomy_dict = taxo_schema.to_dict(taxonomy_path)
        taxo_root = services_taxonomy.getroot()
        
        # dict is very dense and wouldn't print without increasing print IO limit. 
        # save to file for analysis and choosing attrs to keep
        save_to_json(taxo_dict=taxonomy_dict, dst='./data/temp_taxo.json')
            
    else: 
        print("Error! The taxonomy doesn't follow provided schema and is invalid. List of errors:")
        for i, err in enumerate(schema_errs):
            print("{}. {}".format(i, err))

In [154]:
# extract info to graph out taxonomy 

%matplotlib inline

node_num = 0

# helper json or csv files to be saved later
node_num_to_code = {}
code_to_node_num = {}
taxo_nodes = {}
taxo_edgelist = set()
taxo_refs = set()
taxo_facets = set()

In [155]:
def parseNode(node_info, node_code):
    
    # added related key even if not present to work around 
    # key value type inconsistency
    global node_num
    node = {
        'node_num': node_num,
        'name': node_info['name'],
        'description': node_info['definition'],
        'facet': node_info['facet'],
        'code': node_code,
        'related': [],
        'child_nodes': []
    }
    
    # for translating related concepts later
    code_to_node_num[node_code] = node_num
    node_num_to_code[node_num] = node_code
    # categories 
    taxo_facets.add(node['facet'])
    taxo_refs.add(node['name'])
    
    
    # some nodes don't have these attributes
    if 'comments' in node_info:
        node['comments'] = node_info['comments']
        
    if 'useReference' in node_info:
        keywords = node_info['useReference']
        node['keywords'] = keywords
        # add the elements from the keywords list to the set
        [(lambda el: taxo_refs.add(el))(el) for el in keywords]
        
    if 'relatedConcept' in node_info:
        related = node_info['relatedConcept']
        # need to check if list or single dict (bad design again)
        # extract related concepts from XML structure
        if isinstance(related, list):
                rel_codes  = [rel['attributes']['code'] for rel in related]
        else:
            rel_codes = related['attributes']['code']
            
        # add rel codes to node. convert to list if str
        rel_codes = rel_codes if isinstance(rel_codes, list) else [rel_codes]
        node['related'] = node['related'] + rel_codes
            
    if 'seeAlso' in node_info:
        # need to check if list or single string (bad design)
        related = node_info['seeAlso']
        related = related if isinstance(related, list) else [related]
        
        node['related'] = node['related'] + related
    
    return node

In [156]:
def addRelatedNodes(node):
    for rel in node['related']:
        if rel in code_to_node_num:
            rel_node_num = code_to_node_num[rel]
            edge = (node['node_num'], 'isRelated', rel_node_num)
            taxo_edgelist.add(edge)
    

In [157]:
# goes over the OG taxonomy and refactors into easier to parse file and
# other helper files

def refactorTaxonomy(parent, children):
    
    # more type inconsistencies!! can be str or list 
    if not isinstance(children, list):
        children = [children]
    for child in children:
#         print(child)
        global node_num
        node_num += 1
#         print(node_num)
#         if (node_num == 436):
#             print(child)
#             print(children)
#             print(parent)
        child_code = child['attributes']['code']
        child_info = child['children'][0]
        child_node = parseNode(child_info, child_code)
        
        taxo_nodes[node_num] = child_node
        
        if 'record' not in child_info:
            parent['child_nodes'].append(child_node)
            # create edge 3-tuple 
            edge1 = (parent['node_num'], 'hasSubType', child_node['node_num'])
            edge2 = edge = (child_node['node_num'], 'isSubType', parent['node_num'])
            taxo_edgelist.add(edge1)
            taxo_edgelist.add(edge2)
            
        else:
            child_node['child_nodes'] = refactorTaxonomy(
                child_node, child_info['record'])
            
        
    return parent['child_nodes']

In [158]:
taxo_root_name = taxonomy_dict['attributes']['name']
refactor_taxo = {'name': taxo_root_name, 'child_nodes': [], 'node_num': 0}
general_categories = taxonomy_dict['children'][0]['record']

In [159]:
node_num = 0
refactor_taxo['child_nodes'] = refactorTaxonomy(parent=refactor_taxo, children=general_categories)

# need to add taxonomy term relations after parsing them all
for node_num in taxo_nodes:
    node = taxo_nodes[node_num]
    addRelatedNodes(node)

In [171]:
save_to_json(refactor_taxo, './data/211_taxonomy_kg_skel.json')
save_to_json(taxo_nodes, './data/211_taxonomy_nodes.json')
save_to_csv(taxo_edgelist, './data/211_taxonomy_edge_list.csv')
save_to_json(node_num_to_code, './data/node_num_to_code.json')
save_to_json(code_to_node_num, './data/code_to_node_num.json')
save_to_csv(list(taxo_refs), './data/211_taxonomy_term_references.csv')
save_to_csv(taxo_facets, './data/211_taxonomy_facets.csv')

In [160]:
print(len(taxo_nodes))
print(len(taxo_edgelist))
print(len(taxo_facets))
print(len(taxo_refs))

9974
38218
5
27696


{'Named Programs', 'Target', 'Service', 'Modality/Delivery Format', 'Organization/Facility Type'}


In [172]:
f_cnt = {}
for f in taxo_facets:
    f_cnt[f] = 0
    for node_num in taxo_nodes:
        node = taxo_nodes[node_num]
        if f in node['facet']:
            f_cnt[f] = f_cnt[f] + 1
            if (f == 'Target'): print(node['name'])

Target Populations
Agencies/Organizations as Recipients
Age Groups
Adults
Older Adults
Young Adults
Youth
Adolescents
Children
Preadolescent Children
Preschool Age Children
Primary School Age Children
Emancipated Minors
Infants/Toddlers
Infants
Toddlers
Latchkey Children
Newborns
Unaccompanied Minors
Benefits Recipients
Caregiver Subsidy Recipients
CHIP Recipients
Community Voicemail Recipients
Disability Benefit Recipients
Eligible Benefits Applicants
Ex-Public Assistance Recipients
Farm Aid Recipients
Food Stamps/SNAP Recipients
General Relief Recipients
In Home Supportive Services Subsidy Recipients
Long Term Welfare Recipients
Medicaid Recipients
Medicare Beneficiaries
Refugee/Entrant Cash Assistance Recipients
Social Security Retirement Recipients
SSI Recipients
State/Local Health Insurance Recipients
State Medicaid Waiver Service Recipients
TANF Recipients
Unemployment Recipients
Workers Compensation Recipients
Caregivers
Formal Caregivers
Informal Caregivers
Long Distance Caregi

In [170]:
print(f_cnt)

{'Named Programs': 217, 'Target': 2327, 'Service': 5934, 'Modality/Delivery Format': 65, 'Organization/Facility Type': 1431}
