In [33]:
import requests
from pyquery import PyQuery as pq
from slugify import slugify

### Load and parse data from https://company.auntbertha.com/the-open-eligibility-project

In [34]:
URL = 'https://company.auntbertha.com/the-open-eligibility-project/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0'
}

In [35]:
page = pq(requests.get(URL, headers=HEADERS).text)

In [36]:
container = pq(page('.entry-content .container'))

In [37]:
names = dict()

def get_item(*parts):
    if len(parts) == 1:
        name = parts[0]
        prefix = ''
    else:
        name = parts[1]
        prefix = parts[0] + ':'
    slug = slugify(name, separator='_')
    ret = dict(name=name, slug=prefix + slug)
    names.setdefault(name, []).append(ret)
    return ret

In [38]:
def parse_ul(section, element, level):
    items = section.setdefault('items', [])
    for el in element.children():
        if el.tag == 'li':
            header = el.text.strip().replace("\u2019", "'")
            item = get_item(section['slug'], header)
            print('>>'*level, item['slug'])
            items.append(item)
            ul = pq(el)('ul')
            if ul:
                item['items'] = []
                parse_ul(item, pq(ul), level+1)

In [39]:
sections = []
section = None
subsection = None
stack = []
for child in container.children():
    if child.tag == 'h3':
        if len(sections) == 2:
            break
        header = pq(child).text().strip()
        if header:
            section = get_item(header)
            sections.append(section)
            subsection = None
            print('===', section['slug'])
    elif child.tag == 'p':
        header = pq(child)('strong').text().strip()
        description = pq(child).text()
        if section is not None:
            if header:
                subsection = get_item(section['slug'], slugify(header))
                section.setdefault('items', []).append(subsection)
#                 print('>>', header)
                print('>>', subsection['slug'])
            elif description:
                current = subsection or section 
                current['description'] = description
                print('ddd:', description)

    elif child.tag == 'ul':
        current = subsection or section
        if current is not None:
            parse_ul(current, pq(child), 2)


=== human_services
>> human_services:food
ddd: Services for meals, food pantries, help paying for food, food delivery, food benefits, and nutrition support.
>>>> human_services:food:community_gardens
>>>> human_services:food:emergency_food
>>>> human_services:food:food_delivery
>>>> human_services:food:food_pantry
>>>> human_services:food:meals
>>>> human_services:food:help_pay_for_food
>>>>>> human_services:food:help_pay_for_food:government_food_benefits
>>>> human_services:food:nutrition_education
>> human_services:housing
ddd: Services for emergency, short- term and long- term housing, housing advice, help finding housing, and paying for housing.
>>>> human_services:housing:temporary_shelter
>>>>>> human_services:housing:temporary_shelter:weather_relief
>>>> human_services:housing:help_find_housing
>>>> human_services:housing:help_pay_for_housing
>>>>>> human_services:housing:help_pay_for_housing:help_pay_for_utilities
>>>>>> human_services:housing:help_pay_for_housing:help_pay_for_

>>>>>> human_situations:health:post_treatment
>>>>>> human_situations:health:facing_end_of_life
>>>>>> human_situations:health:all_cancer_types
>>>>>> human_situations:health:bladder_cancer
>>>>>> human_situations:health:brain_stem_glioma
>>>>>> human_situations:health:brain_tumor
>>>>>> human_situations:health:breast_cancer
>>>>>> human_situations:health:central_nervous_system
>>>>>> human_situations:health:cervical_cancer
>>>>>> human_situations:health:colon_cancer
>>>>>> human_situations:health:colorectal_cancer
>>>>>> human_situations:health:desmoid_tumors
>>>>>> human_situations:health:ductal_carcinoma_in_situ_dcis
>>>>>> human_situations:health:endomentrial_cancer
>>>>>> human_situations:health:gastrointestinal_stromal_tumors_gist
>>>>>> human_situations:health:germ_cell_tumor
>>>>>>>> human_situations:health:germ_cell_tumor:central_nervous_system_germ_cell_tumor
>>>>>>>> human_situations:health:germ_cell_tumor:ovarian_germ_cell_tumor
>>>>>>>> human_situations:health:germ_cell_tu

### Load and parse data from https://github.com/auntbertha/openeligibility/blob/master/taxonomy

In [40]:
from lxml import etree
import requests
from io import StringIO

content = open('../taxonomy', 'rb').read()
content = content.replace(b'version="2.0"', b'version="1.0"')
tree = etree.fromstring(content)

checks_map = dict()

for top in tree.iterchildren():
    if top.tag in ('services', 'situations'):
        checks = checks_map.setdefault(top.tag, [])
        desc=dict()
        for el1 in top.iterchildren():
            if el1.tag == 'descriptions':
                for el2 in el1.iterchildren():
                    desc[el2.attrib['id']] = el2.text.strip()
        for el1 in top.iterchildren():
            if el1.tag == 'top_level':
                checks.append(([el1.attrib['title']], desc.get(el1.attrib['id'])))
#                 out.write(el1.attrib['title'] + ',,,"' + desc.get(el1.attrib['id'], '') + '"\n')
                for el2 in el1.iterchildren():
                    if el2.tag == 'second_level':
                        checks.append(([el1.attrib['title'], el2.attrib['title']],))
                        for el3 in el2.iterchildren():
                            if el3.tag == 'third_level':
                                checks.append(([el1.attrib['title'], el2.attrib['title'], el3.attrib['title']],))
                                for el4 in el3.iterchildren():
                                    if el4.tag == 'fourth_level':
                                        checks.append(([el1.attrib['title'], el2.attrib['title'], el3.attrib['title'], el3.attrib['title']],))

print(checks)

[(['General'], None), (['General', 'Anyone in Need'],), (['Age Group'], None), (['Age Group', 'Adults'],), (['Age Group', 'Teens'],), (['Age Group', 'Children'],), (['Age Group', 'Seniors'],), (['Armed Forces'], None), (['Armed Forces', 'Active Duty'],), (['Armed Forces', 'National Guard'],), (['Armed Forces', 'Veterans'],), (['Citizenship'], None), (['Citizenship', 'Immigrants'],), (['Citizenship', 'Refugees'],), (['Citizenship', 'Undocumented'],), (['Criminal History'], None), (['Criminal History', 'Ex-Offenders'],), (['Disability'], None), (['Disability', 'Learning Disability'],), (['Disability', 'Developmental Disability'],), (['Disability', 'Physical Disability'],), (['Disability', 'Intellectual Disability'],), (['Disability', 'Mentally Incapacitated'],), (['Disability', 'Limited Mobility'],), (['Disability', 'Hearing Impairment'],), (['Disability', 'Visual Impairment'],), (['Disability', 'Mental Illness'],), (['Education'], None), (['Education', 'Dropouts'],), (['Education', 'Stu

### Consolidate the two sources

In [41]:
for tag, checks in checks_map.items():
    tree = sections[dict(services=0, situations=1)[tag]]
    for check in checks:
        node = tree
        for i, c in enumerate(check[0]):
            nodes = [n for n in node.setdefault('items', []) if n['name'] == c]
            if len(nodes) == 0:
                print('MISSING', tag, check)
                _node = get_item(node['slug'], c)
                node['items'].append(_node)
                node = _node
            else:
                node = nodes[0]
            

MISSING services (['Emergency'], 'Services for psychiatric emergencies, emergency financial assistance, immediate safety needs, disaster preparedness and disaster response.')
MISSING services (['Emergency', 'Disaster Response'],)
MISSING services (['Emergency', 'Emergency Payments'],)
MISSING services (['Emergency', 'Emergency Payments', 'Help Pay for Food'],)
MISSING services (['Emergency', 'Emergency Payments', 'Help Pay for Healthcare'],)
MISSING services (['Emergency', 'Emergency Payments', 'Help Pay for Housing'],)
MISSING services (['Emergency', 'Emergency Payments', 'Help Pay for Gas'],)
MISSING services (['Emergency', 'Emergency Payments', 'Help Pay for School'],)
MISSING services (['Emergency', 'Emergency Payments', 'Help Pay for Utilities'],)
MISSING services (['Emergency', 'Emergency Food'],)
MISSING services (['Emergency', 'Emergency Shelter'],)
MISSING services (['Emergency', 'Help Find Missing Persons'],)
MISSING services (['Emergency', 'Immediate Safety'],)
MISSING servi

### Check additions from openreferral/openeligibility and ensure we're not missing anything

In [42]:
import io
import csv

openreferral_additions = """
1391,"Adoption Counseling",1271,"Adoption & Foster Care","Open Eligibility"
1392,"Anger Management",1420,"Mental Health Care","Open Eligibility"
1393,"Before School Care",1277,"Daytime Care","Open Eligibility"
1394,"Business Loans",1415,"Loans","Open Eligibility"
1395,"Case Management",1292,"Navigating the System","Open Eligibility"
1396,"Community Support Services",1108,"Care","Open Eligibility"
1397,"Computer or Internet Access",1396,"Community Support Services","Open Eligibility"
1398,"Drug Testing",1178,"Addiction & Recovery","Open Eligibility"
1399,"Early Childhood Intervention",1219,"Prevent & Treat","Open Eligibility"
1400,"Efficiency Upgrades",1141,"Maintenance & Repairs","Open Eligibility"
1401,"Efficient Appliances",1163,"Home Goods","Open Eligibility"
1402,"Exercise & Fitness",1206,"Medical Care","Open Eligibility"
1403,"Family Counseling",1420,"Mental Health Care","Open Eligibility"
1404,"Fertility",1431,"Sexual & Reproductive Health","Open Eligibility"
1405,"Government Food Benefits",1405,"Government Food Benefits","Open Eligibility"
1406,"Group Therapy",1220,"Counseling","Open Eligibility"
1407,"Head Start",1277,"Daytime Care","Open Eligibility"
1408,"Health & Safety",1141,"Maintenance & Repairs","Open Eligibility"
1409,"Help Find Healthcare",1206,"Medical Care","Open Eligibility"
1410,"Help Pay for Car",1105,"Transit","Open Eligibility"
1411,"Help Pay for Internet or Phone",1116,"Help Pay for Housing","Open Eligibility"
1412,"Home Fuels",1163,"Home Goods","Open Eligibility"
1413,"Hospital Treatment",1219,"Prevent & Treat","Open Eligibility"
1414,"Individual Counseling",1220,"Counseling","Open Eligibility"
1415,"Loans",1107,"Money","Open Eligibility"
1416,"Meals",1102,"Food","Open Eligibility"
1417,"Medication Management",1219,"Prevent & Treat","Open Eligibility"
1418,"Medications for Addiction",1178,"Addiction & Recovery","Open Eligibility"
1419,"Medications for Mental Health",1420,"Mental Health Care","Open Eligibility"
1420,"Mental Health Care",1106,"Health","Open Eligibility"
1421,"Occupational Therapy",1219,"Prevent & Treat","Open Eligibility"
1422,"Peer Recovery Coaching",1303,"Support Network","Open Eligibility"
1423,"Personal Loans",1415,"Loans","Open Eligibility"
1424,"Personal Safety",1104,"Goods","Open Eligibility"
1425,"Pest Control",1141,"Maintenance & Repairs","Open Eligibility"
1426,"Physical Safety",1108,"Care","Open Eligibility"
1427,"Postnatal Care",1441,"Women's Health","Open Eligibility"
1428,"Primary Care",1206,"Medical Care","Open Eligibility"
1429,"Prosthesis",1169,"Medical Supplies","Open Eligibility"
1430,"Safety Education",1426,"Physical Safety","Open Eligibility"
1431,"Sexual & Reproductive Health",1219,"Prevent & Treat","Open Eligibility"
1432,"Skilled Nursing",1206,"Medical Care","Open Eligibility"
1433,"Speech Therapy",1219,"Prevent & Treat","Open Eligibility"
1434,"STD/STI Treatment & Prevention",1431,"Sexual & Reproductive Health","Open Eligibility"
1435,"Substance Abuse Counseling",1220,"Counseling","Open Eligibility"
1436,"Supplies for School",1163,"Home Goods","Open Eligibility"
1437,"Support & Service Animals",1219,"Prevent & Treat","Open Eligibility"
1438,"Temporary Shelter",1426,"Physical Safety","Open Eligibility"
1439,"Vision Care",1106,"Health","Open Eligibility"
1440,"Weather Relief",1396,"Community Support Services","Open Eligibility"
1441,"Women's Health",1431,"Sexual & Reproductive Health","Open Eligibility"
""".strip()

r = csv.reader(io.StringIO(openreferral_additions))

In [43]:
for line in r:
    name=line[1]
    assert name in names, "NAME: " + name

### Mark equivalent slugs

In [44]:
for name, sects in names.items():
    slugs = sorted(set(s['slug'] for s in sects))
    if len(slugs) > 1:
        for section in sects:
            section['related'] = [s for s in slugs if section['slug'] != s]

### Save the result in YAML format

In [45]:
import yaml
yaml.dump(sections, open('taxonomy.yaml', 'w'), sort_keys=False, width=240)

In [46]:
sections

[{'name': 'Human Services',
  'slug': 'human_services',
  'items': [{'name': 'food',
    'slug': 'human_services:food',
    'description': 'Services for meals, food pantries, help paying for food, food delivery, food benefits, and nutrition support.',
    'items': [{'name': 'Community Gardens',
      'slug': 'human_services:food:community_gardens'},
     {'name': 'Emergency Food',
      'slug': 'human_services:food:emergency_food',
      'related': ['human_services:care:physical_safety:emergency_food',
       'human_services:emergency:emergency_food']},
     {'name': 'Food Delivery', 'slug': 'human_services:food:food_delivery'},
     {'name': 'Food Pantry', 'slug': 'human_services:food:food_pantry'},
     {'name': 'Meals', 'slug': 'human_services:food:meals'},
     {'name': 'Help Pay for Food',
      'slug': 'human_services:food:help_pay_for_food',
      'items': [{'name': 'Government Food Benefits',
        'slug': 'human_services:food:help_pay_for_food:government_food_benefits',
    