In [2]:
import pandas as pd
import requests
import sqlite3
import bs4
import os

"""
Author    : Joe Stephenson
Date      : April 19, 2020
Program   : Automating the collection of the FHIR medical standard.

I am proud of some of the code I have written. This, not so much.

But it gets the job done. It produces the most recent list of FHIR Resources -- well, until they change the website ever so slightly.
"""



def create_resource_row(temp_group_count, temp_category_count, temp_resource_name, temp_resource_description, temp_resource_url, c):
    """Insert one resource record into the 'FHIR-Resources-List' table.

    The values are bound as SQL parameters instead of being spliced into the
    statement text, so apostrophes or other SQL metacharacters in the scraped
    text can neither break the statement nor inject SQL.

    Args:
        temp_group_count: Group label of the resource (stored as text).
        temp_category_count: Category label of the resource (stored as text).
        temp_resource_name: Resource name.
        temp_resource_description: Resource description.
        temp_resource_url: URL of the resource page.
        c: Cursor (or connection) exposing ``execute``.
    """
    # str() mirrors the original conversion so every column is stored as TEXT.
    values = (
        str(temp_group_count),
        str(temp_category_count),
        str(temp_resource_name),
        str(temp_resource_description),
        str(temp_resource_url),
    )
    # The table name is a fixed constant; double quotes are the standard SQL
    # identifier quoting and resolve to the same table as the single-quoted form.
    c.execute('INSERT INTO "FHIR-Resources-List" VALUES (?, ?, ?, ?, ?)', values)
                  
def create_resource_table(c):
    """Drop and recreate the 'FHIR-Resources-List' table.

    Any existing table of that name (and its rows) is discarded, then an
    empty table with five TEXT columns is created.

    Args:
        c: Cursor (or connection) exposing ``execute``.
    """
    table_ref = "'FHIR-Resources-List'"
    column_spec = """(Grouping TEXT,
                          Category TEXT,
                          Name TEXT,
                          Description TEXT,
                          URL TEXT
                        )"""
    statements = (
        "DROP TABLE IF EXISTS " + table_ref,
        "CREATE TABLE " + table_ref + " " + column_spec,
    )
    # Order matters: the drop must run before the create.
    for statement in statements:
        c.execute(statement)

def get_index_table():
    """Download the FHIR resource index page and return its tables as a soup.

    Returns:
        A ``bs4.BeautifulSoup`` document containing only the markup of the
        ``<table>`` elements found on the resource list page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'https://www.hl7.org/fhir/resourcelist.html'
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    # Fail here rather than parsing an error page and crashing later on.
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.content, 'html.parser')
    # Join the table markup directly: str() of the result list would add
    # stray "[", ", " and "]" text nodes to the re-parsed document.
    tables_markup = "".join(str(found) for found in soup.find_all('table'))
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and bs4 does not warn).
    table = bs4.BeautifulSoup(tables_markup, 'html.parser')
    return table

def resource_categories_dict(table):
    """Build a lookup of group and category labels from the index table.

    Every element with class 'frm-group' that contains at least one
    'frm-category' element is numbered from 1 in document order. For group
    number ``g`` the result holds:

    * ``"group-<g>"``: text of the group's 'frm-group rotate' element
    * ``"category-<i>-<g>"``: text of its i-th category, with i counted from 0

    Args:
        table: Parsed document exposing ``find_all``.

    Returns:
        dict mapping the keys described above to label text.
    """
    labels = {}
    group_number = 0
    for group_cell in table.find_all(class_='frm-group'):
        categories = group_cell.find_all(class_="frm-category")
        heading = group_cell.find(class_="frm-group rotate")
        # Groups without categories are skipped and do not consume a number.
        if categories == []:
            continue
        group_number += 1
        labels[f"group-{group_number}"] = heading.text
        for position, category in enumerate(categories):
            labels[f"category-{position}-{group_number}"] = category.text
    return labels

def resources_list(grouping_category, table, connection):
    """Collect every resource on the index page into the database and a CSV.

    Recreates the 'FHIR-Resources-List' table, walks the first five
    'frm-contents' columns of ``table``, inserts one row per resource link
    and finally writes all rows to ``data/FHIR-Resources.csv``.

    Args:
        grouping_category: Mapping with "group-<g>" and "category-<i>-<g>"
            keys giving the label for each group / category position.
        table: Parsed index document exposing ``find_all``.
        connection: Cursor (or connection) exposing ``execute``; used for
            both the table creation and the row inserts.

    Raises:
        IndexError: If the page has fewer than five 'frm-contents' elements.
        KeyError: If a group/category position has no label in
            ``grouping_category`` or a link lacks a 'title'/'href' attribute.
    """
    column_content = table.find_all(class_='frm-contents')
    # Use the handle that was passed in. The original used the module-level
    # global ``c`` here, which only worked when the caller happened to pass
    # that very same object.
    create_resource_table(connection)
    resources_csv = []
    # The page layout is assumed to have exactly five group columns.
    for group_count in range(0, 5):
        category_count = 0
        # Iterating a tag yields its direct children, including bare text nodes.
        for resource_tile in column_content[group_count]:
            if not isinstance(resource_tile, bs4.element.Tag):
                continue
            for list_item in resource_tile.find_all('li'):
                for temp_resource in list_item.find_all('a'):
                    # Skip the maturity-level badge links and the "N" marker
                    # links; only the resource's own link is recorded.
                    if temp_resource['title'] == "Maturity Level" or temp_resource.text == "N":
                        continue
                    temp_group_count = grouping_category["group-" + str(group_count + 1)]
                    temp_category_count = grouping_category[
                        "category-" + str(category_count) + "-" + str(group_count + 1)
                    ]
                    temp_resource_name = temp_resource.text
                    # Apostrophes are still removed so the stored descriptions
                    # stay the same as in earlier runs.
                    temp_resource_description = temp_resource['title'].replace("'", "")
                    temp_resource_url = "https://www.hl7.org/fhir/" + temp_resource['href']
                    resources_csv.append([temp_group_count,
                                          temp_category_count,
                                          temp_resource_name,
                                          temp_resource_description,
                                          temp_resource_url])
                    create_resource_row(temp_group_count,
                                        temp_category_count,
                                        temp_resource_name,
                                        temp_resource_description,
                                        temp_resource_url,
                                        connection)
            # Every tag child counts as one category slot, with or without links.
            category_count += 1
    # Passing the column names to the constructor also works when no rows
    # were found; assigning ``.columns`` afterwards raises on an empty frame.
    resources_csv = pd.DataFrame(
        resources_csv,
        columns=['Group', 'Category', "Resource", "Description", "URL"],
    )
    # Make sure the output folder exists instead of relying on the caller.
    os.makedirs("data", exist_ok=True)
    resources_csv.to_csv("data/FHIR-Resources.csv")
    

def create_entry_row(c, row, resource_name):
    """Insert one element-definition row into the table named ``resource_name``.

    The six values are bound as SQL parameters, so quotes in any column are
    stored verbatim. The original built the statement by string concatenation:
    it stripped apostrophes from the description only (its double-quote
    stripping was overwritten by the next line) and failed on an apostrophe
    in any other column.

    Args:
        c: Cursor (or connection) exposing ``execute``.
        row: Sequence whose first six items are Path, Name, Flag, Card, Type
            and Description.
        resource_name: Name of the target table.

    Raises:
        IndexError: If ``row`` has fewer than six items.
    """
    # Table names cannot be bound as parameters; quote the name as an SQL
    # identifier and double any embedded double quote.
    table_identifier = '"' + str(resource_name).replace('"', '""') + '"'
    # str() mirrors the original conversion so every column is stored as TEXT.
    values = tuple(str(row[position]) for position in range(6))
    c.execute(
        "INSERT INTO " + table_identifier + " VALUES (?, ?, ?, ?, ?, ?)",
        values,
    )
                  
def create_entry_table(c, resource_name):
    """Drop and recreate the per-resource table named ``resource_name``.

    Any existing table of that name (and its rows) is discarded, then an
    empty table with six TEXT columns is created.

    Args:
        c: Cursor (or connection) exposing ``execute``.
        resource_name: Name of the table; it comes from scraped page text,
            so it is quoted as an SQL identifier rather than spliced in raw.
    """
    # Table names cannot be bound as parameters; quote the name as an SQL
    # identifier and double any embedded double quote so a stray quote in
    # the name can neither break the statement nor inject SQL.
    table_identifier = '"' + str(resource_name).replace('"', '""') + '"'
    c.execute("DROP TABLE IF EXISTS " + table_identifier)
    create_string = "CREATE TABLE " + table_identifier
    create_string += """( Path TEXT,
                          Name TEXT,
                          Flag TEXT,
                          Card TEXT,
                          Type TEXT,
                          Description TEXT
                        )"""
    c.execute(create_string)

def get_resource_table(resource_url, resource_name):
    """Download a resource page and extract rows from its first layout table.

    The page at ``resource_url`` is fetched and parsed, and the first
    ``<table>`` with border/cellpadding/cellspacing all "0" is walked row by
    row (the first and last ``<tr>`` are skipped).

    Args:
        resource_url: URL of the resource page to fetch.
        resource_name: Not used inside this function.

    Returns:
        A list of row lists. Rows built from a path-bearing ``<tr>`` hold the
        path followed by the text of each ``<td>``; rows built from a
        path-less ``<tr>`` hold
        [path, name, flag, card, type, description] derived from the most
        recent path-bearing row. Rows whose first item ends in "_" are
        omitted.
    """
    r = requests.get(resource_url)
    soup = bs4.BeautifulSoup(r.content, 'html.parser')
    # limit=1: only the first matching table is considered.
    table = soup.find_all("table", limit = 1, attrs={'border': '0', "cellpadding": "0", "cellspacing": "0"})
    table_list = []
    for element in table:
        resource_tr = element.find_all("tr")
        # Skip the first and the last <tr> (presumably header and footer rows
        # -- TODO confirm against the page layout).
        for tr_tags in resource_tr[1:-1]:
            count = 1  # 1-based index of the <td> currently being processed
            td_tags = tr_tags.find_all("td")
            path_present = False
            row_list = []
            # NOTE(review): this loop variable shadows the outer ``element``;
            # harmless only because the outer loop runs at most once (limit=1).
            for element in td_tags:
                if count == 1:
                    # The first cell's link decides whether this row carries
                    # its own path: the part after '#' in the href is taken as
                    # the path, unless the link points into datatypes.html.
                    try:
                        temp_a = element.find("a")
                        temp_href = temp_a["href"]
                        temp_href = temp_href.split("#")
                        if temp_href[0] != "datatypes.html":
                            stored_path = temp_href[1]
                            path_present = True
                            row_list.append(stored_path)
                    # NOTE(review): bare except -- reached when the cell has no
                    # <a>, the <a> has no href, or the href has no '#', but it
                    # also hides any other error raised in the try body.
                    except:
                        path_present = False
                if path_present == True:
                    row_list.append(str(element.text))
                    # Remember flag / card / description so that following
                    # path-less rows can reuse them.
                    if count == 2:
                        stored_flag = element.text
                    elif count == 3:
                        stored_card = element.text
                    elif count == 5:
                        stored_description = element.text
                else:
                    # Path-less row: synthesise a row from the last stored
                    # path plus this cell's text as the type (presumably the
                    # per-type variants of a choice element -- TODO confirm).
                    # Each non-empty cell overwrites row_list, so the last
                    # non-empty cell of the row wins.
                    # NOTE(review): stored_path / stored_flag / stored_card /
                    # stored_description are unbound if no path-bearing row
                    # has been seen yet.
                    if element.text != "":
                        temp_type = element.text
                        temp_path = stored_path.split("_")
                        temp_path = temp_path[0] + temp_type.capitalize()
                        temp_name = temp_path.split(".")
                        temp_name = temp_name[-1]
                        row_list = [temp_path, temp_name, stored_flag, stored_card, temp_type, stored_description]
                count += 1
            # Rows whose path ends in "_" are dropped.
            # NOTE(review): raises IndexError when row_list is empty (a
            # path-less row whose cells are all empty) or its path is "".
            if row_list[0][-1] != "_":
                table_list.append(row_list)     
    return table_list

# Output folder for the per-resource CSV files. exist_ok avoids the
# check-then-create race of testing os.path.exists first.
os.makedirs('data/csv_files', exist_ok=True)

database_directory = './FHIR-Resources.db'
conn = sqlite3.connect(database_directory)
try:
    # ``c`` stays a module-level name because other code in this file may
    # refer to it.
    c = conn.cursor()

    # Step 1: scrape the resource index into the database and a CSV.
    resource_table = get_index_table()
    grouping_category = resource_categories_dict(resource_table)
    resources_list(grouping_category, resource_table, c)

    # Step 2: fetch every listed resource page and store its element rows.
    resources_found = pd.read_csv("data/FHIR-Resources.csv")
    for index, row in resources_found.iterrows():
        temp_url, temp_name = row['URL'], row['Resource']
        temp_table = get_resource_table(temp_url, temp_name)

        # Passing the column names to the constructor also works when the
        # page yielded no rows; assigning ``.columns`` afterwards raises on
        # an empty frame.
        pandas_table = pd.DataFrame(
            temp_table,
            columns=['Path', 'Name', "Flag", "Card", "Type", "Description"],
        )
        pandas_table.to_csv("data/csv_files/" + temp_name + ".csv")

        create_entry_table(c, temp_name)
        # A separate name avoids shadowing the outer ``row`` from iterrows().
        for entry_row in temp_table:
            # Best effort: one bad row is reported and skipped so it does not
            # abort the whole resource.
            try:
                create_entry_row(c, entry_row, temp_name)
            except Exception as e:
                print(e)
        print("Found data for", temp_name)

    conn.commit()
finally:
    # Always release the database handle, even when a download or parse
    # step raised part-way through.
    conn.close()
        

Found data for CapabilityStatement
Found data for StructureDefinition
Found data for ImplementationGuide
Found data for SearchParameter
Found data for MessageDefinition
Found data for OperationDefinition
Found data for CompartmentDefinition
Found data for StructureMap
Found data for GraphDefinition
Found data for ExampleScenario
Found data for CodeSystem
Found data for ValueSet
Found data for ConceptMap
Found data for NamingSystem
Found data for TerminologyCapabilities
Found data for Provenance
Found data for AuditEvent
Found data for Consent
Found data for Composition
Found data for DocumentManifest
Found data for DocumentReference
Found data for CatalogEntry
Found data for Basic
Found data for Binary
Found data for Bundle
Found data for Linkage
Found data for MessageHeader
Found data for OperationOutcome
Found data for Parameters
Found data for Subscription
Found data for Patient
Found data for Practitioner
Found data for PractitionerRole
Found data for RelatedPerson
Found data for P