In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import os, sys, time

# url for 510k SaMD details page
eq_url = 'https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID='
# url for De Novo SaMD details page
denovo_url = 'https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/denovo.cfm?id='
# url for product codes
product_code_url = 'https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpcd/classification.cfm?start_search=1&amp;productcode='

In [3]:
# download base submission data (requires manual table extraction to kickstart)
#   - TODO: provide instructions for table extraction.

cols = []
data = []

soup = BeautifulSoup(open('aiml_samd_09222021.htm'))
for header in soup.findAll("th"):
    cols.append(header.string.lower().replace(' ', "_")) 

cols.append('submission_type')

for i, row in enumerate(soup.findAll('tr')):
    if i == 0: continue
    row_data = []
    for cell in row.findAll('td'):
        row_data.append(cell.string)

    if row_data[1].startswith('K'):
        row_data.append('510k')
    elif row_data[1].startswith('DEN'):
        row_data.append('DENOVO')
    else:
        row_data.append(None)
        
    data.append(row_data)

df_submissions = pd.DataFrame(data, columns=cols)

product_codes = df_submissions.primary_product_code.unique().tolist()
print(product_codes)

df_submissions.head(10)

['JAK', 'LLZ', 'MXD', 'PIB', 'QPF', 'QAS', 'KPS', 'QIH', 'QKB', 'MUJ', 'QFM', 'QNP', 'MQB', 'QOK', 'OEB', 'LNH', 'QME', 'POK', 'QNV', 'NQQ', 'QNL', 'QDQ', 'POV', 'JOY', 'DQD', 'QJU', 'DQK', 'PRH', 'IYN', 'MLO', 'QBS', 'IYO', 'DSI', 'DPS', 'IYE', 'MWI', 'BZG', 'QEK', 'HAW', 'QJB', 'MRZ', 'MUD', 'GKZ', 'JAA', 'PIW', 'GXY', 'PLB', 'PJA', 'JFY', 'PBH', 'QER', 'QAQ', 'PPU', 'OZE', 'DQA', 'DXH', 'NDC', 'QEA', 'POS', 'PCS', 'OMB', 'GWN', 'JIL', 'OLO', 'QCC', 'NAY', 'OBH', 'PNN', 'DSB', 'DRG', 'DSH', 'DPT', 'MNR', 'PTA', 'OLZ', 'NBW', 'MYN', 'PBZ', 'OWB', 'NFJ', 'KPR', 'PEX', 'OIW', 'MHX']


Unnamed: 0,date_of_final_decision,submission_number,device,company,panel_(lead),primary_product_code,submission_type
0,06/17/2021,K203514,Precise Position,"Philips Healthcare (Suzhou) Co., Ltd.",Radiology,JAK,510k
1,06/16/2021,K202718,Qmenta Care Platform Family,"Mint Labs, Inc., D/B/A. QMENTA",Radiology,LLZ,510k
2,06/11/2021,K210484,"LINQ II Insertable Cardiac Monitor, Zelda AI E...","Medtronic, Inc.",Cardiovascular,MXD,510k
3,06/10/2021,K203629,IDx-DR,Digital Diagnostics Inc.,Ophthalmic,PIB,510k
4,06/02/2021,DEN200069,Cognoa Asd Diagnosis Aid,"Cognoa, Inc.",Neurology,QPF,DENOVO
5,05/19/2021,K210237,CINA CHEST,Avicenna.AI,Radiology,QAS,510k
6,04/30/2021,K210001,HYPER AiR,"Shanghai United Imaging Healthcare Co.,Ltd.",Radiology,KPS,510k
7,04/23/2021,K203314,Cartesion Prime (PCD-1000A/3) V10.8,Canon Medical Systems Corporation,Radiology,KPS,510k
8,04/23/2021,K203502,MEDO-Thyroid,MEDO DX Pte. Ltd.,Radiology,QIH,510k
9,04/21/2021,K210556,Preview Shoulder,Genesis Software Innovations,Radiology,QIH,510k


In [8]:
# download submission pages
for i, row in df_submissions.iterrows():
    if row.submission_type == '510k':
        url = eq_url + row.submission_number
    if row.submission_type == 'DENOVO':
        url = denovo_url + row.submission_number 
    
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    
    data_path = os.path.join('data', 'submission_html')
    os.makedirs(data_path, exist_ok=True)

    with open(os.path.join(data_path, f'{row.submission_number}.htm'), 'w') as f:
        f.write(str(soup))

    print(url, f'{i+1}/{len(df_submissions)}')

https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K203514 0/343
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K202718 1/343
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K210484 2/343
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K203629 3/343
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/denovo.cfm?id=DEN200069 4/343
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K210237 5/343
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K210001 6/343
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K203314 7/343
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K203502 8/343
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K210556 9/343
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K203610 10/343
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K203469 11/343
https://w

In [47]:
# extract submission tables
def extract_fda_data_table(data_path, id, table_chars):
    soup = BeautifulSoup(open(os.path.join(data_path, f'{id}.htm')))
    t = soup.find('table', table_chars)

    if t is not None:
        with open(os.path.join(os.path.join(data_path, f'{id}_table.htm')), 'w') as f:
            f.write(str(t.prettify()))

data_path = os.path.join('data', 'submission_html')
os.makedirs(data_path, exist_ok=True)
for i, row in df_submissions.iterrows():
    if row.submission_type == 'DENOVO':
        extract_fda_data_table(
            data_path, 
            row.submission_number, 
            {'style': 'text-transform: capitalize; table-layout:fixed; width:500px'}
        )
    if row.submission_type == '510k':
        extract_fda_data_table(
            data_path,
            row.submission_number,
            {'style': 'text-transform: none'}
        )

data_path = os.path.join('data', 'product_code_html')
os.makedirs(data_path, exist_ok=True)
for code in product_codes: 
    extract_fda_data_table(
        data_path, 
        code, 
        {'width': '600', 'cellspacing': '5'}
    )

In [9]:
# generate full submission table

# extract submission headers

df_submission_denovo = df_submissions[df_submissions.submission_type == 'DENOVO']

submission_denovo_headers = set()
for i, row in df_submission_denovo.iterrows():
    soup = BeautifulSoup(open(os.path.join('data', 'submission_html', f'{row.submission_number}_table.htm')))
    headers = []
    for header in soup.findAll('th'):
        headers.append(header.string.replace('\n', '').strip())
    submission_denovo_headers = submission_denovo_headers | set(headers)

submission_denovo_headers = list(submission_denovo_headers)

denovo_data = []
for i, sub in df_submission_denovo.iterrows():
    soup = BeautifulSoup(open(os.path.join('data', 'submission_html', f'{sub.submission_number}_table.htm')))
    obj = { key: None for key in submission_denovo_headers }

    table = soup.table
    # print(table)
    for row in table.findAll('tr'):
        try: 
            header = row.th.string.replace('\n', '').strip()
            data = row.td
            
            if data.a is not None:
                link = data.a.get('href')
                url = requests.compat.urlparse(link)

                print(url)

                data = data = f'[{data.get_text()}]'

            elif data.table is not None:
                # data = ' '.join([x.text for x in data.table.findAll('td')])
                data = data.get_text()
            else:
                data = data.get_text()

            data = data \
                .replace('\r', '') \
                .replace('\n', '') \
                .replace('\t', '') \
                .strip()

            obj[header] = data
        except: continue
    
    denovo_data.append(obj)
    
df_denovo = pd.DataFrame.from_dict(denovo_data)
df_denovo.head()


ParseResult(scheme='', netloc='', path='/scripts/cdrh/cfdocs/cfpcd/classification.cfm', params='', query='start_search=1&productcode=QPF', fragment='')
ParseResult(scheme='', netloc='', path='/scripts/cdrh/cfdocs/cfcfr/cfrsearch.cfm', params='', query='fr=882.1491', fragment='')
ParseResult(scheme='', netloc='', path='/scripts/cdrh/cfdocs/cfpcd/classification.cfm', params='', query='start_search=1&productcode=QPF', fragment='')
ParseResult(scheme='https', netloc='www.accessdata.fda.gov', path='/cdrh_docs/pdf20/DEN200069.pdf', params='', query='', fragment='')
ParseResult(scheme='', netloc='', path='/scripts/cdrh/cfdocs/cfpcd/classification.cfm', params='', query='start_search=1&productcode=QNP', fragment='')
ParseResult(scheme='', netloc='', path='/scripts/cdrh/cfdocs/cfcfr/cfrsearch.cfm', params='', query='fr=876.1520', fragment='')
ParseResult(scheme='', netloc='', path='/scripts/cdrh/cfdocs/cfpcd/classification.cfm', params='', query='start_search=1&productcode=QNP', fragment='')
Pa

Unnamed: 0,De Novo Number,Classification Advisory Committee,FDA Review,Regulation Number,Requester,Device Name,510(K) Number,Type,Date Received,Expedited Review,Review Advisory Committee,Decision,Device Classification Name,Decision Date,Reclassification Order,Contact,Classification Product Code
0,DEN200069,Neurology,,[ 882.1491 ],"Cognoa, Inc. 2185 park blvd. ...",Cognoa ASD Diagnosis Aid,,Direct,11/03/2020,,Neurology,granted (DENG),[ pediatric autism spectrum disorder diagno...,06/02/2021,[ Reclassification Order ],sophie dessalle,[ QPF ]
1,DEN200055,Gastroenterology/Urology,,[ 876.1520 ],"Cosmo Artificial Intelligence - AI, LTD ...",GI Genius,,Direct,09/08/2020,,Gastroenterology/Urology,granted (DENG),[ gastrointesinal lesion software detection...,04/09/2021,[ Reclassification Order ],steven a. kradjian,[ QNP ]
2,DEN200038,Cardiovascular,[ Decision Summary ],[ 870.2786 ],ContinUse Biometrics Ltd. habarzel 3...,Gili Pro BioSensor (also known as “Gili BioSen...,,Direct,06/12/2020,,Cardiovascular,granted (DENG),[ hardware and software for optical camera-...,04/01/2021,[ Reclassification Order ],sagi polani,[ QOK ]
3,DEN200019,Cardiovascular,,[ 870.2785 ],Oxehealth Limited magdalen center no...,Oxehealth Vital Signs,,Direct,03/27/2020,,Cardiovascular,granted (DENG),[ software for optical camera-based measure...,03/26/2021,[ Reclassification Order ],hugh lloyd-jukes,[ QME ]
4,DEN200022,Cardiovascular,,[ 870.2220 ],"Fifth Eye Inc. 110 miller avenue, su...",Analytic for Hemodynamic Instability (AHI),,Direct,04/03/2020,,Cardiovascular,granted (DENG),[ adjunctive hemodynamic indicator with dec...,03/01/2021,[ Reclassification Order ],jennifer a baird,[ QNV ]


In [None]:
df_submission_510k = df_submissions[df_submissions.submission_type == '510k']
submission_510k_headers = set()
for i, row in df_submission_510k.iterrows():
    soup = BeautifulSoup(open(os.path.join('data', 'submission_html', f'{row.submission_number}_table.htm')))
    headers = []
    for header in soup.findAll('th'):
        headers.append(header.string)
    submission_510k_headers = submission_510k_headers | set(headers)

print(list(submission_510k_headers))

In [17]:
# download submission pdfs (DOES NOT WORK, LINK IS DYNAMIC BY FIRST SUB DATE?)
# import requests

# url = 'http://www.hrecos.org//images/Data/forweb/HRTVBSH.Metadata.pdf'
# r = requests.get(url, stream=True)

# with open('/tmp/metadata.pdf', 'wb') as fd:
#     for chunk in r.iter_content(chunk_size):
#         fd.write(chunk)

for i, row in df_submissions.iterrows():

    year = row.date_of_final_decision[-2:]

    url = f'https://www.accessdata.fda.gov/cdrh_docs/pdf{year}/{row.submission_number}.pdf'
    res = requests.get(url)

    data_path = os.path.join('data', 'submission_pdf')
    os.makedirs(data_path, exist_ok=True)

    with open(os.path.join(data_path, f'{row.submission_number}.pdf'), 'wb') as f:
        f.write(res.content)
    
    print(f'{url} {i+1}/{len(df_submissions)}')
    time.sleep(2)
    
        

https://www.accessdata.fda.gov/cdrh_docs/pdf21/K203514.pdf 1/343
https://www.accessdata.fda.gov/cdrh_docs/pdf21/K202718.pdf 2/343
https://www.accessdata.fda.gov/cdrh_docs/pdf21/K210484.pdf 3/343
https://www.accessdata.fda.gov/cdrh_docs/pdf21/K203629.pdf 4/343
https://www.accessdata.fda.gov/cdrh_docs/pdf21/DEN200069.pdf 5/343
https://www.accessdata.fda.gov/cdrh_docs/pdf21/K210237.pdf 6/343
https://www.accessdata.fda.gov/cdrh_docs/pdf21/K210001.pdf 7/343


KeyboardInterrupt: 

In [4]:
cols = []
data = []

for i, pc in enumerate(product_codes):

    url = product_code_url + pc
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    
    data_path = os.path.join('data', 'product_code_html')
    os.makedirs(data_path, exist_ok=True)
    
    with open(os.path.join(data_path, f'{pc}.htm'), 'w') as f:
        f.write(str(soup))

    print(url)

#     tables = soup.findAll('table')
#     table = tables[4]

#     if i == 0:
#         for row in table.findAll('tr'):
#             for header in row.findAll('th'):
#                 cols.append(header.text)

#     row_data = []
#     for row in table.findAll('tr'):
#         for header in row.findAll('th'):
#             row_data.append(header.findNext('td').text.replace('\t', '').replace('\n', '').replace('\r', ''))
        
#     data.append(row_data)
#     print(url, len(cols), len(row_data))
#     print(cols)
#     print(row_data)
#     print('---')

# df_products = pd.DataFrame(data, columns=cols)
# df_products.head()

https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpcd/classification.cfm?start_search=1&amp;productcode=JAK
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpcd/classification.cfm?start_search=1&amp;productcode=LLZ
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpcd/classification.cfm?start_search=1&amp;productcode=MXD
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpcd/classification.cfm?start_search=1&amp;productcode=PIB
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpcd/classification.cfm?start_search=1&amp;productcode=QPF
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpcd/classification.cfm?start_search=1&amp;productcode=QAS
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpcd/classification.cfm?start_search=1&amp;productcode=KPS
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpcd/classification.cfm?start_search=1&amp;productcode=QIH
https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpcd/classification.cfm?start_search=1&amp;productcode=QKB
h