In [1]:
import bs4
import regex as re
import requests
import unicodedata
import time
import concurrent.futures
import pandas as pd

In [2]:
# Base address of the wiki; also used to turn relative hrefs into absolute URLs.
URL = 'https://en.wikipedia.org'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as the index.
response = requests.get(URL + '/wiki/List_of_ICD-9_codes', timeout=30)
response.raise_for_status()
html = response.text
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (and to avoid bs4's parser warning).
soup = bs4.BeautifulSoup(html, 'html.parser')

In [3]:
# Absolute URLs of every link on the index page whose href matches the
# per-range "List_of_ICD-9_codes_..." pattern.
chapter_link = re.compile('/wiki/List_of_ICD-9_codes_')
icd9_pages = [URL + anchor['href'] for anchor in soup.find_all(href=chapter_link)]

In [4]:
# Show the collected list-page URLs as a quick visual check.
icd9_pages

['https://en.wikipedia.org/wiki/List_of_ICD-9_codes_001%E2%80%93139:_infectious_and_parasitic_diseases',
 'https://en.wikipedia.org/wiki/List_of_ICD-9_codes_140%E2%80%93239:_neoplasms',
 'https://en.wikipedia.org/wiki/List_of_ICD-9_codes_240%E2%80%93279:_endocrine,_nutritional_and_metabolic_diseases,_and_immunity_disorders',
 'https://en.wikipedia.org/wiki/List_of_ICD-9_codes_280%E2%80%93289:_diseases_of_the_blood_and_blood-forming_organs',
 'https://en.wikipedia.org/wiki/List_of_ICD-9_codes_290%E2%80%93319:_mental_disorders',
 'https://en.wikipedia.org/wiki/List_of_ICD-9_codes_320%E2%80%93389:_diseases_of_the_nervous_system_and_sense_organs',
 'https://en.wikipedia.org/wiki/List_of_ICD-9_codes_390%E2%80%93459:_diseases_of_the_circulatory_system',
 'https://en.wikipedia.org/wiki/List_of_ICD-9_codes_460%E2%80%93519:_diseases_of_the_respiratory_system',
 'https://en.wikipedia.org/wiki/List_of_ICD-9_codes_520%E2%80%93579:_diseases_of_the_digestive_system',
 'https://en.wikipedia.org/wiki/

In [5]:
def parse_bullet(bullet, url=URL, recursive=False):
    """Pull the code label and article links out of a single list item.

    Links are looked up inside the item's ``<b>`` element when one is found,
    otherwise inside the item itself. The text of the first link is returned
    as the code; every later link whose href contains ``/wiki/`` is returned
    as an absolute URL formed by prefixing ``url``.

    Args:
        bullet: element to inspect (must offer ``find`` / ``find_all``).
        url: prefix prepended to each collected href.
        recursive: forwarded to every lookup; ``False`` limits each search
            to direct children.

    Returns:
        ``(icd9_code, urls)``; ``(None, [])`` when no link is present.
    """
    bold = bullet.find('b', recursive=recursive)
    container = bold if bold else bullet
    links = container.find_all(href=True, recursive=recursive)
    if not links:
        return None, []

    code_link, *other_links = links
    article_urls = [
        url + link['href']
        for link in other_links
        if '/wiki/' in link['href']
    ]
    return code_link.text, article_urls

def _nested_items(item):
    """Return the direct ``<li>`` children of the first ``<ul>`` found under ``item`` ([] if none)."""
    nested = item.find('ul')
    if nested is None:
        return []
    return nested.find_all('li', recursive=False)


def parse_page(page):
    """Turn one fetched list page into rows of codes and their article links.

    Walks the ``<ul>`` elements that are direct children of the
    ``mw-parser-output`` container and descends at most two further list
    levels below each top-level item.

    Args:
        page: response-like object whose ``.text`` holds the page HTML.

    Returns:
        A list of ``(code, parent_code, urls, parent_urls)`` tuples:
        * top-level item: ``(code, code, urls, [])``
        * first nested level: ``(code, parent_code, urls, parent_urls)``
        * second nested level: ``(code, parent_code, urls, parent_urls + urls_of_level_one)``
        ``parent_code`` is always the top-level item's code and may be ``None``
        when that item has no link. Items without a code are skipped, but
        their nested lists are still visited.

    Raises:
        ValueError: if the page has no ``mw-parser-output`` container.
    """
    # Explicit parser: keeps the parse tree independent of which optional
    # parsers are installed on the machine running the notebook.
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    body = soup.find(class_='mw-parser-output')
    if body is None:
        # Previously this surfaced as an opaque AttributeError on None.
        raise ValueError(
            "no 'mw-parser-output' container found in page "
            f"{getattr(page, 'url', '<unknown>')}"
        )

    data = []
    for section in body.find_all('ul', recursive=False):
        for bullet in section.find_all('li', recursive=False):
            # Top-level items are parsed with the default recursive=False.
            parent_code, parent_urls = parse_bullet(bullet)
            if parent_code:
                data.append((parent_code, parent_code, parent_urls, []))
            for sub_bullet in _nested_items(bullet):
                code, urls = parse_bullet(sub_bullet, recursive=True)
                if code:
                    data.append((code, parent_code, urls, parent_urls))
                for sub_bullet2 in _nested_items(sub_bullet):
                    code2, urls2 = parse_bullet(sub_bullet2, recursive=True)
                    if code2:
                        data.append((code2, parent_code, urls2, parent_urls + urls))
    return data

def get_content(url, timeout=30):
    """Download ``url`` and return its paragraph and headline text as one cleaned string.

    NOTE: a later cell defines ``get_content(url, texts, statuses)``; once
    that cell has run, the name ``get_content`` refers to that version and
    this one-argument helper is no longer reachable.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The text of every ``<p>`` and every ``span.mw-headline`` joined with
        spaces, with bracketed spans (e.g. ``[1]``) removed, newlines dropped
        and the result NFKD-normalized.
    """
    # Without a timeout a stalled connection would block forever.
    r = requests.get(url, timeout=timeout)
    # Explicit parser so results do not depend on which parsers are installed.
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    paragraphs = [str(paragraph.text) for paragraph in soup.find_all('p')]
    headers = [str(header.text) for header in soup.find_all('span', class_='mw-headline')]
    # Interleave paragraph/headline pairs as before, then append whatever is
    # left of the longer list: a bare zip() silently discarded everything past
    # the shorter list (and *all* text when a page had no headlines).
    paired = min(len(paragraphs), len(headers))
    parts = [val for pair in zip(paragraphs, headers) for val in pair]
    parts += paragraphs[paired:] + headers[paired:]
    text = ' '.join(parts)
    text = re.sub(r"\[.*?\]+", '', text)
    text = text.replace('\n', '')
    text = unicodedata.normalize('NFKD', text)
    return text

In [6]:
# Scrape every list page and accumulate the rows produced by parse_page.
data = []
for url in icd9_pages:
    print(url)
    # Timeout: do not let one stalled request block the whole loop.
    page = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page as a code list.
    page.raise_for_status()
    data += parse_page(page)

https://en.wikipedia.org/wiki/List_of_ICD-9_codes_001%E2%80%93139:_infectious_and_parasitic_diseases
https://en.wikipedia.org/wiki/List_of_ICD-9_codes_140%E2%80%93239:_neoplasms
https://en.wikipedia.org/wiki/List_of_ICD-9_codes_240%E2%80%93279:_endocrine,_nutritional_and_metabolic_diseases,_and_immunity_disorders
https://en.wikipedia.org/wiki/List_of_ICD-9_codes_280%E2%80%93289:_diseases_of_the_blood_and_blood-forming_organs
https://en.wikipedia.org/wiki/List_of_ICD-9_codes_290%E2%80%93319:_mental_disorders
https://en.wikipedia.org/wiki/List_of_ICD-9_codes_320%E2%80%93389:_diseases_of_the_nervous_system_and_sense_organs
https://en.wikipedia.org/wiki/List_of_ICD-9_codes_390%E2%80%93459:_diseases_of_the_circulatory_system
https://en.wikipedia.org/wiki/List_of_ICD-9_codes_460%E2%80%93519:_diseases_of_the_respiratory_system
https://en.wikipedia.org/wiki/List_of_ICD-9_codes_520%E2%80%93579:_diseases_of_the_digestive_system
https://en.wikipedia.org/wiki/List_of_ICD-9_codes_580%E2%80%93629:_d

In [7]:
def get_text(url, timeout=30):
    """Download ``url`` and return its cleaned text together with the HTTP status code.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up.

    Returns:
        ``(text, status_code)`` where ``text`` is the text of every ``<p>``
        and every ``span.mw-headline`` joined with spaces, with bracketed
        spans (e.g. ``[1]``) removed, newlines dropped and the result
        NFKD-normalized. The status code is returned rather than raised so
        the caller can decide how to record failures.
    """
    # Without a timeout a stalled connection would block its worker forever.
    r = requests.get(url, timeout=timeout)
    # Explicit parser so results do not depend on which parsers are installed.
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    paragraphs = [str(paragraph.text) for paragraph in soup.find_all('p')]
    headers = [str(header.text) for header in soup.find_all('span', class_='mw-headline')]
    # Interleave paragraph/headline pairs as before, then append whatever is
    # left of the longer list: a bare zip() silently discarded everything past
    # the shorter list (and *all* text when a page had no headlines).
    paired = min(len(paragraphs), len(headers))
    parts = [val for pair in zip(paragraphs, headers) for val in pair]
    parts += paragraphs[paired:] + headers[paired:]
    text = ' '.join(parts)
    text = re.sub(r"\[.*?\]+", '', text)
    text = text.replace('\n', '')
    text = unicodedata.normalize('NFKD', text)
    return text, r.status_code

def get_content(url, texts, statuses):
    """Fetch ``url`` into the shared ``texts`` cache unless it already holds text.

    Args:
        url: address to fetch.
        texts: dict mapping URL -> text; updated in place.
        statuses: list that receives ``url`` when the response status is not 200.

    A URL whose cached value is missing or empty is (re)fetched. After each
    fetch the function pauses for a quarter of a second.
    """
    if texts.get(url):
        # Non-empty text is already cached: nothing to fetch, no pause.
        return
    body, status = get_text(url)
    texts[url] = body
    if status != 200:
        statuses.append(url)
    time.sleep(0.25)
        
def get_contents(urls, texts):
    """Fetch every URL in ``urls`` concurrently, filling ``texts`` in place.

    Args:
        urls: sized collection of URLs to fetch.
        texts: dict mapping URL -> text, shared with the workers and updated
            by ``get_content``.

    Returns:
        The list of URLs that did not fetch cleanly: those ``get_content``
        reported (non-200 status) plus those whose worker raised an exception.

    The pool size is ``min(MAX_THREADS, len(urls))``; ``MAX_THREADS`` is a
    module-level name looked up when this function is called.
    """
    statuses = []
    if not urls:
        # ThreadPoolExecutor rejects max_workers=0, so an empty batch is a no-op.
        return statuses
    threads = min(MAX_THREADS, len(urls))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        pending = {
            executor.submit(get_content, url, texts, statuses): url
            for url in urls
        }
        # executor.map() with an unconsumed result hid worker exceptions
        # entirely; record the failing URL instead so the caller can retry it.
        for future in concurrent.futures.as_completed(pending):
            if future.exception() is not None:
                statuses.append(pending[future])
    return statuses

In [8]:
# Distinct article URLs attached directly to any scraped code
# (the third field of each row; parent URLs are not included).
all_urls = list({link for _, _, links, _ in data for link in links})

In [9]:
# Number of distinct article URLs collected above.
len(all_urls)

2466

In [10]:
%%time
# Upper bound on worker threads; get_contents reads this global when it is called.
MAX_THREADS = 30
# Maps URL -> extracted text; filled in place by the workers. Re-running this
# cell starts from an empty dict, so everything is fetched again.
texts = {}
# URLs that get_contents flagged as problematic (e.g. non-200 responses).
statuses = get_contents(all_urls, texts)

Wall time: 6min 48s


In [26]:
def all_urls(x):
    """Return the distinct items found across all the lists contained in ``x``.

    NOTE: defining this function rebinds the name ``all_urls``, which an
    earlier cell uses for the list of article URLs to download.

    Args:
        x: any iterable of iterables, e.g. a pandas Series whose values are
            lists of URLs.

    Returns:
        A list holding each distinct item once, in no particular order.
    """
    # Plain iteration replaces ``x.apply`` with a side-effecting list
    # comprehension, and works for any iterable rather than only a Series.
    return list({itm for lst in x for itm in lst})

In [32]:
# One row per scraped list entry.
df = pd.DataFrame(data, columns=['code', 'parent_code', 'urls', 'parent_urls'])
# NOTE(review): this used to group by 'code_', a column the frame above does
# not have (KeyError). Grouping by 'code' is assumed here — confirm that
# 'parent_code' was not the intended key.
# Selecting the 'urls' column before apply yields a Series already named
# 'urls', so reset_index() gives the columns ['code', 'urls'] directly.
basic_codes = (
    df.groupby('code')['urls']
    .apply(all_urls)
    .reset_index()
)

In [39]:
def get_all_codes(article):
    """List the codes in ``basic_codes`` whose URL collection contains ``article``.

    Rows are scanned in table order; each row is read as
    ``(index, code, urls)``.

    Args:
        article: URL to look for.

    Returns:
        The matching codes, in row order (empty list when none match).
    """
    return [
        code
        for _, code, urls in basic_codes.itertuples()
        if article in urls
    ]

In [47]:
# Export one record per fetched article: a header line carrying the article's
# codes (each prefixed with "d_"), the article text, then an end marker.
with open("wiki_2", 'wb') as out_file:
    for article, text in texts.items():
        labels = ' '.join(f'd_{code}' for code in get_all_codes(article))
        header = f'XXXdiseaseXXX  {labels}\n'
        footer = '\nXXXendXXX\n'
        # Encoding the concatenation yields the same UTF-8 bytes as encoding
        # and writing the three pieces one after another.
        out_file.write((header + text + footer).encode('utf8'))