# Load UpToDate JSON data

In [1]:
import json

# Read the raw export into memory. The encoding is passed explicitly because
# the records contain non-ASCII text (e.g. Arabic and Japanese titles) and
# open() otherwise falls back to the platform's locale encoding, which is not
# guaranteed to be UTF-8.
with open('uptodate.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

#### Number of records

In [2]:
# Number of top-level entries loaded from uptodate.json.
len(data)

18488

#### A sample record

In [3]:
# Show the record at index 100 as an example of the raw structure.
data[100]

{'_id': {'$oid': '61db0221594f5b96006a4db9'},
 'topicInfo': {'isDrugLandingPage': False,
  'languageDisplayNames': {'ar': 'Arabic',
   'en-US': 'English',
   'es-419': 'Spanish'},
  'translatedTopicInfos': [{'displayName': 'Arabic',
    'specialtyId': '12',
    'lastMajorUpdateMs': '1639612800000',
    'id': '15480',
    'type': 'medical',
    'subtype': 'medical_basics',
    'version': '11.1',
    'title': 'توعية المرضى: الذئبة (الأساسيات)',
    'languageCode': 'ar',
    'languageCodes': ['ar', 'zh-Hans', 'en-US', 'es-419'],
    'url': '/contents/ar/lupus-the-basics',
    'translatedTitles': {'ja': '患者教育：全身性エリテマトーデス（簡易）'}},
   {'displayName': 'English',
    'specialtyId': '12',
    'lastMajorUpdateMs': '1639612800000',
    'id': '15480',
    'type': 'medical',
    'subtype': 'medical_basics',
    'version': '12.0',
    'title': 'Patient education: Lupus (The Basics)',
    'languageCode': 'en-US',
    'languageCodes': ['ar', 'zh-Hans', 'en-US', 'es-419'],
    'url': '/contents/lupus-th

# Removing useless info and separating paragraphs

In [4]:
from bs4 import BeautifulSoup

def get_paper_info(row):
    """Pick the identifying fields of a record out of its ``topicInfo`` entry.

    Args:
        row: One record of the loaded data; must contain a ``'topicInfo'``
            mapping.

    Returns:
        dict with the keys ``'id'``, ``'type'``, ``'subtype'`` and ``'title'``,
        each copied unchanged from ``row['topicInfo']``.

    Raises:
        KeyError: if ``'topicInfo'`` or any of the four fields is missing.
    """
    topic_info = row['topicInfo']
    wanted_fields = ('id', 'type', 'subtype', 'title')
    return {field: topic_info[field] for field in wanted_fields}

def get_paragraphs(row, parser='html.parser', min_length=50):
    """Split a record's ``bodyHtml`` into ``(index, subtitle, text)`` paragraphs.

    The direct children of ``<div id="topicText">`` are walked in order. A
    child whose class list contains ``headingAnchor`` starts a new paragraph
    whose subtitle is the text of the element right after it; the text of
    every other child is appended to the current paragraph. Text that comes
    before the first heading is collected with a subtitle of ``None``.

    Args:
        row: One record of the loaded data; must contain ``'bodyHtml'``.
        parser: Parser name handed to BeautifulSoup. It is fixed explicitly so
            the result does not depend on which optional parsers happen to be
            installed (BeautifulSoup otherwise picks the "best" one available
            and emits a warning).
        min_length: A paragraph is kept only if its text is strictly longer
            than this many characters.

    Returns:
        A list of ``(index, subtitle, text)`` tuples with a 1-based index that
        counts kept paragraphs only, or ``None`` when the document cannot be
        used: no ``headingAnchor`` div, no ``topicText`` div, or a direct child
        of ``topicText`` that is not a tag.
    """
    return_value = []
    idx = 1

    def add_parag(title, text):
        # Keep the paragraph only if it is long enough; idx advances solely
        # for kept paragraphs so the numbering has no gaps.
        nonlocal idx
        if len(text) > min_length:
            return_value.append((idx, title, text))
            idx += 1

    soup = BeautifulSoup(row['bodyHtml'], parser)

    # Drop every <span class="glyph"> so its content does not end up in the
    # extracted text.
    for glyph in soup.find_all("span", {'class': 'glyph'}):
        glyph.decompose()

    if len(soup.find_all("div", {'class': 'headingAnchor'})) == 0:
        return None

    parags = soup.find("div", {"id": "topicText"})
    if parags is None:
        # No text container: treat it like the other unusable-document cases
        # instead of failing with AttributeError on ``.children``.
        return None

    parag_subtitle = None
    parag_text = ''

    for child in parags.children:
        # NOTE(review): a bare text node directly under topicText makes the
        # whole document be skipped. This is kept as-is; confirm it is
        # intended rather than a per-node `continue`.
        if not hasattr(child, 'get'):
            return None

        child_class = child.get("class")
        if child_class is not None and 'headingAnchor' in child_class:
            # A new heading closes the paragraph collected so far.
            add_parag(parag_subtitle, parag_text)

            # The element right after the anchor carries the heading text;
            # remove it once read so it is not repeated in the body text.
            heading_element = child.next_element
            parag_subtitle = heading_element.text
            heading_element.decompose()
            # After the removal next_element points at whatever followed the
            # heading; drop it too when it is the heading end marker.
            if child.next_element is not None and child.next_element.has_attr("class") and 'headingEndMark' in child.next_element.get("class"):
                child.next_element.decompose()
            parag_text = child.text
        else:
            parag_text = parag_text + child.text

    # Flush the last paragraph, which has no following heading to close it.
    add_parag(parag_subtitle, parag_text)

    return return_value

In [5]:
import tqdm
# Run the extraction over every record, keeping only those for which
# get_paragraphs produced a result (it returns None for unusable documents).
all_res = []
for row in tqdm.tqdm(data):
    info = get_paper_info(row)
    parags = get_paragraphs(row)
    if parags is None:
        continue
    all_res.append(dict(info=info, parags=parags))
        

100%|██████████| 18488/18488 [04:27<00:00, 69.08it/s] 


#### Number of remaining records

In [6]:
# Records kept after extraction (those where get_paragraphs returned None were dropped).
len(all_res)

9975

# Convert the cleaned data into a DataFrame to save as CSV

In [14]:
import pandas as pd

def all_res_row_convert(row):
    """Flatten one cleaned record into a list of per-paragraph dicts.

    Args:
        row: Mapping with ``'info'`` (providing ``'id'`` and ``'title'``) and
            ``'parags'``, a sequence of ``(index, subtitle, text)`` entries.

    Returns:
        One dict per paragraph with the keys ``paper_id``, ``parag_id``,
        ``title``, ``subtitle`` and ``text``; an empty list when the record
        has no paragraphs.
    """
    records = []
    for parag in row['parags']:
        records.append(dict(
            paper_id=row['info']['id'],
            parag_id=parag[0],
            title=row['info']['title'],
            subtitle=parag[1],
            text=parag[2],
        ))
    return records

def convert_to_df(rows):
    """Build a DataFrame with one row per paragraph across all given records.

    Args:
        rows: Iterable of cleaned records accepted by ``all_res_row_convert``.

    Returns:
        ``pd.DataFrame`` built from the concatenated per-paragraph dicts, in
        input order.
    """
    flattened = [
        record
        for row in rows
        for record in all_res_row_convert(row)
    ]
    return pd.DataFrame(flattened)

In [15]:
# Write every extracted paragraph to disk; index=False leaves the DataFrame
# row index out of the CSV.
all_parags_df = convert_to_df(all_res)
all_parags_df.to_csv('all_parags.csv', index=False)

# Sample from patient education papers for eval data

In [16]:
# Sampling parameters. The seed is fixed so that re-running the notebook
# yields the same evaluation set; without it every run draws a different one.
EVAL_SAMPLE_FRAC = 1 / 13
EVAL_SAMPLE_SEED = 42

# Keep only records whose title starts with "patient education"
# (case-insensitive).
pe = [
    res for res in all_res
    if res['info']['title'].lower().startswith('patient education')
]

pe_df = convert_to_df(pe)
pe_df_sample = (
    pe_df
    .sample(frac=EVAL_SAMPLE_FRAC, random_state=EVAL_SAMPLE_SEED)
    .reset_index(drop=True)
)

In [17]:
# Number of paragraphs in the sampled evaluation set.
pe_df_sample.shape[0]

537

In [18]:
# Display the sampled evaluation paragraphs.
pe_df_sample

Unnamed: 0,paper_id,parag_id,title,subtitle,text
0,883,4,Patient education: Prostate cancer screening (...,Ethnic background,African American men develop prostate cancer m...
1,4014,17,Patient education: The common cold in adults (...,The Basics,The Basics patient education pieces answer the...
2,3438,11,Patient education: Raynaud phenomenon (Beyond ...,WHERE TO GET MORE INFORMATION,Your health care provider is the best source o...
3,727,8,Patient education: Transient ischemic attack (...,Brain imaging,Depending upon the results of the history and ...
4,8418,4,Patient education: Infertility treatment with ...,WHO SHOULD CONSIDER INFERTILITY TREATMENT WITH...,There are two categories of women who may bene...
...,...,...,...,...,...
532,4009,6,Patient education: Acute sinusitis (sinus infe...,ACUTE SINUSITIS TREATMENT,The primary treatment for sinusitis involves s...
533,4004,15,Patient education: Acute bronchitis in adults ...,Professional level information,Professional level articles are designed to ke...
534,8419,12,Patient education: Hormonal methods of birth c...,Side effects,Possible side effects of the pill include:Naus...
535,6722,3,Patient education: Care during pregnancy for w...,General measures to control blood glucose,Women with type 2 diabetes who have been treat...


In [19]:
# Save the sampled paragraphs as the evaluation set; index=False leaves the
# DataFrame row index out of the CSV.
pe_df_sample.to_csv('eval_parags.csv', index=False)