In [1]:
# `display`/`HTML` are part of the public IPython.display API; importing them
# from IPython.core.display is deprecated. Other cells in this notebook already
# import from IPython.display.
from IPython.display import display, HTML

# Stretch the classic-notebook container to the full browser width so the wide
# result tables rendered further down are readable.
display(HTML("<style>.container {width:100% !important;}</style>"))

import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Make this cell idempotent: getLogger() returns the same object on every run,
# so without this clean-up each re-execution of the cell would stack another
# pair of handlers and every message would be written several times.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

# create file handler which logs even debug messages
# (mode='w' truncates log.html each time this cell is run)
fh1 = logging.FileHandler('log.html',mode='w')
fh1.setLevel(logging.DEBUG)

# create console handler with a higher log level
# NOTE: later cells read console_handler.level to decide between rich display()
# output and plain logging, so the name must stay `console_handler`.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.ERROR) #------------------change your log setting here DEBUG INFO WARNING ERROR CRITICAL

# Each record becomes one HTML paragraph; the closing tag must be </p>
# (the previous '<p> ... <p>' left every paragraph unclosed in log.html).
formatter = logging.Formatter('<p> %(asctime)s - %(name)s - %(levelname)s - %(message)s </p>' , datefmt='%d-%b-%y %H:%M:%S')
fh1.setFormatter(formatter)
console_handler.setFormatter(formatter)

# add the handlers to the logger
logger.addHandler(fh1)
logger.addHandler(console_handler)

In [2]:
"""
This is a modified version of code from: 
1. https://lawlesst.github.io/notebook/sparql-dataframe.html
2. https://github.com/SuLab/sparql_to_pandas/blob/master/SPARQL_pandas.ipynb

Demonstrating how to get JupyterLab working with Binder: 

https://github.com/binder-examples/jupyterlab
https://github.com/binder-examples/jupyter-extension/blob/master/index.ipynb
"""
import pandas as pd
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import ipywidgets as widgets
from urllib.parse import unquote

def get_sparql_dataframe(service, query):
    """
    Run a SPARQL query and return its result bindings as a Pandas data frame.

    service: URL of the SPARQL endpoint handed to SPARQLWrapper.
    query:   SPARQL query text.

    The frame has one column per variable listed in the response's
    head/vars section and one row per binding. A variable that is missing
    from a binding (e.g. an unmatched OPTIONAL) becomes None in that row.
    """
    endpoint = SPARQLWrapper(service)
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)

    # Parse the raw JSON response body ourselves.
    payload = json.load(endpoint.query().response)
    variables = payload['head']['vars']

    records = [
        [binding.get(name, {}).get('value') for name in variables]
        for binding in payload['results']['bindings']
    ]
    return pd.DataFrame(records, columns=variables)

#help: https://en.wikibooks.org/wiki/SPARQL/Wikidata_Query_Service_-_Introduction

# SPARQL endpoint that the query below is sent to.
wds = "https://query.wikidata.org/sparql"

# Selects every item with `wdt:P31 wd:Q12136` (presumably "instance of: disease"
# -- verify the identifiers on Wikidata), its English label, and -- when one
# exists -- the URL of the matching English Wikipedia article. Because the
# article lookup sits inside OPTIONAL, items without an English article are
# still returned, with ?WP_en_article unbound.
# The trailing "#order by ..." line is inside the string, i.e. it is a SPARQL
# comment sent to the server, not Python.
rq = """
SELECT ?WkD_disease ?WkD_diseaseLabel ?WP_en_article 
WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
  ?WkD_disease wdt:P31 wd:Q12136.
  
      OPTIONAL {
      ?WP_en_article schema:about ?WkD_disease .
      ?WP_en_article schema:inLanguage "en" .
      ?WP_en_article schema:isPartOf <https://en.wikipedia.org/> .
    }
}
#order by desc(?WkD_disease)
"""

# One row per binding; columns WkD_disease, WkD_diseaseLabel, WP_en_article.
# Later cells add result columns to this frame and read it as `df`.
df = get_sparql_dataframe(wds, rq)

In [3]:
"""
This is a modified version of code from: 
1. https://gist.github.com/bonzanini/5a4c39e4c02502a8451d
2. https://gist.github.com/bonzanini/5a4c39e4c02502a8451d

# Full discussion:
# https://marcobonzanini.wordpress.com/2015/01/12/searching-pubmed-with-python/
"""

from Bio import Entrez
from IPython.core.display import display, HTML
from urllib.error import HTTPError

import time

def search(index,query):
    """
    Run a PubMed search through Entrez.esearch and return the parsed result.

    index: row number of the disease being processed; used only as a prefix
           in the log message.
    query: PubMed query string passed as `term`.

    Returns whatever Entrez.read() produces for the esearch response; callers
    in this notebook read its 'IdList' entry. At most 200 ids are requested
    (retmax='200'), sorted by relevance.
    """
    Entrez.email = 'your.email@example.com'

    logger.debug (f"{index:>5}. PubMed search query: {query}")
    handle = Entrez.esearch(db='pubmed', sort='relevance', retmax='200', retmode='xml', term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying HTTP response even if parsing fails.
        handle.close()

def fetch_details(index, id_list, num_retries=1000, sleep_time=10, max_sleep_time=600):
    """
    Fetch the PubMed records for a list of PMIDs, retrying on failure.

    index:          row number of the disease being processed (log prefix only).
    id_list:        iterable of PMID strings; joined with ',' for Entrez.efetch.
    num_retries:    maximum number of attempts (must be >= 1).
    sleep_time:     seconds to wait after the first failed attempt; multiplied
                    by 1.5 after every further failure.
    max_sleep_time: upper bound, in seconds, for a single wait.

    Returns the object produced by Entrez.read() for the efetch response.
    Raises ValueError if num_retries < 1, otherwise re-raises the last
    exception once every attempt has failed.
    """
    if num_retries < 1:
        raise ValueError("num_retries must be at least 1")

    ids = ','.join(id_list)
    Entrez.email = 'your.email@example.com'

    # Python unbinds the `except ... as` name when the clause ends, so the last
    # failure has to be kept in a separate variable to be re-raised later.
    last_error = None
    for attempt in range(1, num_retries + 1):
        try:
            logger.debug (f"{index:>5}. Fetching article details for PMIDs: {ids}")
            handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
            try:
                return Entrez.read(handle)
            finally:
                handle.close()
        except Exception as error:
            last_error = error
            logger.error (f"{index:>5}. I got a HTTPError - reason {error}- while trying to fetch the articles details from pubmed")
            if attempt == num_retries:
                break  # no point in sleeping after the final attempt
            logger.error (f"{index:>5}. Sleeping for {sleep_time} seconds before trying again...")
            time.sleep(sleep_time)  # wait before trying to fetch the data again
            # Exponential back-off, capped so the wait cannot grow without bound.
            sleep_time = min(sleep_time * 1.5, max_sleep_time)

    logger.critical (f"{index:>5}. Retried fetching article's details {num_retries} times with no success!")
    raise last_error

def pubmed_search(index, disease, searchTitle, searchAbstract):
    """
    Search PubMed for non-withdrawn Cochrane reviews that mention `disease`.

    index:          row number of the disease being processed (log prefix only).
    disease:        search term, inserted verbatim into the PubMed query.
    searchTitle:    must be true; only title-based searches are implemented.
    searchAbstract: when true the term is matched against [Title/Abstract],
                    otherwise against [Title] only.

    Returns the 'IdList' entry of the result returned by search().
    Raises ValueError when searchTitle is false (previously this path fell
    through both branches and failed with an UnboundLocalError).
    """
    logger.debug (f"{index:>5}. Searching PubMed for {disease} in searchTitle:{searchTitle} , searchAbstract:{searchAbstract}")
    if not searchTitle:
        raise ValueError("searchTitle=False is not supported: only [Title] and [Title/Abstract] searches are implemented")

    field = "Title/Abstract" if searchAbstract else "Title"
    query = "("+disease+'['+field+'] NOT "withdrawn"[Title]) AND "The Cochrane database of systematic reviews"[Journal]'
    results = search(index, query)

    #pprint.pprint (results)
    return results['IdList']

# display(len(pubmed_search("Crohn's disease",True,True)))
# display(len(pubmed_search("Crohn's disease",True,False)))

In [4]:
# from fuzzywuzzy import fuzz
# from fuzzywuzzy import process
# import re

# def searh_wp_refs_4title(pubmedArticleTitle,WPpageTitle):
#     ''' looksup an article title in a WikiPedia page '''
#     #display (wikipedia.WikipediaPage(WPpageTitle).html())
#     WPpageHTML= wikipedia.WikipediaPage(WPpageTitle).html()
#     ratio = fuzz.ratio(WPpageHTML, pubmedArticleTitle)
#     print ("fuzz.ratio:",ratio)

#     WPpageHTML=re.sub(r'\W+', '', WPpageHTML.lower())
#     pubmedArticleTitle=re.sub(r'\W+', '', pubmedArticleTitle.lower())
#     print(pubmedArticleTitle)
#     if pubmedArticleTitle in WPpageHTML:
#         return True
#     else:
#         return False
    
#print (searh_wp_refs_4title("interventions to slow progression of myopia in children...","Near-sightedness"))

def searh_wp_refs_4PMID(PMID,en_article_HTML):
    ''' Looks up a PMID in the HTML of a Wikipedia page.

    PMID:            the PubMed id to look for (string or int).
    en_article_HTML: HTML text of the article.

    Returns True only when the id occurs as a complete number, i.e. not
    embedded in a longer run of digits. A plain substring test would report
    PMID "2216" as cited whenever the page contains "22161388".
    An empty PMID or empty/None HTML yields False.
    '''
    import re  # local import: this cell does not import `re` itself

    pmid = str(PMID).strip()
    if not pmid or not en_article_HTML:
        return False

    # Digit look-arounds instead of \b so that ids adjacent to letters or
    # underscores (e.g. "PMID22161388") are still found.
    pattern = r"(?<!\d)" + re.escape(pmid) + r"(?!\d)"
    return re.search(pattern, en_article_HTML) is not None
    
#en_article_HTML= wikipedia.WikipediaPage('Near-sightedness').html()
#print (searh_wp_refs_4PMID("22161388",en_article_HTML))


In [5]:
import wikipedia
from tqdm import tnrange, tqdm_notebook
from urllib.parse import unquote
from IPython.display import clear_output
from IPython.core.display import display, HTML

import ipywidgets as widgets
from ipywidgets import HBox, VBox

from operator import itemgetter
from collections import OrderedDict

import time

import html2text
h2t = html2text.HTML2Text()
h2t.ignore_links = True

import re

# ---------------------------------------------------------------------------
# Constants and helpers for the per-disease processing loop below
# ---------------------------------------------------------------------------
WIKIPEDIA_ARTICLE_PREFIX = "https://en.wikipedia.org/wiki/"

# search() asks PubMed for retmax='200'; receiving exactly this many ids is
# treated as "too many matches".
MAX_PUBMED_HITS = 200

CLASS_SUFFIX = "-class medicine articles"
IMPORTANCE_SUFFIX = "-importance medicine articles"
TASK_FORCE_SUFFIX = " task force articles"

# Importance labels mapped to sortable strings; anything else (NA, ???,
# Unknown, or an unexpected label) is stored as None.
IMPORTANCE_RANK = {
    'Low'  : '4-Low',
    'Mid'  : '3-Mid',
    'High' : '2-High',
    'Top'  : '1-Top',
}


def _load_article(index, title):
    """Return (article_html, talk_page_categories) for a Wikipedia title.

    A disambiguation page is resolved by taking its first option. On any
    PageError / KeyError / second DisambiguationError the problem is logged
    and whatever was obtained so far is returned: html defaults to "" and
    categories to None, so a failed lookup can never leak values from a
    previously processed row.
    """
    html, categories = "", None
    log = logger.info
    try:
        log(f"{index:>5}. Getting the WikiPedia content for: {title}")
        try:
            page = wikipedia.WikipediaPage(title)
        except wikipedia.exceptions.DisambiguationError as e:
            log = logger.warning  # everything after an ambiguity is logged as a warning
            log(f"{index:>5}. '{title}' is an Ambiguous title: {e.options}")
            log(f"{index:>5}. Picking the first sense in the list by default: {e.options[0]}")
            title = e.options[0]
            page = wikipedia.WikipediaPage(title)
        html = page.html()

        #checking for redirects (compare with spaces/underscores normalised)
        if title.replace(" ", "_").casefold() != page.title.replace(" ", "_").casefold():
            log(f"{index:>5}. Redirected to: {page.title}")

        log(f"{index:>5}. Getting the WikiPedia talkPage_categories for: {page.title}")
        categories = wikipedia.WikipediaPage("Talk:" + page.title).categories
    except wikipedia.exceptions.DisambiguationError as e:
        logger.error (f"{index:>5}. {title} - I got a DisambiguationError - reason: {e}")
    except wikipedia.exceptions.PageError as e:
        logger.error (f"{index:>5}. {title} - I got a PageError - reason: {e} - Article or its talk page does not exist")
    except KeyError as e:
        logger.error (f"{index:>5}. {title} - I got a KeyError  - reason: {e} - Article's Talk page has no Category")
    return html, categories


def _parse_talk_categories(categories):
    """Extract (class, importance, task_forces) from talk-page category names."""
    article_class, importance, task_forces = None, None, []
    for cat in categories or []:
        folded = cat.casefold()
        if folded.endswith(CLASS_SUFFIX):
            article_class = cat[:-len(CLASS_SUFFIX)]
        if folded.endswith(IMPORTANCE_SUFFIX):
            # .get(): an unexpected label must not abort the whole run
            importance = IMPORTANCE_RANK.get(cat[:-len(IMPORTANCE_SUFFIX)])
        if folded.endswith(TASK_FORCE_SUFFIX) and "wikiproject" not in folded:
            task_forces.append(cat[:-len(TASK_FORCE_SUFFIX)])
    return article_class, importance, task_forces


def _search_cochrane_pmids(index, term):
    """Search title/abstract; if the result page is full, prefer a title-only search.

    Falls back to the title/abstract hits when the title-only search is empty.
    """
    id_list = pubmed_search(index, term, True, True)
    if len(id_list) == MAX_PUBMED_HITS:
        logger.warning(f"{index:>5}. (a) {term} - Too many matches found (>{MAX_PUBMED_HITS})")
        logger.warning(f"{index:>5}. (b) restrcting search to Titles only (excluding Abstracts)")
        time.sleep(1)
        title_only = pubmed_search(index, term, True, False)
        if title_only:
            id_list = title_only
        else:
            logger.warning(f"{index:>5}. (a) {term} - Restrcting search to Titles only returned no results")
            logger.warning(f"{index:>5}. (b) reverting back to title/abstract")
            # keep the title/abstract hits already fetched above; no need to ask again
    return id_list


def _render_reviews_html(reviews):
    """Build the HTML list for one article and fill in each review's latest-version flag.

    `reviews` maps PMID -> [title, cited, latestVersion]. A review counts as the
    latest version when no other review with the same title (case-insensitive)
    has a larger PMID. Slot 2 of every entry is overwritten with that flag.
    """
    if not reviews:
        return "No matching publication found!"

    newest_pmid = {}
    for pmid, paper in reviews.items():
        key = str(paper[0]).lower()
        newest_pmid[key] = max(pmid, newest_pmid.get(key, pmid))

    parts = ['<div align="left" style="margin:0px;"><ol start="1" style="margin-left:0px">']
    bgc = "white"
    seen_latest = False
    for pmid, paper in reviews.items():
        title, cited = paper[0], paper[1]
        latest = (pmid == newest_pmid[str(title).lower()])
        paper[2] = latest

        if cited:
            color, cited_message = "green", "<b> [CITED] </b>"
        else:
            color, cited_message = "red", "<b> [NOT CITED] </b>"

        if latest:
            version_message = "<b> [LATEST Version] </b>"
            # alternate the background each time a new latest version starts
            if seen_latest:
                bgc = "#E0F5FE" if bgc == "white" else "white"
            seen_latest = True
        else:
            version_message = "<b> [OLD Version] </b>"
            if cited:
                color = "orange"
                version_message += "<b> [UPDATE NEEDED] </b>"
            else:
                color = "grey"

        parts.append(
            '<li style="padding:5px;color:'+color+';background-color:'+bgc+';">'
            + title + ' <a target="_blank" href="https://www.ncbi.nlm.nih.gov/pubmed/'
            + str(pmid) + '">PMID: ' + str(pmid) + '</a>' + cited_message + version_message + "</li>"
        )
    parts.append("</ol></div>")
    return "".join(parts)


# ---------------------------------------------------------------------------
# Result columns (filled in row by row below)
# ---------------------------------------------------------------------------
df['class']=None
df['importance']=None
df['taskForces']=None
df['cochrane_reviews_html']=None
df['cochrane_reviews']=None
df['talkPage_categories']=None

disease_search_log= widgets.HTML()
citation_search_log= widgets.HTML()

display(disease_search_log)
display(citation_search_log)

# Only diseases with an English Wikipedia article are processed, so the
# progress bar total matches the number of iterations.
articles = df[df['WP_en_article'].notnull()]

for index, row in tqdm_notebook(articles.iterrows(),desc='Progress',total=len(articles), unit="WkD_disease"):
    # NOTE: `row` is only read. Results are written with df.at[...] because
    # writing to the Series yielded by iterrows() is not guaranteed to update df.
    label = row['WkD_diseaseLabel']
    show_inline = console_handler.level <= logging.INFO

    if show_inline:
        display(HTML(f"<b>START of row {index:>5}</b>"))
    else:
        logger.info(f"<{index:>5}------------------------------------------------")

    disease_search_log.value = (
        f'<p style="color:blue;"><b>processing:</b> disease #{index:>5}'
        f' &emsp; {label} &emsp; {row["WkD_disease"]} &emsp; {row["WP_en_article"]} </p>'
    )
    WP_en_article_Title = unquote(unquote(row['WP_en_article'][len(WIKIPEDIA_ARTICLE_PREFIX):]))
    search_title = WP_en_article_Title.replace("_", " ")

    # --- Wikipedia: article HTML and talk-page categories --------------------
    WP_en_article_HTML, talk_categories = _load_article(index, WP_en_article_Title)
    df.at[index, 'talkPage_categories'] = talk_categories
    if console_handler.level == logging.DEBUG:
        display(talk_categories)
    else:
        logger.debug(talk_categories)

    logger.debug(f"{index:>5}. Extracting task forces, class, and importance")
    article_class, importance, task_forces = _parse_talk_categories(talk_categories)
    df.at[index, 'class'] = article_class
    df.at[index, 'importance'] = importance
    df.at[index, 'taskForces'] = task_forces

    quality_summary = f"{index:>5}. class: {article_class}, importance: {importance}, task forces: {task_forces}"
    if show_inline:
        display(HTML(quality_summary))
    else:
        logger.info(quality_summary)

    # --- PubMed: Cochrane reviews about this disease -------------------------
    logger.info(f"{index:>5}. searching Pubmed for WkD_diseaseLabel: {label}")
    id_list = _search_cochrane_pmids(index, label)

    # Nothing found under the Wikidata label: retry with the Wikipedia title,
    # unless it is the same text once a trailing "(qualifier)" is removed.
    if not id_list and label.lower() != re.sub(r" ?\([^)]+\)", "", search_title.lower()):
        logger.warning(f"{index:>5}. (a) searching for '{label}' returned {len(id_list)} results")
        logger.warning(f"{index:>5}. (b) searching for '{search_title}' instead")
        time.sleep(1)
        id_list = _search_cochrane_pmids(index, search_title)

    logger.info(f"{index:>5}. {len(id_list)} matching PMIDs found.")

    matches = 0
    reviews = {}
    if id_list:
        papers = fetch_details(index, id_list)
        for paper in papers['PubmedArticle']:
            articleTitle = paper['MedlineCitation']['Article']['ArticleTitle']
            PMID = int(paper['MedlineCitation']['PMID'])
            cited = searh_wp_refs_4PMID(str(PMID), WP_en_article_HTML)
            if cited:
                matches += 1
            reviews[PMID] = [articleTitle, cited, None]  # latest-version flag is set by _render_reviews_html

    # Sort by title (case-insensitive) so versions of the same review are adjacent.
    reviews = OrderedDict(sorted(reviews.items(), key=lambda item: str(item[1][0]).lower()))
    reviews_html = _render_reviews_html(reviews)
    df.at[index, 'cochrane_reviews'] = reviews
    df.at[index, 'cochrane_reviews_html'] = reviews_html

    if console_handler.level <= logging.DEBUG:
        display(HTML(reviews_html))
    else:
        logger.debug(reviews_html)

    citation_search_log.value = (
        f'<p style="color:green;"><b>processed</b>: disease #{index:>5} &emsp; '
        f'{label} &emsp; {row["WkD_disease"]} &emsp; {row["WP_en_article"]}</p> '
        f'<p style="color:green;"><b>{matches} of {len(id_list)}</b> '
        f'Cochrane reviews found (via PubMed) are cited in the Wikipedia article: {WP_en_article_Title}</p>'
    )

    if show_inline:
        display(HTML(f"<b>END of row {index:>5}</b><hr>"))
    else:
        logger.info(f"------------------------------------------------{index:>5}><hr>")

HTML(value='')

HTML(value='')

HBox(children=(IntProgress(value=0, description='Progress', max=4381, style=ProgressStyle(description_width='i…

KeyboardInterrupt: 

In [None]:
from IPython.display import Markdown
import qgrid

from  itertools import chain
from collections import Counter

def _hide_index(styler):
    """Hide a Styler's index using whichever API this pandas version offers."""
    if hasattr(styler, "hide"):
        return styler.hide(axis="index")
    return styler.hide_index()


def _value_count_table(values, label_column, ascending):
    """Two-column frame (label, 'Total count') of value counts, NaN/None included.

    Built from the counts' index and values directly so the result does not
    depend on how a given pandas version names the columns produced by
    value_counts().reset_index().
    """
    counts = values.value_counts(dropna=False)
    table = pd.DataFrame({label_column: counts.index, 'Total count': counts.values})
    return table.sort_values(by='Total count', ascending=ascending)


def _count_pmids(review_dicts, want_latest):
    """Aggregate per-PMID [times found, times cited, latest flag] over all articles.

    Only reviews whose latest-version flag equals `want_latest` are counted.
    """
    counts = {}
    for reviews in review_dicts:
        if not reviews:
            continue
        for pmid, paper in reviews.items():
            cited, latest = paper[1], bool(paper[2])
            if latest != want_latest:
                continue
            found, cited_total = counts.get(pmid, [0, 0, 0])[:2]
            counts[pmid] = [found + 1, cited_total + int(cited), int(latest)]
    return counts


def _display_pmid_summary(counts, pmid_column):
    """Show the per-PMID counts, most cited first, followed by a 'Total' row."""
    value_columns = ['Number of times appeared in search results',
                     'Number of times cited in Wikipedia',
                     'Latest version? (1=Yes, 0=No)']
    # Passing `columns` keeps the frame well-formed even when `counts` is empty.
    summary = pd.DataFrame.from_dict(counts, orient='index', columns=value_columns)\
        .reset_index().rename_axis('index')\
        .rename(columns={'index': pmid_column})\
        .sort_values(by='Number of times cited in Wikipedia', ascending=False)
    summary.loc['Total'] = ['N/A'] + [summary[column].sum() for column in value_columns]
    display(summary)


has_article = df['WP_en_article'].notnull()

display(Markdown("# Statistical Summaries:"))
display(Markdown(f"### Total number of diseases identified in Wikidata: {df.shape[0]}"))
display(Markdown(f"### Total number of diseases identified in Wikidata which have a corresponding article in Wikipedia:\
                  {df[has_article].shape[0]}"))
display(Markdown("-----------------------------------------------------"))

display(Markdown("### Wikipedia articles class:"))
display(_hide_index(_value_count_table(df.loc[has_article, 'class'], 'Article quality class', ascending=False).style))

display(Markdown("### Wikipedia articles importance:"))
display(_hide_index(_value_count_table(df.loc[has_article, 'importance'], 'Article quality category', ascending=True).style))


display(Markdown('### Wikipedia articles taskForces:'))
#display(pd.Series(Counter(chain.from_iterable( df.loc[df['taskForces'].notnull()]['taskForces'] ) )))
forces_count = {'NA':0}
for taskForces in df.loc[has_article, 'taskForces']:
    if taskForces:
        for taskForce in taskForces:
            forces_count[taskForce] = forces_count.get(taskForce, 0) + 1
    else:
        forces_count['NA'] += 1
display(_hide_index(pd.DataFrame(list(forces_count.items()), columns = ['taskForce', 'Total count']).style))
display(Markdown("-----------------------------------------------------"))


# `cochrane_reviews` holds a (possibly empty) dict for every processed article,
# so "not null" only means "processed". An article cites a review only when at
# least one of its entries has the cited flag (slot 1) set.
processed_reviews = [reviews for reviews in df.loc[has_article, 'cochrane_reviews'] if reviews is not None]
citing_articles = sum(
    1 for reviews in processed_reviews
    if any(paper[1] for paper in reviews.values())
)
display(Markdown(f"### Number of disease related Wikipedia articles which cite one or more Cochrane reviews: \
                 {citing_articles}"))
display(Markdown(f"### Number of disease related Wikipedia articles which do not cite any  Cochrane reviews: \
                 {len(processed_reviews) - citing_articles}"))
display(Markdown("-----------------------------------------------------"))


display(Markdown(f'### PMIDs statistical summary 1 (LATEST VERSIONS of Cochrane reviews found and/or cited):'))
_display_pmid_summary(_count_pmids(df['cochrane_reviews'], want_latest=True),
                      'PMIDs of Cochrane reviews (Latest versions only)')
display(Markdown("-----------------------------------------------------"))

display(Markdown(f'### PMIDs statistical summary 2 (OLD VERSIONS of Cochrane reviews found and/or cited):'))
_display_pmid_summary(_count_pmids(df['cochrane_reviews'], want_latest=False),
                      'PMIDs of Cochrane reviews (Old versions only)')
display(Markdown("-----------------------------------------------------"))

In [None]:
# import qgrid

# qgrid.show_grid(df[['WkD_disease', 'WkD_diseaseLabel', 'WP_en_article','class','importance','taskForces','cochrane_reviews_html']].head(100),
#                grid_options={'rowHeight': 500})


# def on_button_clicked(b):
#     display(df.style.format({'WkD_disease': make_clickable_wkd_items,'WP_en_article': make_clickable_WP_en_articles}))
# button = widgets.Button(description="Show Results")
# button.on_click(on_button_clicked)
# display(button)

# #--------------------------------------------------------------------------------
# """
# This is a modified version of code from: 
# 1. https://www.kaggle.com/rtatman/download-a-csv-file-from-a-kernel
# """    
# from IPython.display import HTML
# import pandas as pd
# import numpy as np
# import base64

# def create_download_link(df, title = "<b>Download Results in CSV Format<b>", filename = "data.csv"):
#     """
#     function that takes in a dataframe and creates a text link to  
#     download it (will only work for files < 2MB or so)
#     """  
#     csv = df.to_csv()
#     b64 = base64.b64encode(csv.encode())
#     payload = b64.decode()
#     html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
#     html = html.format(payload=payload,title=title,filename=filename)
#     return HTML(html)

# display(create_download_link(df))

#--------------------------------------------------------------------------------
def make_clickable_wkd_items(val):
    """Wrap a Wikidata entity URL in an anchor that opens in a new window.

    The link text is the URL with its first 31 characters removed.
    """
    link_text = val[31:]
    anchor_template = '<a target="_blank" href="{}">{}</a>'
    return anchor_template.format(val, link_text)

def make_clickable_taskForces(val):
    """Render task-force names as one linked paragraph each.

    Every name becomes a <p> holding an anchor (opening in a new window) to
    the WikiProject Medicine task-force page, with spaces in the name turned
    into underscores for the URL. Returns None for an empty or missing value.
    """
    if val:
        return "".join(
            f'<p><a target="_blank" href="https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/{taskForce.replace(" ","_")}_task_force">{taskForce}</a></p>'
            for taskForce in val
        )
    return None

def make_clickable_WP_en_articles(val):
    """Wrap a Wikipedia article URL in an anchor that opens in a new window.

    The link text is the URL minus its first 30 characters, with underscores
    shown as spaces and percent-escapes decoded twice. Returns None for an
    empty or missing value.
    """
    if val:
        readable_title = unquote(unquote(val[30:].replace("_", " ")))
        return '<a target="_blank" href="{}">{}</a>'.format(val, readable_title)
    return None
    
def list2text(list_):
    """Join the strings in `list_` into one comma-and-space separated string."""
    return ", ".join(list_)

from IPython.display import FileLink, FileLinks, HTML
import html2text

import os  # stdlib; needed only here, to make sure the output folder exists

# Neither DataFrame.to_csv() nor open() creates missing directories.
os.makedirs('./results', exist_ok=True)

h = html2text.HTML2Text()
h.ignore_links = False

# Plain-text copy for the CSV export: HTML review lists are converted to text
# and task-force lists are flattened to comma separated strings.
df_plainText =df.copy()
df_plainText['cochrane_reviews_html'] = [h.handle(text) if text is not None else  text for text in df['cochrane_reviews_html']]
df_plainText['taskForces'] = [list2text(text) if text is not None else  text for text in df['taskForces']]

df_plainText.to_csv('./results/data.csv', index=False)


# Table styles for the HTML rendering. ('bold' was misspelt 'blod' in the header
# rule, which makes the whole `font` declaration invalid CSS.)
th    = dict(selector="th",           props=[('text-align', 'left'),('font','bold 14px arial, sans-serif'),('vertical-align','top')])
rh    = dict(selector=".row_heading", props=[("text-align", "left"),('font','bold 14px arial, sans-serif'),('vertical-align','top')])

col0 = dict(selector=".col0", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','100px'),('vertical-align','top')])
col1 = dict(selector=".col1", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','130px'),('vertical-align','top')])
col2 = dict(selector=".col2", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','110px'),('vertical-align','top'),('word-wrap','break-word')])
col3 = dict(selector=".col3", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','60px'),('vertical-align','top'),('word-wrap','break-word')])
col4 = dict(selector=".col4", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','90px'),('vertical-align','top'),('word-wrap','break-word')])
col5 = dict(selector=".col5", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','90px'),('vertical-align','top'),('word-wrap','break-word')])

col6 = dict(selector=".col6", props=[("text-align", "left"),('font','12px arial, sans-serif'),('vertical-align','top')])

# for  index,row in df[['WkD_disease', 'WkD_diseaseLabel', 'WP_en_article','class','importance','taskForces','cochrane_reviews_html']]\
#         .head(10).rename_axis('index')\
#         .sort_values(by=['importance'],na_position='last').iterrows():
#     display(HTML(f"<h2>WikiData disease: {row['WkD_diseaseLabel']} {make_clickable_wkd_items(row['WkD_disease'])}</h2>"))
#     display(HTML(f"<h2>Corresponding Wikipedia article: {make_clickable_WP_en_articles(row['WP_en_article'])}</h2>"))
#     display(HTML(row['cochrane_reviews_html']))

# Styled view of the first 10 rows, most important articles first.
tabular_data=df[['WkD_disease', 'WkD_diseaseLabel', 'WP_en_article','class','importance','taskForces','cochrane_reviews_html']]\
        .head(10).rename_axis('index')\
        .sort_values(by=['importance'],na_position='last')\
        .style.set_table_styles([th,rh,col0,col1,col2,col3,col4,col5,col6])\
        .format({'WkD_disease': make_clickable_wkd_items,'WP_en_article': make_clickable_WP_en_articles,
                 'taskForces': make_clickable_taskForces})

# Styler.to_html() replaces the older Styler.render(); use whichever exists.
render_html = getattr(tabular_data, "to_html", None) or tabular_data.render
with open("./results/data.html", "w", encoding="utf-8") as f:
    f.write(render_html())

# Show the download links only after both files exist, so the listing is complete.
display(HTML('<b>Download tabular results in CSV or HTML format:</b>'),FileLinks('./results/',recursive=False),HTML('<hr/>'))

display(tabular_data)




In [None]:
# from wikidata.client import Client
# client = Client()  # doctest: +SKIP
# entity = client.get('Q1472', load=True)
# print (entity)
# print (entity.description)

# image_prop = client.get('P18')
# image = entity[image_prop]
# print (image)

# print(image.image_resolution)

# print(image.image_url)

