# Web Based Data Project Python Script

### Importing Libraries and connecting to the server

In [108]:
from tqdm import tqdm
import time
from nltk.stem.porter import *
from nltk.corpus import stopwords
import string
import re
import simple_icd_10 as icd
import urllib
from nltk.stem import WordNetLemmatizer

In [2]:
#https://stackoverflow.com/questions/13652230/cant-get-entrez-to-return-mesh-terms-using-biopython
#pip install biopython
from Bio import Entrez
from Bio.Entrez import efetch, read
Entrez.email = "arthur.hughes27@outlook.com" 

In [3]:
from pymed import PubMed
import pandas as pd
#import scispacy
#import spacy
#from spacy import displacy
#from collections import Counter
#import en_core_web_sm
#from nltk import word_tokenize
import csv
pubmed = PubMed(email="arthur.hughes27@outlook.com")

### Find PubMed articles with keyword "cardiovascular disease" and the available molecules

In [6]:
query = "cardiovascular disease, air pollution[TITLE]"
results= pubmed.query(query, max_results=10)
#save 50 results 
articleList= []
for article in results:   #for loop to collect articles
  articleDict = article.toDict()
  articleList.append(articleDict)

In [13]:
df= pd.DataFrame(articleList) #save to dataframe
df.head(10)[['pubmed_id', 'title']]

Unnamed: 0,pubmed_id,title
0,36498110,Air Pollution: Possible Interaction between th...
1,36494401,Ecological study of ambient air pollution expo...
2,36462796,Is telomere length in buccal or salivary cells...
3,36458358,Estimating the burden of disease attributable ...
4,36456430,Air Pollution and Cardiovascular Disease Burde...
5,36446272,Long-term exposure to traffic-related air poll...
6,36434404\n33309175\n15298232\n33657686\n255987...,Air Pollution and the Heart: Updated Evidence ...
7,36416472,Does long-term exposure to air pollution suppr...
8,36413929,Association between household air pollution fr...
9,36409413,Association between household air pollution an...


## retrieve links from pubmed papers using query

### helper function

In [37]:
def mesh_helper(pmid):
    # call PubMed API
    handle = efetch(db='pubmed', id=str(pmid), retmode='xml')
    xml_data = read(handle)['PubmedArticle'][0]

    # skip articles without MeSH terms
    if u'MeshHeadingList' in xml_data['MedlineCitation']:
        for mesh in xml_data['MedlineCitation'][u'MeshHeadingList']:
            # grab descriptor name
            name = mesh['DescriptorName'].title()
            descr = [mesh['DescriptorName']]
            # grab descriptor id
            mesh_id = list(descr[0].attributes.items())[0][1]
            major = list(descr[0].attributes.items())[1][1]

            yield(name, mesh_id, major)

            
def get_mesh(pmid_lst, link_lst):
    dlst = {}
    l = 0
    for pmid in tqdm(pmid_lst):
        time.sleep(0.3)
        dlst_key = pmid[0:8]
        helper = mesh_helper(pmid)
        dlst_val = {}
        for name, mesh_id, major in helper:
            if major == "Y":
                dlst_val[mesh_id] = name
        dlst_val['link'] = link_lst[l]
        dlst[dlst_key]=dlst_val
        l += 1
                
    return dlst

def get_link_from_abstract(abstract_lst):
    res_lst = []
    for abstract in abstract_lst:
        # 1. lowercase everything
        text = abstract.lower()
        stemmer = PorterStemmer()
        # sentence by sentence
        lines = text.split(". ")
        # store sentence
        res = []
        for l in lines:
            orig_l = l
            # 2. Removing punctuation
            l = l.translate(str.maketrans('', '', string.punctuation))
            # 3. stemming
            l = [stemmer.stem(x) for x in l.split()]   
            l =' '.join(l)
            # 4. association word
            yes = r"(associ|relat|caus|lead|increas|decreas|result|show|link|affect)"
            # 5. something we dont want
            no = r"(investig|can|object|now|recent|whether|worldwid)"
            if re.findall(yes, l) != [] and re.findall(no, l) == []:
                res.append(orig_l)
        res_lst.append(res)
    return res_lst

### main function

In [None]:
def query_to_link(query, max_num):
    try:
        results= pubmed.query(query, max_results=max_num)    
        articleList= []
        for article in results:
          articleDict = article.toDict()
          articleList.append(articleDict)

        df= pd.DataFrame(articleList) 
        pmid_lst = df.pubmed_id
        link_lst = get_link_from_abstract(df.abstract)
        
        return get_mesh(pmid_lst, link_lst)
    except:
        print("No papers were found. Please modify your query.")

In [121]:
## example
query_to_link("cardiovascular disease, air pollution[TITLE]", 10)

100%|███████████████████████████████████████████| 10/10 [00:11<00:00,  1.20s/it]


{'36498110': {'link': ['air pollution exposure already starts in utero before birth, potentially causing delayed chronic diseases arising later in life',
   'there are, indeed, time windows during the life of individuals who are more susceptible to air pollution exposure, which may result in more severe outcomes']},
 '36494401': {'link': ['our finding demonstrated that cardiovascular diseases in elderly males and females in iran had a general decreasing trend (aapc\u2009=\u2009-0.77% and -0.65%, respectively)',
   'the results showed a positive correlation between exposure to ambient ozone pollution (p\u2009≤\u20090.001, r\u2009=\u20090.94) ambient particulate and air pollution (p\u2009<\u20090.001, r\u2009=\u20090.99) and mortality of cardiovascular disease',
   'evidence from this study indicated that ambient air pollution, directly and indirectly, affects cardiovascular disease mortality in two ways by increasing the prevalence of some traditional cardiovascular disease risk factors

## Validate Predicted Links thru CTD

### helper function

In [116]:
def pollutant_ctd_disease(pollutant, num):
    try:
        if pollutant == "O3":
            pollutant_mesh_id = "D010126"
        if pollutant.startswith("PM"):
            pollutant_mesh_id = "D052638"
        if pollutant == "NO2":
            pollutant_mesh_id = "D009585"
        if pollutant == "NO":
            pollutant_mesh_id = "D009569"
        if pollutant == "SO2":
            pollutant_mesh_id = "D013458"
        if pollutant == "CO":
            pollutant_mesh_id = "D002248"
        if pollutant == "NOx":
            pollutant_mesh_id = "D009589"
    

        base = "http://ctdbase.org/detail.go?acc="
        # we look into cardiovascular disease only; inferenece are sorted in the order of inference score
        tail = "&view=disease&slimTerm=Cardiovascular+disease&assnType=all&sort=networkScore&6578706f7274=1&type=chem&dir=asc&d-1332398-e=5"
        url =  base+pollutant_mesh_id+tail
        res = pd.read_table(url)[['Disease Name', 'Disease ID', 'Inference Score']]
        if num <= len(res.index):
            return(res.head(num))
        else:
            print("The requested number of output exceeded the maximum. The maximum number of output can be returned is {}.".format(len(res.index)))
        
    except:
        print("Make sure the input is one of (O3, PM0.1, PM2.5, PM10, NO, NO2, SO2, CO, NOx)")

def map_ctd(ctd_disease_lst, pred_disease):
    res = []
    stemmer = PorterStemmer()
    pred_disease = pred_disease.translate(str.maketrans('', '', string.punctuation))
    pred_disease = [stemmer.stem(x) for x in pred_disease.split()]
    
    for d in ctd_disease_lst:
        ctd_d = d.lower()
        ctd_d = ctd_d.translate(str.maketrans('', '', string.punctuation))
        ctd_d = ctd_d.split()
        stop_words = ["diseas"]
        
        ctd_d = [stemmer.stem(x) for x in ctd_d if stemmer.stem(x) not in stop_words ]
       
        for t in ctd_d:
            if t in pred_disease:
                res.append(d)
                break
    return res


In [None]:
## example
pollutant_ctd_disease("O3", 10)

### main function

In [117]:
def pred_to_ctd(pred_csv, num=10):
    # pred_csv is the csv file name of your prediction from TransE
    # get_parents: return the parent of each disease
    pred = open(pred_csv, 'r')
    pred = csv.reader(pred, delimiter = '\t')
    res = []
    for row in pred:
        time.sleep(0.3)
        pollutant = row[0]
        icd_code = row[1]
        disease = icd.get_description(icd_code).lower()
        
        ctd = pollutant_ctd_disease(pollutant, num)
        
        ctd_res = map_ctd(ctd['Disease Name'], disease)
        
        if ctd_res != []:
            res.append([pollutant, icd_code, disease, ";".join(ctd_res)])
        else:
            res.append([pollutant, icd_code, disease, "no match"])

    res = pd.DataFrame(res, columns=["Pollutant", "ICD 10 Code", "Disease", "CTD"])
    return res

In [118]:
pred_res = pred_to_ctd('res.csv', 10)
pred_res

Unnamed: 0,Pollutant,ICD 10 Code,Disease,CTD
0,PM2.5,I51.4,"myocarditis, unspecified",no match
1,O3,I31.9,"disease of pericardium, unspecified",no match
2,NO,I47.2,ventricular tachycardia,no match
3,NO,I51.4,"myocarditis, unspecified",no match
4,O3,I50.0,congestive heart failure,"Heart Failure, Diastolic;Heart Failure;Heart V..."
5,PM2.5,I47.2,ventricular tachycardia,no match
6,NOx,I44.3,other and unspecified atrioventricular block,no match
7,O3,I50.9,"heart failure, unspecified","Heart Failure, Diastolic;Heart Failure;Heart V..."
8,NO,I70.9,generalized and unspecified atherosclerosis,Atherosclerosis
9,NO2,I44.3,other and unspecified atrioventricular block,no match


## retrive links for those unmatched prediction from last step

In [131]:
un_match = pred_res.loc[ pred_res["CTD"] == "no match"]
un_match

Unnamed: 0,Pollutant,ICD 10 Code,Disease,CTD
0,PM2.5,I51.4,"myocarditis, unspecified",no match
1,O3,I31.9,"disease of pericardium, unspecified",no match
2,NO,I47.2,ventricular tachycardia,no match
3,NO,I51.4,"myocarditis, unspecified",no match
5,PM2.5,I47.2,ventricular tachycardia,no match
6,NOx,I44.3,other and unspecified atrioventricular block,no match
9,NO2,I44.3,other and unspecified atrioventricular block,no match
11,NOx,I48,atrial fibrillation and flutter,no match
14,O3,I95.9,"hypotension, unspecified",no match
15,NO,I74.4,embolism and thrombosis of arteries of extremi...,no match


In [145]:
def pred_to_link(pollutant, disease, num=10):
    disease = disease.translate(str.maketrans('', '', string.punctuation))
    stopword = stopwords.words("english")
    stopword.extend(["unspecified"])
    disease = [x for x in disease.split() if x not in stopword]
    disease = ' '.join(disease)
    query = pollutant + " " + disease
    return query_to_link(query, num)

In [146]:
pred_to_link("O3", "other and unspecified atrioventricular block" )

100%|█████████████████████████████████████████████| 1/1 [00:03<00:00,  3.21s/it]


{'22138703': {'D006339': 'Heart Rate',
  'D019570': 'Inhalation Exposure',
  'link': ['a separate cohort was tested for vulnerability to aconitine-induced arrhythmia 24 hr after exposure.\nexposure to 0.8 ppm o₃ caused bradycardia, pr prolongation, st depression, and substantial increases in atrial premature beats, sinoatrial block, and atrioventricular block, accompanied by concurrent increases in several hr variability parameters that were suggestive of increased parasympathetic tone',
   'however, both 0.2 and 0.8 ppm o₃ increased sensitivity to aconitine-induced arrhythmia formation, suggesting a latent o₃-induced alteration in myocardial excitability.\no₃ exposure causes several alterations in cardiac electrophysiology that are likely mediated by modulation of autonomic input to the heart',
   'moreover, exposure to low o₃ concentrations may cause subclinical effects that manifest only when triggered by a stressor, suggesting that the adverse health effects of ambient levels of ai

In [148]:
stemmer.stem("myocardial")

'myocardi'

### NLP - Find potential links

In [None]:
key_sentences=[]
search_keywords=['associated']
for abstract in df.abstract:
    if abstract is not None:
        sentences = abstract.split(".")
        for sentence in sentences:
            if sum(1 for word in search_keywords if word in sentence)>0:
                key_sentences.append(sentence.lower())

In [None]:
potential_links=[]
search_keywords=['NO2','O3','particulate','pollutant','air pollution','SO2','CO2','PM2.5','PM10','CO','Carbon','Oxygen']
for element in key_sentences:
    if sum(1 for word in search_keywords if word in element)>0:
        potential_links.append(element)
df2=pd.DataFrame(potential_links)
df2.to_csv('sentences.csv',index=False)

In [None]:
df2