# Web Based Data Project Python Script

### Importing Libraries and connecting to the server

In [312]:
from tqdm import tqdm
import time

In [6]:
#https://stackoverflow.com/questions/13652230/cant-get-entrez-to-return-mesh-terms-using-biopython
#pip install biopython
from Bio import Entrez
from Bio.Entrez import efetch, read
Entrez.email = "arthur.hughes27@outlook.com" 

In [1]:
from pymed import PubMed
import pandas as pd
import scispacy
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from nltk import word_tokenize
import csv
pubmed = PubMed(email="arthur.hughes27@outlook.com")

### Find PubMed articles with keyword "cardiovascular disease" and the available molecules

In [283]:
query = "cardiovascular disease, air pollution[TITLE]"
results= pubmed.query(query, max_results=10)
#save 50 results 
articleList= []
for article in results:   #for loop to collect articles
  articleDict = article.toDict()
  articleList.append(articleDict)

In [366]:
df= pd.DataFrame(articleList) #save to dataframe
#df.head()
df.conclusions

0                                                 None
1                                                 None
2    The burden of disease from HAP due to cooking ...
3    At the state level, APACVD burden decreased as...
4    The available literature provides low to moder...
5                                                 None
6    Based on our findings, although physical fitne...
7                                                 None
8                                                 None
9                                                 None
Name: conclusions, dtype: object

## get MeSh term from PubMed papers with MajorTopicYN = 'Y'

In [363]:
def mesh_helper(pmid):
    # call PubMed API
    handle = efetch(db='pubmed', id=str(pmid), retmode='xml')
    xml_data = read(handle)['PubmedArticle'][0]

    # skip articles without MeSH terms
    if u'MeshHeadingList' in xml_data['MedlineCitation']:
        for mesh in xml_data['MedlineCitation'][u'MeshHeadingList']:
            # grab descriptor name
            name = mesh['DescriptorName'].title()
            descr = [mesh['DescriptorName']]
            # grab descriptor id
            mesh_id = list(descr[0].attributes.items())[0][1]
            major = list(descr[0].attributes.items())[1][1]

            yield(name, mesh_id, major)

            
def get_mesh(pmid_lst):
    dlst = {}
  #  l = 0
    for pmid in tqdm(pmid_lst):
        time.sleep(0.3)
        if len(pmid)==8:
            dlst_key = pmid
            helper = mesh_helper(pmid)
            dlst_val = {}
            for name, mesh_id, major in helper:
                if major == "Y":
                    dlst_val[mesh_id] = name
            if dlst_val != {}:
             #   dlst_val['conclusion'] = conclusion_lst[l]
                dlst[dlst_key]=dlst_val
      #  l += 1
                
    return dlst

def query_to_mesh(query, max_num):
    results= pubmed.query(query, max_results=max_num)
    articleList= []
    for article in results:
      articleDict = article.toDict()
      articleList.append(articleDict)
    
    df= pd.DataFrame(articleList) 
    pmid_lst = df.pubmed_id
  #  conclusion_lst = df.conclusions

    return get_mesh(pmid_lst)

In [367]:
query_to_mesh("ozone, disease[TITLE]", 30)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:21<00:00,  1.40it/s]


{'36458351': {'D010126': 'Ozone', 'D000397': 'Air Pollution'},
 '36265970': {'D000393': 'Air Pollutants',
  'D001249': 'Asthma',
  'D010126': 'Ozone'},
 '36113365': {'D059630': 'Mesenchymal Stem Cells',
  'D010126': 'Ozone',
  'D029424': 'Pulmonary Disease, Chronic Obstructive'},
 '36089140': {'D000544': 'Alzheimer Disease',
  'D004785': 'Environmental Pollutants',
  'D010126': 'Ozone'},
 '36004603': {'D000393': 'Air Pollutants',
  'D000397': 'Air Pollution',
  'D002318': 'Cardiovascular Diseases'},
 '35914398': {'D000393': 'Air Pollutants',
  'D017202': 'Myocardial Ischemia',
  'D010126': 'Ozone',
  'D052638': 'Particulate Matter'},
 '35803371': {'D000393': 'Air Pollutants',
  'D000397': 'Air Pollution',
  'D010126': 'Ozone',
  'D051436': 'Renal Insufficiency, Chronic'},
 '35780850': {'D000393': 'Air Pollutants',
  'D000397': 'Air Pollution',
  'D003324': 'Coronary Artery Disease',
  'D010126': 'Ozone'},
 '35544998': {'D000393': 'Air Pollutants',
  'D000397': 'Air Pollution',
  'D0012

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:18<00:00,  1.07it/s]


{'36458358': {'D003296': 'Cooking', 'D000397': 'Air Pollution'},
 '36458351': {'D010126': 'Ozone', 'D000397': 'Air Pollution'},
 '36447194': {'D000544': 'Alzheimer Disease'}}

### NLP - Find potential links

In [145]:
key_sentences=[]
search_keywords=['associated']
for abstract in df.abstract:
    if abstract is not None:
        sentences = abstract.split(".")
        for sentence in sentences:
            if sum(1 for word in search_keywords if word in sentence)>0:
                key_sentences.append(sentence.lower())

In [146]:
potential_links=[]
search_keywords=['NO2','O3','particulate','pollutant','air pollution','SO2','CO2','PM2.5','PM10','CO','Carbon','Oxygen']
for element in key_sentences:
    if sum(1 for word in search_keywords if word in element)>0:
        potential_links.append(element)
df2=pd.DataFrame(potential_links)
df2.to_csv('sentences.csv',index=False)

In [147]:
df2

Unnamed: 0,0
0,"in particular, air pollution was associated w..."
1,\nan increased exposure to any class of air po...
2,long-term exposure to air pollution has been a...
3,\nto investigate changes in serum metabolites ...
4,27 μg/m\nour study suggested that long-term ex...
...,...
784,"in the elderly, a 3-4% increase in daily deat..."
785,"in addition, it seems that older age groups a..."
786,air pollution was associated with each of the...
787,"in general, the gases, particularly co, but n..."
