In [None]:
import datetime
import os
import pickle

import numpy as np
import pandas as pd
from Bio import Entrez, Medline

# We can use the provided search terms to query PubMed with Entrez esearch.
# This gives us a list of pmids to retrieve in MEDLINE format.
# This function takes the search terms as a query string and returns a dictionary.
# The dict keys are: the date of the search, the query string used, the total count returned and the list of pmids.
def search_terms_to_pmid_list(query_string, email, api_key, retmax=90000):
    """Run an Entrez esearch against PubMed and collect the matching pmids.

    Parameters
    ----------
    query_string : str
        The search term passed unchanged to esearch.
    email : str
        Contact email assigned to ``Entrez.email``.
    api_key : str
        NCBI api key assigned to ``Entrez.api_key``.
    retmax : int, optional
        Maximum number of ids requested per esearch call (default 90000).
        When the reported total count is larger than this, further requests
        are made with an advancing ``retstart``.

    Returns
    -------
    dict
        ``{'date': str, 'search_term': str, 'total_count': int, 'pmids': list}``
        where ``date`` is ``year_month_day_hour_minute`` of the search.

    Side effects
    ------------
    Sets ``Entrez.email``, ``Entrez.tool`` and ``Entrez.api_key`` and pickles
    the returned dictionary to ``./{date}.p``.
    """
    # set the entrez variables which are set up in advance for each group/project
    Entrez.email = email
    Entrez.tool = "genepopi_search_developer"
    Entrez.api_key = api_key

    # timestamp of the search, also used as the name of the pickle written below
    date = datetime.datetime.today()
    date = f'{date.year}_{date.month}_{date.day}_{date.hour}_{date.minute}'

    def _fetch_page(retstart):
        # one esearch request; the handle is always closed, even if parsing fails
        handle = Entrez.esearch(db="pubmed", term=query_string, retmax=retmax, retstart=retstart)
        try:
            return Entrez.read(handle)
        finally:
            handle.close()

    # first request: gives the total count and the first page of pmids
    search_results = _fetch_page(0)
    t_count = int(search_results['Count'])
    pmids = list(search_results['IdList'])

    # check if we need to batch the pmid retrieval because of the per-request limit
    if t_count > retmax:
        print(f"Your search query has returned more than {retmax:,} results\nWe will need to batch the pmid retrieval")

        # retstart is a 0-based offset, so the next unseen record sits at index len(pmids)
        start = len(pmids)
        while start < t_count:
            batch = list(_fetch_page(start)['IdList'])
            if not batch:
                # the server returned nothing more; stop instead of looping forever
                break
            pmids.extend(batch)
            start += len(batch)

    if len(pmids) < t_count:
        # make truncation visible instead of silently returning a partial list
        print(f"Warning: only {len(pmids)} of {t_count} pmids were retrieved for '{query_string}'")

    # construct the output dict
    results_d = {'date': date, 'search_term': query_string, 'total_count': t_count, 'pmids': pmids}

    # save the output dictionary for our records of what terms used and number of records returned for a given date.
    # NOTE: the file name only has minute resolution, so searches run within the same minute overwrite each other.
    with open(f'./{date}.p', 'wb') as pickle_file:
        pickle.dump(results_d, pickle_file)

    return results_d

In [None]:
# Set your email and your NCBI api_key.
# **** register your email and generate an api key here https://www.ncbi.nlm.nih.gov/account/settings/
# Both values are taken from the NCBI_EMAIL / NCBI_API_KEY environment variables when those are set,
# so real credentials do not have to be saved inside the notebook. Otherwise the placeholders
# below are used and must be replaced before running the searches.
email = os.environ.get('NCBI_EMAIL', 'email_address')
api_key = os.environ.get('NCBI_API_KEY', 'api_key')

In [None]:
# load the DDG2P table ('path_to_ddg2p.csv' is a placeholder for the real file location)
ddg2p_df = pd.read_csv('path_to_ddg2p.csv')
# unique gene symbols: missing values are dropped and the result is sorted so that
# the gene list (and every search derived from it) has the same order on every run
ddg2p_genes = sorted(set(ddg2p_df['gene symbol'].dropna()))

## gene symbol

In [None]:
# work on an independent copy so the DDG2P gene list itself is left untouched
genes = list(ddg2p_genes)

In [None]:
# search PubMed for every plain gene symbol; keep the hit count and pmids per gene
# in a dictionary that is turned into a dataframe afterwards
gene_d = {}
for gene in genes:
    search_result = search_terms_to_pmid_list(gene, email, api_key)
    gene_d[gene] = {
        'count': search_result['total_count'],
        'pmids': search_result['pmids'],
    }

    # also write this gene's pmids to its own file as one comma separated string
    with open(f'{gene}_pmids.txt', 'w+') as pmid_file:
        pmid_file.write(','.join(search_result['pmids']))

In [None]:
# now build the df to look at the counts and pmids quickly
gene_df = pd.DataFrame.from_dict(gene_d, orient='index')
# write the pickle through a context manager so the file is flushed and closed straight away
with open('gene_df.p', 'wb') as pickle_file:
    pickle.dump(gene_df, pickle_file)
gene_df.head()

## gene symbol[ti]

In [None]:
# restrict each gene symbol to the title field by appending the [TI] tag
genes_ti = [f'{symbol}[TI]' for symbol in genes]

In [None]:
# run the title-restricted search for every gene; keep the hit count and pmids per
# search term in a dictionary that is turned into a dataframe afterwards
gene_ti_d = {}
for gene in genes_ti:
    search_result = search_terms_to_pmid_list(gene, email, api_key)
    gene_ti_d[gene] = {
        'count': search_result['total_count'],
        'pmids': search_result['pmids'],
    }

    # also write this term's pmids to its own file as one comma separated string
    with open(f'{gene}_ti_pmids.txt', 'w+') as pmid_file:
        pmid_file.write(','.join(search_result['pmids']))

In [None]:
# now build the df to look at the counts and pmids quickly

gene_ti_df = pd.DataFrame.from_dict(gene_ti_d, orient='index')
# write the pickle through a context manager so the file is flushed and closed straight away
with open('gene_ti_df.p', 'wb') as pickle_file:
    pickle.dump(gene_ti_df, pickle_file)

In [None]:
# pool the pmids of every title search, de-duplicate them and save them as one string
all_pmids = [pmid for gene_pmids in gene_ti_df['pmids'] for pmid in gene_pmids]
print(f'there are {len(all_pmids)} pmids found from our {len(gene_ti_df)} gene search')

# a set removes the pmids that were returned for more than one gene
cadmus_pmids = list(set(all_pmids))
print(f'There are {len(cadmus_pmids)} unique pmids to get metadata for')

with open('cadmus_ti_pmids.txt', 'w+') as out_file:
    out_file.write(','.join(cadmus_pmids))


## gene symbol [tiab]

In [None]:
# restrict each gene symbol to the title/abstract fields by appending the [TIAB] tag
genes_tiab = [f'{symbol}[TIAB]' for symbol in genes]

# run the search for every term; keep the hit count and pmids per term in a
# dictionary that is turned into a dataframe afterwards
gene_tiab_d = {}
for gene in genes_tiab:
    search_result = search_terms_to_pmid_list(gene, email, api_key)
    gene_tiab_d[gene] = {
        'count': search_result['total_count'],
        'pmids': search_result['pmids'],
    }

    # also write this term's pmids to its own file as one comma separated string
    with open(f'{gene}_tiab_pmids.txt', 'w+') as pmid_file:
        pmid_file.write(','.join(search_result['pmids']))

In [None]:
# now build the df to look at the counts and pmids quickly

gene_tiab_df = pd.DataFrame.from_dict(gene_tiab_d, orient='index')
# write the pickle through a context manager so the file is flushed and closed straight away
with open('gene_tiab_df.p', 'wb') as pickle_file:
    pickle.dump(gene_tiab_df, pickle_file)
# show the 20 search terms with the most hits
gene_tiab_df.sort_values(by='count', ascending=False).head(20)

In [None]:
# pool the pmids of every title/abstract search, de-duplicate them and save them as one string
all_pmids = [pmid for gene_pmids in gene_tiab_df['pmids'] for pmid in gene_pmids]
print(f'there are {len(all_pmids)} pmids found from our {len(gene_tiab_df)} gene search')

# a set removes the pmids that were returned for more than one gene
cadmus_pmids = list(set(all_pmids))
print(f'There are {len(cadmus_pmids)} unique pmids to get metadata for')

with open('cadmus_tiab_pmids.txt', 'w+') as out_file:
    out_file.write(','.join(cadmus_pmids))

## gene[mesh] gene symbol[tiab]

In [None]:
# combine the gene[MESH] term with each gene symbol restricted to title/abstract
genes_mesh_tiab = [f'gene[MESH] {symbol}[TIAB]' for symbol in genes]

# run the search for every term; keep the hit count and pmids per term in a
# dictionary that is turned into a dataframe afterwards
gene_mesh_tiab_d = {}
for gene in genes_mesh_tiab:
    search_result = search_terms_to_pmid_list(gene, email, api_key)
    gene_mesh_tiab_d[gene] = {
        'count': search_result['total_count'],
        'pmids': search_result['pmids'],
    }

    # also write this term's pmids to its own file as one comma separated string
    with open(f'{gene}_mesh_tiab_pmids.txt', 'w+') as pmid_file:
        pmid_file.write(','.join(search_result['pmids']))

In [None]:
# now build the df to look at the counts and pmids quickly

gene_mesh_tiab_df = pd.DataFrame.from_dict(gene_mesh_tiab_d, orient='index')
# save under its own name: writing to 'gene_tiab_df.p' would overwrite the pickle of the plain [TIAB] search
# the context manager makes sure the file is flushed and closed straight away
with open('gene_mesh_tiab_df.p', 'wb') as pickle_file:
    pickle.dump(gene_mesh_tiab_df, pickle_file)
# show the 20 search terms with the most hits
gene_mesh_tiab_df.sort_values(by='count', ascending=False).head(20)

In [None]:
# pool the pmids of every gene[MESH] + title/abstract search, de-duplicate them and save them as one string
all_pmids = [pmid for gene_pmids in gene_mesh_tiab_df['pmids'] for pmid in gene_pmids]
print(f'there are {len(all_pmids)} pmids found from our {len(gene_mesh_tiab_df)} gene search')

# a set removes the pmids that were returned for more than one gene
cadmus_pmids = list(set(all_pmids))
print(f'There are {len(cadmus_pmids)} unique pmids to get metadata for')

with open('cadmus_mesh_tiab_pmids.txt', 'w+') as out_file:
    out_file.write(','.join(cadmus_pmids))