In [None]:
import os
import pickle
import time

import numpy as np
import pandas as pd
from Bio import Entrez, Medline

In [None]:
def pmids_to_medline_file(file_name, pmid_list, email, api_key):
    """Download the MEDLINE records for a list of PMIDs into a text file.

    The PMIDs are de-duplicated and posted once to the NCBI history server
    (``epost``); the records are then pulled back in batches with ``efetch``
    and appended to ``medline_files/<file_name>.txt``. That file can afterwards
    be read with the Biopython MEDLINE parser.

    Parameters
    ----------
    file_name : str
        Stem of the output file (``medline_files/<file_name>.txt``).
    pmid_list : iterable of str or int
        PubMed identifiers to retrieve. Duplicates, ``None`` and blank entries
        are dropped; the first-seen order of the remaining ids is kept.
    email : str
        Contact address handed to NCBI through ``Entrez.email``.
    api_key : str
        NCBI API key handed to NCBI through ``Entrez.api_key``.

    Returns
    -------
    None
        The records are written to disk; nothing is returned. When no usable
        PMID is supplied the function returns without contacting NCBI.

    Raises
    ------
    Exception
        The error raised by the final download attempt of a batch, once every
        attempt for that batch has failed.
    """
    # Normalise before anything else: str.join() needs strings, and there is
    # no point configuring Entrez or touching the disk for an empty request.
    new_pmids = list(dict.fromkeys(
        str(pmid).strip()
        for pmid in pmid_list
        if pmid is not None and str(pmid).strip()
    ))
    if not new_pmids:
        print(f"No PMIDs supplied for search: {file_name}; nothing to download")
        return

    # set up the Entrez variables for the group/project
    Entrez.email = email
    Entrez.tool = "genepopi_search_developer"
    Entrez.api_key = api_key

    # Post the ids to the history server once; the returned WebEnv/QueryKey
    # pair lets every efetch call below refer to that posted set instead of
    # sending the full id list in each request.
    post_handle = Entrez.epost(db="pubmed", id=",".join(new_pmids))
    try:
        search_results = Entrez.read(post_handle)
    finally:
        post_handle.close()
    web_env = search_results['WebEnv']
    query_key = search_results['QueryKey']

    t_count = len(new_pmids)
    # number of records requested per efetch call
    batch_size = 500
    # occasional server errors are expected, so each batch is tried this many times
    max_attempts = 3

    out_path = f"medline_files/{file_name}.txt"
    # make sure the output folder exists before opening the file
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # "a+" appends: running again with the same file_name adds to the existing
    # file rather than overwriting it. The with-block closes the file even if
    # a batch ultimately fails.
    with open(out_path, "a+") as out_handle:
        for start in range(0, t_count, batch_size):
            end = min(t_count, start + batch_size)
            # give some feedback on the process
            print(f"Going to download record {start+1} to {end} out of {t_count} for search: {file_name}")
            for attempt in range(1, max_attempts + 1):
                try:
                    fetch_handle = Entrez.efetch(db="pubmed", rettype="medline",
                                                 retmode="text", retstart=start,
                                                 retmax=batch_size,
                                                 webenv=web_env,
                                                 query_key=query_key)
                    # reading is part of the attempt so a dropped connection
                    # is retried as well
                    try:
                        data = fetch_handle.read()
                    finally:
                        fetch_handle.close()
                    break
                except Exception as err:
                    print(f"Attempt {attempt} of {max_attempts} failed for record {start+1} to {end}: {err}")
                    if attempt == max_attempts:
                        # give up loudly instead of writing stale or missing data
                        raise
                    # short, growing pause before trying the same batch again
                    time.sleep(attempt)
            out_handle.write(data)
    print(f'Job Complete, output file path = {out_path}')

In [None]:
# NCBI credentials: the contact e-mail and its API key.
# Register the e-mail and generate a key at https://www.ncbi.nlm.nih.gov/account/settings/
api_key = "api_key"
email = "email"

# stem of the output text file
file_name = "medline"

# De-duplicated PMIDs to look up.
# NOTE(review): the quoted text looks like a placeholder for an existing list
# of PMIDs; as written, set() splits the string into its individual characters,
# so swap in the real list before running - confirm with the author.
pmid_list = list(set("previously_defined_list_of_pmids"))
print(f"The input file has {len(pmid_list)} pmids to get the metadata for")

pmids_to_medline_file(file_name, pmid_list, email, api_key)

In [None]:
# now we can read in the medline file to create a dataframe

In [None]:
import uuid
import pickle
import pandas as pd
import re

def _doi_from_tagged_ids(field):
    """Return the identifier tagged ``[doi]`` in a LID/AID value, or None."""
    if not field:
        return None
    # The value may arrive as a list of entries or as one string (e.g. when a
    # parser joins the lines of a field); wrap a string so it is not iterated
    # character by character.
    entries = [field] if isinstance(field, str) else field
    doi = None
    for entry in entries:
        # the identifier is the token directly in front of the "[doi]" tag
        tagged = re.findall(r'(\S+)\s*\[doi\]', entry)
        if tagged:
            # keep the last tagged value when several are present
            doi = tagged[-1]
    return doi


def get_medline_doi(record):
    """Find the DOI of a MEDLINE record.

    The lookup tries, in order: the ``LID`` field, the ``AID`` field, and
    finally a ``doi: <value>`` fragment inside the ``SO`` citation string.

    Parameters
    ----------
    record : mapping
        A Bio.Medline record or any dict-like object offering ``get()``.

    Returns
    -------
    str or None
        The plain DOI (no ``[doi]`` tag, no ``doi:`` prefix, no trailing full
        stop), or None when none of the three fields yields one.
    """
    for tag in ('LID', 'AID'):
        doi = _doi_from_tagged_ids(record.get(tag))
        if doi is not None:
            return doi

    # sometimes the doi is only present in the SO section and has to be cut
    # out of the citation text
    citation = record.get('SO')
    if isinstance(citation, str):
        found = re.search(r'doi:\s*(\S+)', citation)
        # a citation can contain the letters "doi" without a "doi: ..." part,
        # so a failed search is a normal outcome rather than an error
        if found:
            # dois do not end in a full stop; that is sentence punctuation
            doi = found.group(1).rstrip('.')
            return doi or None
    return None

def creation_retrieved_df(medline_file_name):
    """Parse a MEDLINE text file into a dataframe of article metadata.

    Each record produced by ``Medline.parse`` becomes one row, indexed by a
    freshly generated uuid4 hex string. A field that a record lacks is stored
    as None.

    Parameters
    ----------
    medline_file_name : str
        Path of the MEDLINE-format text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record with the fields pmid, pmcid, title, abstract,
        authors, journal, pub_type, pub_date (the DP value as read), doi,
        issn, gene_meta, mesh, comment_on, erratum_for and correct_repub.
    """
    # (column name, MEDLINE tag) pairs, split around 'doi' so that the columns
    # come out in the established order
    columns_before_doi = (
        ('pmid', 'PMID'),
        ('pmcid', 'PMC'),
        ('title', 'TI'),
        ('abstract', 'AB'),
        ('authors', 'AU'),
        ('journal', 'JT'),
        ('pub_type', 'PT'),
        ('pub_date', 'DP'),
    )
    columns_after_doi = (
        ('issn', 'IS'),
        ('gene_meta', 'GS'),
        ('mesh', 'MH'),
        ('comment_on', 'CON'),
        ('erratum_for', 'EFR'),
        ('correct_repub', 'CRF'),
    )

    rows_by_id = {}
    with open(medline_file_name, 'r') as medline_handle:
        # the biopython parser yields one dictionary-like record per article
        for entry in Medline.parse(medline_handle):
            # get() gives None for any tag the record does not carry
            row = {column: entry.get(tag) for column, tag in columns_before_doi}
            # the doi may sit in several fields, so it has its own lookup
            row['doi'] = get_medline_doi(entry)
            row.update((column, entry.get(tag)) for column, tag in columns_after_doi)
            # unique hexadecimal id used as the row index
            rows_by_id[uuid.uuid4().hex] = row

    pm_df = pd.DataFrame.from_dict(rows_by_id, orient='index')

    print('Process Complete')
    return pm_df

In [None]:
# Parse the downloaded MEDLINE file into the metadata dataframe ...
meta_df = creation_retrieved_df("medline_files/medline.txt")
# ... and save it tab-separated, with the header row and the index column included
meta_df.to_csv("medline_files/meta_df.tsv", index=True, header=True, sep="\t")

In [None]:
# Keep a pickled copy of the dataframe as well. The with-block closes the file
# as soon as the dump is finished instead of leaving the handle open.
with open('metadata_df.p', 'wb') as pickle_handle:
    pickle.dump(meta_df, pickle_handle)