In [None]:
# Look at the PubMed references in GenCC data

'''The url to the latest version of the data is here - https://search.thegencc.org/download/action/submissions-export-tsv'''

#download the data using the urllib library
import urllib.request

# Local file name for the export, and the GenCC endpoint that serves it
filename = 'gencc_data.tsv'
url = 'https://search.thegencc.org/download/action/submissions-export-tsv'

# Fetch the latest submissions export and write it to `filename`
urllib.request.urlretrieve(url=url, filename=filename)

In [None]:
#read the data into a pandas dataframe *you can see from the url that the data is tab separated*
import pandas as pd

# The export is tab-delimited, so parse it with a tab delimiter
df = pd.read_csv(filename, delimiter='\t')

# Report the table dimensions as (rows, columns)
print(df.shape)

# Preview the top five rows of the table
df.head(5)

In [None]:
# Build a gene -> PubMed ID table from the GenCC submissions.
# `submitted_as_pmids` holds the PubMed IDs cited by each submission; it is
# renamed to `pubmed_id` for brevity.
#
# The steps are chained so the cell is idempotent and never assigns into a
# slice of `df` (which can trigger pandas SettingWithCopy warnings):
#   1. keep only the two columns of interest
#   2. drop rows where either value is missing
#   3. strip ALL whitespace (spaces, tabs, newlines) from the ID strings
#   4. drop rows whose ID string is empty after stripping
#   5. remove duplicates -- done AFTER normalisation so that rows differing
#      only in whitespace collapse into one
#   6. renumber the index from 0
genccGene2PubMed = (
    df[['gene_symbol', 'submitted_as_pmids']]
    .rename(columns={'submitted_as_pmids': 'pubmed_id'})
    .dropna()
    .assign(pubmed_id=lambda frame: frame['pubmed_id'].str.replace(r'\s+', '', regex=True))
    .loc[lambda frame: frame['pubmed_id'].ne('')]
    .drop_duplicates()
    .reset_index(drop=True)
)

# look at the first few rows
genccGene2PubMed.head(50)

In [79]:
# for each gene find the earliest year of publication
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

# NCBI credentials are kept in plain-text files outside the notebook so they
# are never written into the notebook itself
def _read_credential(path):
    """Return the contents of the text file at `path` with surrounding whitespace removed."""
    with open(path, 'r') as handle:
        return handle.read().strip()

api_key = _read_credential('../../api_keys/ncbi.txt')
email = _read_credential('../../api_keys/ncbi_email.txt')

def _post_eutils(endpoint, params, timeout=30):
    """POST `params` to an NCBI E-utilities endpoint and return the parsed XML root.

    Parameters
    ----------
    endpoint : str
        Name of the utility without its extension, e.g. 'esearch' or 'efetch'.
    params : dict
        Form parameters to send in the request body.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    xml.etree.ElementTree.Element
        Root element of the XML document returned by the server.
    """
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{endpoint}.fcgi"

    # encode the parameters so they can be sent as the POST body
    encoded_data = urllib.parse.urlencode(params).encode('utf-8')
    request = urllib.request.Request(url, data=encoded_data)

    # the context manager closes the HTTP response even if reading/parsing fails
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return ET.fromstring(response.read())


def _publication_year(pub_date):
    """Return the publication year of a `PubDate` element as an int, or None if absent."""
    year_text = pub_date.findtext('Year')
    if year_text is None:
        # Some records have no structured <Year>, only a free-text <MedlineDate>
        # such as "1998 Dec-1999 Jan"; use its leading four characters.
        year_text = pub_date.findtext('MedlineDate', default='').strip()[:4]
    year_text = year_text.strip()
    return int(year_text) if year_text.isdecimal() else None


def get_first_pubmed_year(pubmed_ids):
    """Return the earliest publication year among a set of PubMed records.

    The IDs are searched with ESearch (results kept on the NCBI history
    server), then the matching records are downloaded with EFetch and the
    year of every `PubDate` element is collected.

    Uses the notebook-level `api_key` and `email` variables for the requests.

    Parameters
    ----------
    pubmed_ids : str
        Comma-separated PubMed IDs, e.g. '12345,67890'. Blank entries and
        surrounding whitespace are ignored.

    Returns
    -------
    int
        The lowest publication year found.

    Raises
    ------
    ValueError
        If no IDs are supplied, if ESearch does not return a WebEnv/QueryKey,
        or if no publication year can be extracted from the fetched records.
    """
    # split the input and discard blanks so we never send a bare "[pmid]" term
    pmids = [pmid.strip() for pmid in pubmed_ids.split(',')]
    pmids = [pmid for pmid in pmids if pmid]
    if not pmids:
        raise ValueError('no PubMed IDs supplied')

    # tag each id with [pmid] and join with OR so that any of the ids matches
    pubmed_ids_query = ' OR '.join(f'{pmid}[pmid]' for pmid in pmids)

    # eSearch: run the query and store the result set on the history server
    esearch_xml = _post_eutils('esearch', {
        'db': 'pubmed',
        'term': pubmed_ids_query,
        'api_key': api_key,
        'email': email,
        'usehistory': 'y'
    })

    # WebEnv and QueryKey identify the stored result set for the follow-up request
    webenv = esearch_xml.findtext('WebEnv')
    query_key = esearch_xml.findtext('QueryKey')
    if not webenv or not query_key:
        raise ValueError(f'eSearch returned no WebEnv/QueryKey for PubMed IDs: {pubmed_ids}')

    # eFetch: download the full records for the stored result set
    efetch_xml = _post_eutils('efetch', {
        'db': 'pubmed',
        'query_key': query_key,
        'WebEnv': webenv,
        'api_key': api_key,
        'email': email
    })

    # collect the publication year of every PubDate in every article
    years = []
    for article in efetch_xml.findall('PubmedArticle'):
        for pub_date in article.iter('PubDate'):
            year = _publication_year(pub_date)
            if year is not None:
                years.append(year)

    if not years:
        raise ValueError(f'no publication year found for PubMed IDs: {pubmed_ids}')

    # return the lowest year
    return min(years)

# Look up the earliest publication year for the first 10 gene rows.
# head(10) simply yields fewer rows if the table is shorter than 10, whereas
# indexing with .loc[i] would raise a KeyError outside the try block.
for row in genccGene2PubMed.head(10).itertuples(index=False):
    gene = row.gene_symbol
    pubmed_ids = row.pubmed_id
    try:
        year = get_first_pubmed_year(pubmed_ids)
    except Exception as err:
        # Catch Exception rather than using a bare except so Ctrl-C / kernel
        # interrupts still stop the loop, and report why the lookup failed.
        print(f'{gene} - Error ({type(err).__name__}: {err})')
    else:
        print(f'{year} - {gene}')


# NB: this is again a slightly contrived example to show the E-utilities in action. There are faster ways to do this, e.g. by pulling all PMIDs at once and mapping them back to genes.
# You could try this!


2015 - A2ML1
2009 - BRAF
2009 - CBL
2005 - HRAS
2011 - KRAS
2015 - LZTR1
2011 - MAP2K1
2011 - MAP2K2
2017 - MRAS
2003 - NF1
