In [19]:
import numpy

In [2]:
import requests
from bs4 import BeautifulSoup
import html2text
import mygene
import json
import pickle
# Shared MyGene.info client; the querymany() calls in later cells go through it.
mg = mygene.MyGeneInfo()

In [3]:
# Fragments that rough_text_from_gene_name blanks out (each occurrence is
# replaced by a single space) before the summary text is whitespace-normalised.
# They are applied in list order.
parts_to_remove = [
    "##  Summary\n",
    "NEW",
    "Try the newGene table",
    "Try the newTranscript table",
    "**",
    "\nGo to the top of the page Help\n",
]

def rough_text_from_gene_name(gene_number):
    """Fetch an NCBI Gene page and return its summary section as plain text.

    Parameters
    ----------
    gene_number : str or int
        Identifier appended to ``https://www.ncbi.nlm.nih.gov/gene/``.

    Returns
    -------
    tuple
        ``(summary_text, soup)``.

        ``summary_text`` is the text of the ``rprt-section gene-summary`` div
        with every entry of ``parts_to_remove`` replaced by a space and all
        whitespace runs collapsed to single spaces. It is ``''`` when the
        request fails, the status code is not 200, or the div is missing.

        ``soup`` is the parsed ``BeautifulSoup`` document, or ``None`` when
        no page was parsed.

    Notes
    -----
    Failures are reported with ``print`` and never raised, so a long scraping
    loop keeps going after a bad gene or a transient network error.
    """
    url = f"https://www.ncbi.nlm.nih.gov/gene/{gene_number}"
    summary_text = ''
    soup = None

    try:
        response = requests.get(url, timeout=30)
    except requests.exceptions.Timeout:
        print('time out')
        return summary_text, soup
    except requests.exceptions.RequestException as error:
        # Connection resets, DNS failures, etc. would otherwise propagate and
        # abort the caller's loop; treat them like any other failed fetch.
        print(f"Request failed: {error}")
        return summary_text, soup

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return summary_text, soup

    soup = BeautifulSoup(response.content, 'html.parser')

    # The summary lives in a div identified by this exact class string.
    summary_tab = soup.find('div', {'class': 'rprt-section gene-summary'})
    if not summary_tab:
        print("Summary tab not found on the page.")
        return summary_text, soup

    html_to_text = html2text.HTML2Text()
    html_to_text.ignore_links = True  # keep link text, drop the URLs

    summary_text = html_to_text.handle(str(summary_tab))
    for part in parts_to_remove:
        summary_text = summary_text.replace(part, ' ')

    # split() with no argument splits on any whitespace (newlines included),
    # so this both removes line breaks and collapses repeated spaces.
    summary_text = ' '.join(summary_text.split())
    return summary_text, soup

In [4]:
# Using gene CD24 as an example: query MyGene.info for the symbol, restricted to human.
cd_24_name = mg.querymany('CD24', scopes='symbol', species='human')

querying 1-1...done.
Finished.


In [42]:
# Show the raw hits returned for the CD24 query.
cd_24_name

[{'query': 'CD24',
  '_id': '100133941',
  '_score': 19.221214,
  'entrezgene': '100133941',
  'name': 'CD24 molecule',
  'symbol': 'CD24',
  'taxid': 9606}]

In [5]:
# Map each hit's gene symbol to the `_id` MyGene.info reported for it.
# (Despite the variable name, the value is the hit's `_id`, which is what the
# NCBI Gene URL is built from later, not a taxonomy id.)
gene_name_to_tax_id = {}
for result in cd_24_name:
    # Guard on the keys that are actually read below: the original only
    # checked `_id` and `query`, so a hit without `symbol` raised KeyError.
    if all(key in result for key in ("_id", "query", "symbol")):
        # A later hit with the same symbol overwrites an earlier one.
        gene_name_to_tax_id[result['symbol']] = result['_id']

In [6]:
# Show the resulting symbol -> `_id` mapping.
gene_name_to_tax_id

{'CD24': '100133941'}

In [8]:
# Read the vocabulary mapping from vocab.json and keep its keys (the gene
# names) as a list, in file order.
with open('vocab.json', 'rb') as handle:
    vocab_gene = json.load(handle)
vocab_gene_list = list(vocab_gene)

In [9]:
# Cache of gene name -> summary text; filled by the fetch loop in the next cell.
gene_name_to_summary_page = {}

In [10]:
# Fetch the NCBI summary for every mapped gene, in sorted name order. Genes
# that already have a cache entry are skipped, so re-running the cell only
# requests the missing ones.
for gene_name, page_id in sorted(gene_name_to_tax_id.items()):
    if gene_name in gene_name_to_summary_page:
        continue
    print('gene_name', gene_name)
    # Only the cleaned text is stored; the parsed page is not kept.
    parsed_text, unparsed_html = rough_text_from_gene_name(page_id)
    gene_name_to_summary_page[gene_name] = parsed_text

gene_name CD24


In [11]:
# Show the fetched summary text for the example gene.
gene_name_to_summary_page

{'CD24': 'Official Symbol CD24provided by HGNC Official Full Name CD24 moleculeprovided by HGNC Primary source HGNC:HGNC:1645 See related Ensembl:ENSG00000272398 MIM:600074; AllianceGenome:HGNC:1645 Gene type protein coding RefSeq status REVIEWED Organism Homo sapiens Lineage Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; Homo Also known as CD24A Summary This gene encodes a sialoglycoprotein that is expressed on mature granulocytes and B cells and modulates growth and differentiation signals to these cells. The precursor protein is cleaved to a short 32 amino acid mature peptide which is anchored via a glycosyl phosphatidylinositol (GPI) link to the cell surface. This gene was missing from previous genome assemblies, but is properly located on chromosome 6. Non-transcribed pseudogenes have been designated on chromosomes 1, 15, 20, and Y. Alternative splicing results in multiple transc

### Experiment with Varied Gene Sets Tailored to Your Needs
#### For any gene name, `mygene` can be used to translate it into the ID of its NCBI Gene page. The example below uses the gene vocabularies of scGPT and Geneformer. Download links for these files are available in the repository's README.

In [20]:
# Load the gene-name vocabulary from vocab.json.
# NOTE(review): the original comment called these "genes used in GenePT",
# while the markdown above mentions scGPT; confirm which one vocab.json is.
with open("vocab.json", 'rb') as handle:
    vocab_gene = json.load(handle)
vocab_gene_list = list(vocab_gene)

# Load the Geneformer token dictionary.
# Only unpickle files from a source you trust: pickle.load can execute
# arbitrary code while loading.
with open("token_dictionary.pkl", 'rb') as handle:
    token_dictionary = pickle.load(handle)

# Example queries that turn vocabulary entries into MyGene.info hits; the
# `_id` of a hit is what the NCBI page URL is built from.
# First query: entries are matched as human gene symbols.
vocab_gene_list_results = mg.querymany(
    sorted(vocab_gene_list),
    scopes='symbol',
    species='human',
)
# Second query: no `scopes`/`species` are passed, so mygene's defaults apply;
# only the `symbol` field is requested for each hit.
token_dictionary_results = mg.querymany(
    sorted(token_dictionary),
    fields="symbol",
)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
queryin

In [24]:
# Inspect a single hit from the token_dictionary query.
token_dictionary_results[10]

{'query': 'ENSG00000001084',
 '_id': '2729',
 '_score': 26.072834,
 'symbol': 'GCLC'}

In [37]:
# NOTE(review): this displays the entire list; a slice such as
# vocab_gene_list[:10] would keep the saved output short.
vocab_gene_list

['RP5-973N23.5',
 'RP11-182N22.10',
 'CTB-53D8.3',
 'RP11-348N17.2',
 'RP11-205M20.8',
 'RP11-326C3.17',
 'RP11-439H13.3',
 'RP11-413H22.3',
 'GET1-SH3BGR',
 'CH17-476P10.1',
 'LLNLR-271E8.1',
 'CTD-2527I21.16',
 'CTC-359D24.5',
 'RP5-967N21.13',
 'CTC-264K15.5',
 'RP11-314O13.2',
 'RP11-127I20.9',
 'RP11-473M10.4',
 'OR7E11P_ENSG00000285537',
 'RP11-867O8.9',
 'TMX2-CTNND1',
 'RP11-732A19.10',
 'RP11-545E17.13',
 'RP11-1C8.8',
 'RP11-346I3.6',
 'GS1-273L24.6',
 'AL135749.6',
 'RP11-641J8.4',
 'RP11-478H11.3-001',
 'RP5-1111F22.1',
 'CTB-161K23.4',
 'RNF216P1_ENSG00000288620',
 'RP11-350J20.15',
 'XXbac-BPG254B15.11',
 'PDCD6-AHRR',
 'RP11-115L11.3',
 'RP11-553D4.3',
 'RP11-1020A11.3',
 'RP11-316O14.3',
 'AC012488.3',
 'RP13-511M20.1',
 'RP11-179A20.1',
 'RP4-797M17.2',
 'RP11-422P24.15',
 'CH17-159N18.5',
 'RP11-91A18.6',
 'RP11-426C22.12',
 'RP11-470L19.7',
 'LA16c-407A10.3',
 'DUX4L8',
 'DUX4L3',
 'RP4-568B10.1',
 'RP11-583F2.7',
 'RP11-147K16.3',
 'AP000892.6',
 'CTD-2001J20.1',
 '

In [41]:
# Check which keys the first result carries (an unmatched query has no `_id` or `symbol`).
vocab_gene_list_results[0].keys()

dict_keys(['query', 'notfound'])

In [29]:
# Map each hit's gene symbol to the `_id` MyGene.info reported for it, for the
# whole vocabulary query. (Despite the variable name, the value is the hit's
# `_id`, not a taxonomy id.)
# NOTE(review): some `_id` values are Ensembl-style ids rather than numeric
# ones; these appear to be the entries that later fail with 404 on NCBI.
# TODO confirm and decide whether to filter them here.
gene_name_to_tax_id_all = {}
for result in vocab_gene_list_results:
    # Guard on the keys that are actually read below: the original only
    # checked `_id` and `query`, so a hit without `symbol` raised KeyError.
    # Unmatched queries carry neither `_id` nor `symbol` and are skipped.
    if all(key in result for key in ("_id", "query", "symbol")):
        # A later hit with the same symbol overwrites an earlier one.
        gene_name_to_tax_id_all[result['symbol']] = result['_id']

In [31]:
# NOTE(review): the author found this count suspiciously high; TODO investigate
# (the dict is keyed by the `symbol` of every hit, not by the queried name).
len(gene_name_to_tax_id_all)

38850

In [35]:
# NOTE(review): this displays the entire mapping; consider showing only a few
# items to keep the saved output short.
gene_name_to_tax_id_all

{'A1BG': '1',
 'A1BG-AS1': '503538',
 'A1CF': '29974',
 'A2M': '2',
 'A2M-AS1': '144571',
 'A2ML1': '144568',
 'A2ML1-AS1': '100874108',
 'A2ML1-AS2': '106478979',
 'A2MP1': 'ENSG00000256069',
 'A3GALT2': '127550',
 'A4GALT': '53947',
 'A4GNT': '51146',
 'AA06': '100506677',
 'AAAS': '8086',
 'AACS': '65985',
 'AACSP1': 'ENSG00000250420',
 'AADAC': '13',
 'AADACL2': '344752',
 'AADACL2-AS1': '101928142',
 'AADACL3': '126767',
 'AADACL4': '343066',
 'AADACP1': 'ENSG00000240602',
 'AADAT': '51166',
 'AAGAB': '79719',
 'AAK1': '22848',
 'AAMDC': '28971',
 'AAMP': '14',
 'AANAT': '15',
 'AAR2': '25980',
 'AARD': '441376',
 'AARS1': '16',
 'AARS1P1': '106480683',
 'AARS2': '57505',
 'AARSD1': '80755',
 'AARSD1P1': '117981788',
 'AASDH': '132949',
 'AASDHPPT': '60496',
 'AASS': '10157',
 'AATBC': '284837',
 'AATF': '26574',
 'AATK': '9625',
 'ABALON': '103021294',
 'ABAT': '18',
 'ABCA1': '19',
 'ABCA10': '10349',
 'ABCA11P': '79963',
 'ABCA12': '26154',
 'ABCA13': '154664',
 'ABCA17P': '650

In [43]:
# Cache of gene name -> summary text for the full vocabulary.
gene_name_to_summary_page_all = {}

In [47]:
from tqdm import tqdm

ModuleNotFoundError: No module named 'tqdm'

In [48]:
# Number of summaries fetched so far.
len(gene_name_to_summary_page_all)

90

In [49]:
from tqdm import tqdm

In [46]:
!pip install tqdm 



In [73]:
# Start again from an empty cache.
# NOTE: re-running this cell discards every summary fetched so far.
gene_name_to_summary_page_all = {}

In [74]:
# Fetch the NCBI summary for every mapped gene, in sorted name order, with a
# progress bar. Genes already in the cache are skipped, so an interrupted run
# can be resumed by re-running this cell (without resetting the cache).
for gene_name, page_id in tqdm(sorted(gene_name_to_tax_id_all.items())):
    if gene_name in gene_name_to_summary_page_all:
        continue
    # Only the cleaned text is stored; the parsed page is not kept.
    parsed_text, unparsed_html = rough_text_from_gene_name(page_id)
    gene_name_to_summary_page_all[gene_name] = parsed_text

  0%|                                      | 9/38850 [00:18<17:28:51,  1.62s/it]

Failed to retrieve the webpage. Status code: 404


  0%|                                     | 16/38850 [00:31<17:23:11,  1.61s/it]

Failed to retrieve the webpage. Status code: 404


  0%|                                     | 22/38850 [00:43<18:46:36,  1.74s/it]

Failed to retrieve the webpage. Status code: 404


  0%|                                     | 75/38850 [02:47<23:13:26,  2.16s/it]

Failed to retrieve the webpage. Status code: 404


  0%|                                     | 82/38850 [03:04<25:09:51,  2.34s/it]

Failed to retrieve the webpage. Status code: 404


  0%|                                     | 83/38850 [03:05<20:22:15,  1.89s/it]

Failed to retrieve the webpage. Status code: 404


  1%|▎                                   | 365/38850 [13:49<17:41:53,  1.66s/it]

Failed to retrieve the webpage. Status code: 404


  1%|▎                                   | 381/38850 [14:21<18:36:48,  1.74s/it]

Failed to retrieve the webpage. Status code: 404


  1%|▍                                   | 432/38850 [16:43<22:52:55,  2.14s/it]

Failed to retrieve the webpage. Status code: 404


  2%|▊                                   | 846/38850 [31:18<15:44:16,  1.49s/it]

Failed to retrieve the webpage. Status code: 404


  2%|▊                                   | 851/38850 [31:26<14:49:18,  1.40s/it]

Failed to retrieve the webpage. Status code: 404


  3%|▉                                  | 1003/38850 [37:35<16:47:12,  1.60s/it]

Failed to retrieve the webpage. Status code: 404


  3%|▉                                  | 1040/38850 [38:47<24:50:02,  2.36s/it]

Failed to retrieve the webpage. Status code: 404


  3%|▉                                  | 1104/38850 [40:50<12:50:35,  1.22s/it]

Failed to retrieve the webpage. Status code: 404


  3%|█                                  | 1135/38850 [41:53<21:18:48,  2.03s/it]

Failed to retrieve the webpage. Status code: 404


  3%|█                                  | 1202/38850 [44:28<23:13:10,  2.22s/it]


KeyboardInterrupt: 

In [67]:
# Number of summaries in the cache at this point.
len(gene_name_to_summary_page_all)

152

In [65]:
import pickle

In [68]:
# Save the summaries collected so far to disk, using the highest pickle
# protocol the running Python supports.
with open('small_gene_name_to_summary_page.pickle', 'wb') as handle:
    pickle.dump(gene_name_to_summary_page_all, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [69]:
# Read the file back to check the round trip.
# Only unpickle files from a source you trust: pickle.load can execute
# arbitrary code while loading.
with open('small_gene_name_to_summary_page.pickle', 'rb') as handle:
    b = pickle.load(handle)


In [71]:
# Entry count of the reloaded dict, for comparison with the in-memory one.
len(b)

152

### Look at just 3K mye genes

In [91]:
# Load the gene names listed in mye_genes.txt as an array of strings.
# The notebook imports the module as `numpy`; the alias `np` is never defined,
# so the original `np.loadtxt` raised NameError on a fresh kernel.
mye_gene = numpy.loadtxt("mye_genes.txt", dtype=str)
len(mye_gene)

3000

In [92]:
# Restrict the symbol -> `_id` mapping to the genes listed in `mye_gene`;
# names that have no entry in the mapping are dropped.
mye_to_tax_id = {
    g: gene_name_to_tax_id_all[g]
    for g in mye_gene
    if g in gene_name_to_tax_id_all
}
len(mye_to_tax_id)

2707

In [93]:
# NOTE(review): this displays the entire mapping; consider showing only a few
# items to keep the saved output short.
mye_to_tax_id

{'HES4': '57801',
 'ISG15': '9636',
 'AGRN': '375790',
 'TNFRSF18': '8784',
 'TNFRSF4': '7293',
 'CDK11B': '984',
 'NADK': '65220',
 'SKI': '6497',
 'ACOT7': '11332',
 'TNFRSF25': '8718',
 'KLHL21': '9903',
 'PER3': '8863',
 'UTS2': '10911',
 'TNFRSF9': '3604',
 'ERRFI1': '54206',
 'RERE': '473',
 'SLC2A5': '6518',
 'GPR157': '80045',
 'SPSB1': '80176',
 'RBP7': '116362',
 'KIF1B': '23095',
 'AGTRAP': '57085',
 'MTHFR': '4524',
 'MIIP': '60672',
 'TNFRSF1B': '7133',
 'DHRS3': '9249',
 'PRDM2': '7799',
 'TMEM51': '55092',
 'CASP9': '842',
 'PLEKHM2': '23207',
 'SLC25A34': '284723',
 'NBPF1': '55672',
 'ATP13A2': '23400',
 'PADI2': '11240',
 'PADI4': '23569',
 'ARHGEF10L': '55160',
 'UBR4': '23352',
 'NBL1': '4681',
 'CDA': '978',
 'ECE1': '1889',
 'C1QA': '712',
 'C1QC': '714',
 'C1QB': '713',
 'ID3': '3399',
 'FUCA1': '2517',
 'RUNX3': '864',
 'STMN1': '3925',
 'PDIK1L': '149420',
 'ZNF593': '51042',
 'SH3BGRL3': '83442',
 'CD52': '1043',
 'HMGN2': '3151',
 'SLC9A1': '6548',
 'SYTL1': 

In [94]:
# Cache of gene name -> summary text for the genes in `mye_to_tax_id`.
mye_gene_to_summary = {}

In [95]:
# NOTE(review): this displays every cached summary in full; consider showing
# a single entry to keep the saved output short.
gene_name_to_summary_page_all

{'A1BG': 'Official Symbol A1BGprovided by HGNC Official Full Name alpha-1-B glycoproteinprovided by HGNC Primary source HGNC:HGNC:5 See related Ensembl:ENSG00000121410 MIM:138670; AllianceGenome:HGNC:5 Gene type protein coding RefSeq status REVIEWED Organism Homo sapiens Lineage Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; Homo Also known as A1B; ABG; GAB; HYST2477 Summary The protein encoded by this gene is a plasma glycoprotein of unknown function. The protein shows sequence similarity to the variable regions of some immunoglobulin supergene family member proteins. [provided by RefSeq, Jul 2008] Orthologs mouse all',
 'A1BG-AS1': 'Official Symbol A1BG-AS1provided by HGNC Official Full Name A1BG antisense RNA 1provided by HGNC Primary source HGNC:HGNC:37133 See related Ensembl:ENSG00000268895 AllianceGenome:HGNC:37133 Gene type ncRNA RefSeq status PREDICTED Organism Homo sapiens Li

In [96]:
# Fetch the NCBI summary for every gene in `mye_to_tax_id`, in sorted name
# order, with a progress bar. Genes already in the cache are skipped, so an
# interrupted run can be resumed by re-running this cell.
for gene_name, page_id in tqdm(sorted(mye_to_tax_id.items())):
    if gene_name in mye_gene_to_summary:
        continue
    # Only the cleaned text is stored; the parsed page is not kept.
    parsed_text, unparsed_html = rough_text_from_gene_name(page_id)
    mye_gene_to_summary[gene_name] = parsed_text

 91%|█████████████████████████████████▋   | 2463/2707 [1:45:02<10:23,  2.56s/it]

Failed to retrieve the webpage. Status code: 404


100%|█████████████████████████████████████| 2707/2707 [1:56:15<00:00,  2.58s/it]


In [97]:
# Save the fetched summaries to disk, using the highest pickle protocol the
# running Python supports.
with open('mye_gene_name_to_summary_page.pickle', 'wb') as handle:
    pickle.dump(mye_gene_to_summary, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [99]:
# Number of genes with a cache entry (failed fetches are stored as '').
len(mye_gene_to_summary)

2707