# Expanding collision acronyms
One problem for Aim 1 in Anastasia's dissertation is that once a gene-alias collision pair has been identified, we have to determine what the collision symbol actually represents in the context of the parent symbol. For example, CAP is listed as an alias for BRD4, but the same symbol is also listed as an alias for many other genes. In the context of BRD4, CAP refers to 'chromosome associated protein'; what CAP stands for differs from one parent gene symbol to the next. While this can be manually curated for a small set of genes, there are over 100,000 gene-alias pairs to consider, so a programmatic approach will be needed. Additionally, a separate but related problem will be to programmatically identify the type of collision(?).

## Scratch Testing

In [102]:
import requests
from tqdm.notebook import tqdm
# Look up the primary gene name for every UniProt accession in
# gene_symbols['UniProt ID'] and store the results in gene_symbols['symbol'].
# NOTE(review): `gene_symbols` is not defined in this cell; it has to come from
# an earlier cell (the recorded run of this cell failed with NameError).
symbols = []

for uniprot in tqdm(gene_symbols['UniProt ID']):
    r = requests.get(f'https://rest.uniprot.org/uniprotkb/{uniprot}')
    if r.status_code == 200:
        try:
            symbol = r.json()['genes'][0]['geneName']['value']
        except (KeyError, IndexError, TypeError, ValueError):
            # Entry exists but carries no usable gene name (or the body was not JSON).
            symbol = 'Error'
    else:
        # Accession could not be retrieved at all.
        symbol = None
    symbols.append(symbol)

# Assign the whole column at once instead of writing through chained indexing
# (gene_symbols['symbol'][i] = ...), which pandas does not guarantee to write
# back to the original frame and which silently depends on the index labels.
gene_symbols['symbol'] = symbols

gene_symbols


NameError: name 'gene_symbols' is not defined

In [None]:
import requests
# Fetch one UniProtKB entry as JSON so its structure can be inspected by eye.
uniprot = 'Q53GG5'

r = requests.get(f'https://rest.uniprot.org/uniprotkb/{uniprot}?format=json')

# Last expression of the cell: the decoded JSON payload.
r.json()

{'entryType': 'UniProtKB reviewed (Swiss-Prot)',
 'primaryAccession': 'Q53GG5',
 'secondaryAccessions': ['B2R866',
  'O43590',
  'O60439',
  'O60440',
  'Q8N6Y6',
  'Q9BVP4'],
 'uniProtkbId': 'PDLI3_HUMAN',
 'entryAudit': {'firstPublicDate': '2005-12-06',
  'lastAnnotationUpdateDate': '2024-07-24',
  'lastSequenceUpdateDate': '2005-05-24',
  'entryVersion': 164,
  'sequenceVersion': 1},
 'annotationScore': 5.0,
 'organism': {'scientificName': 'Homo sapiens',
  'commonName': 'Human',
  'taxonId': 9606,
  'lineage': ['Eukaryota',
   'Metazoa',
   'Chordata',
   'Craniata',
   'Vertebrata',
   'Euteleostomi',
   'Mammalia',
   'Eutheria',
   'Euarchontoglires',
   'Primates',
   'Haplorrhini',
   'Catarrhini',
   'Hominidae',
   'Homo']},
 'proteinExistence': '1: Evidence at protein level',
 'proteinDescription': {'recommendedName': {'fullName': {'value': 'PDZ and LIM domain protein 3'}},
  'alternativeNames': [{'fullName': {'value': 'Actinin-associated LIM protein'}},
   {'fullName': {'v

In [None]:
# ['references']['id'] Abstract searching for text?
# ['alternativeNames']['fullName'] Regex matching text?

In [None]:
# For gene symbol:
#   Grab Uniprot ID (from gene normalizer?)
# Query UniProt
#   For AlternativeName: 
#       Grab FullName
#           Regex/Rule Match 1 -- Length of symbol + First letter of terms check
#           Regex/Rule Match 2 -- First letter of terms check, but include n-1 examples
#           Regex/Rule Match n -- ???
#               if potentialMatch:
#                   possible_expansions.append(match)
#   df['gene symbol'][position] = possible_expansions
#
#   For References['ID']:
#       Query Pubmed
#           Grab Abstract
#           Create n-length pairs of words (n = length of symbol)
#           For n-length pairs of words:
#               Regex/Rule Match 1 -- Length of Symbol + First letter of terms check
#               Regex/Rule Match 2 -- First letter of terms check, but include n-1 examples
#               Regex/Rule Match 3 -- ???
#                   if potentialMatch:
#                       possible_expansions.append(match)
#   df['gene symbol'][position] = possible_expansions


### Grab Uniprot ID from Gene Normalizer

In [None]:
import requests

gene = 'BRD4' # CAP
query = f'https://normalize.cancervariants.org/gene/normalize?q={gene}'
r = requests.get(query)

# Find the UniProt accession by extension name and 'uniprot:' prefix rather
# than by fixed list positions, which only held for this one gene's response.
# `uniprot_id` is None when no 'uniprot:' cross-reference is present.
uniprot_id = next(
    (
        xref.replace('uniprot:', '')
        for extension in r.json()['gene_descriptor']['extensions']
        if extension['name'] == 'associated_with'
        for xref in extension['value']
        if xref.startswith('uniprot:')
    ),
    None,
)
uniprot_id

r.json()['gene_descriptor']['extensions']

[{'type': 'Extension', 'name': 'symbol_status', 'value': 'approved'},
 {'type': 'Extension',
  'name': 'approved_name',
  'value': 'bromodomain containing 4'},
 {'type': 'Extension',
  'name': 'associated_with',
  'value': ['ucsc:uc002nar.4',
   'vega:OTTHUMG00000183252',
   'iuphar:1945',
   'orphanet:449868',
   'refseq:NM_058243',
   'cosmic:BRD4',
   'uniprot:O60885',
   'ccds:CCDS46004',
   'ccds:CCDS82307',
   'ena.embl:Y12059',
   'omim:608749',
   'pubmed:10938129',
   'ccds:CCDS12328']},
 {'type': 'Extension', 'name': 'previous_symbols', 'value': ['LOC90616']},
 {'type': 'Extension', 'name': 'strand', 'value': '-'},
 {'type': 'Extension',
  'name': 'hgnc_locations',
  'value': [{'_id': 'ga4gh:VCL.OnjZ5UPScFNAI_iMhJVyrD9JQ70HmeJ0',
    'type': 'ChromosomeLocation',
    'species_id': 'taxonomy:9606',
    'chr': '19',
    'interval': {'type': 'CytobandInterval',
     'start': 'p13.12',
     'end': 'p13.12'}}]},
 {'type': 'Extension',
  'name': 'ensembl_locations',
  'value': [{'_

In [None]:
# Show every 'uniprot:' cross-reference listed under the 'associated_with'
# extension of the normalizer response held in `r`.
# The hits go into their own name so `uniprot_id` is not rebound to a list of
# prefixed strings: the UniProt query cell interpolates `uniprot_id` straight
# into a URL and needs a bare accession string.
for extension in r.json()['gene_descriptor']['extensions']:
    if extension['name'] == 'associated_with':
        uniprot_xrefs = [value for value in extension['value'] if value.startswith('uniprot:')]
        print(uniprot_xrefs)
        # Stop at the matching extension; the original `pass` was a no-op.
        break

['uniprot:O60885']


### Query UniProt for Data

#### Check Alternative Names

In [None]:
# Fetch the UniProtKB entry for `uniprot_id` and look at its FIRST alternative
# name only (index 0); any further alternative names are not inspected here.
# NOTE(review): `uniprot_id` is interpolated directly into the URL, so it
# presumably has to be a bare accession string such as 'O60885' -- confirm it
# is not a list or a 'uniprot:'-prefixed value when this cell runs.
r = requests.get(f'https://rest.uniprot.org/uniprotkb/{uniprot_id}?format=json')
expanded_name = r.json()['proteinDescription']['alternativeNames'][0]['fullName']['value'] 
expanded_name # No Match

'Protein HUNK1'

#### Check PMIDs

In [None]:
# Collect the citation id of every reference attached to the UniProt entry in `r`.
# Not every id is a PubMed id: the recorded output also contains 'CI-...' ids.
pmids_to_check = [reference['citation']['id'] for reference in r.json()['references']]


pmids_to_check

['11733348',
 'CI-DMQH005K2JO7F',
 '15057824',
 'CI-5GBDQ6B103N1E',
 '15489334',
 '12543779',
 '16109376',
 '16109377',
 '17081983',
 '16940503',
 '17189189',
 '18922874',
 '18669648',
 '19413330',
 '19596240',
 '19103749',
 '19608861',
 '20068231',
 '21269460',
 '21555454',
 '21890894',
 '21406692',
 '22334664',
 '23086925',
 '22509028',
 '24360279',
 '23186163',
 '23317504',
 '23589332',
 '23728299',
 '24275569',
 '25218447',
 '25114211',
 '25772364',
 '25593309',
 '25755297',
 '28112733',
 '29374058',
 '31969702',
 '17344846',
 '18500820',
 '20871596',
 '21068722',
 '21568322',
 '21964340',
 '22137933',
 '22645123',
 '22136404',
 '23095041',
 '22464331',
 '23517011',
 '23530754',
 '29176719',
 '32193360',
 '29379197',
 '29440723',
 '31168063',
 '35470444']

In [None]:
# Thanks senor GPT
def get_pubmed_abstract(pmid, api_key='YOUR_API_KEY'):
    """Fetch the abstract for a given PMID from PubMed.

    Args:
        pmid: PubMed identifier to look up via the E-utilities ``efetch`` endpoint.
        api_key: Optional NCBI API key. The default placeholder (or any falsy
            value) means "no key" and nothing is sent.

    Returns:
        The abstract text with all ``AbstractText`` sections joined by a single
        space, ``"No abstract found."`` when the record has no abstract text,
        or ``"Error: <status>"`` when the HTTP status is not 200.
    """
    from xml.etree import ElementTree

    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        'db': 'pubmed',
        'id': pmid,
        'retmode': 'xml',
    }
    # Forward the key only when the caller supplied a real one.
    if api_key and api_key != 'YOUR_API_KEY':
        params['api_key'] = api_key

    response = requests.get(url, params=params, timeout=30)

    if response.status_code != 200:
        return f"Error: {response.status_code}"

    # Parse the raw bytes so the XML declaration decides the encoding.
    root = ElementTree.fromstring(response.content)

    # Structured abstracts are split over several AbstractText elements, and a
    # section may contain inline markup; `.text` alone would return only the
    # first section, cut off at the first nested tag (or None).
    sections = [
        ''.join(section.itertext()).strip()
        for section in root.findall('.//Abstract/AbstractText')
    ]
    abstract = ' '.join(section for section in sections if section)
    return abstract if abstract else "No abstract found."



In [None]:
import time
from tqdm import tqdm

# Download one abstract per PubMed id, producing a list of {pmid: abstract} dicts.
abstracts_to_check = []
for pmid in tqdm(pmids_to_check):
    # Ids starting with 'CI' are skipped rather than sent to PubMed.
    if pmid.startswith('CI'):
        continue
    # Pause between requests before each PubMed call.
    time.sleep(1)
    abstracts_to_check.append({pmid: get_pubmed_abstract(pmid)})

len(abstracts_to_check)

100%|██████████| 58/58 [01:14<00:00,  1.28s/it]


56

In [None]:
abstracts_to_check

[{'11733348': 'Translocation t(15;19)(q13;p13.1) defines a lethal midline carcinoma arising adjacent to respiratory tract in young people. To characterize molecular alterations responsible for the distinctly aggressive biological behavior of this cancer, we mapped the chromosome 15 and 19 translocation breakpoints by fluorescence in situ hybridization (FISH) and Southern blotting. To evaluate preliminarily the frequency, anatomical distribution, and histological features of t(15;19) cancer, we developed a FISH assay for paraffin sections. Our findings reveal a novel oncogenic mechanism in which the chromosome 19 translocation breakpoint interrupts the coding sequence of a bromodomain gene, BRD4. These studies implicate BRD4 as a potential partner in a t(15;19)-associated fusion oncogene. In addition, we localized the chromosome 15 breakpoint to a 9-kb region in each of two cases, thereby identifying several candidate oncogenes which might represent the BRD4 fusion partner. FISH evaluat

In [None]:
# Run the first-letter matcher over every downloaded abstract for one symbol.
# Result: a list of {pmid: [matching word groups]} dicts, one per abstract entry.
gene = 'CAP'
possible_matches = [
    {
        pmid: find_matching_groups(abstract_text, gene)
        for pmid, abstract_text in abstract_dict.items()
    }
    for abstract_dict in abstracts_to_check
]

possible_matches
        

[{'11733348': []},
 {'15057824': []},
 {'15489334': []},
 {'12543779': []},
 {'16109376': []},
 {'16109377': ['complex, a potential']},
 {'17081983': []},
 {'16940503': []},
 {'17189189': []},
 {'18922874': []},
 {'18669648': []},
 {'19413330': []},
 {'19596240': []},
 {'19103749': []},
 {'19608861': []},
 {'20068231': []},
 {'21269460': ['Composition and properties']},
 {'21555454': []},
 {'21890894': []},
 {'21406692': []},
 {'22334664': []},
 {'23086925': []},
 {'22509028': []},
 {'24360279': []},
 {'23186163': []},
 {'23317504': []},
 {'23589332': []},
 {'23728299': []},
 {'24275569': []},
 {'25218447': []},
 {'25114211': []},
 {'25772364': []},
 {'25593309': []},
 {'25755297': []},
 {'28112733': []},
 {'29374058': []},
 {'31969702': []},
 {'17344846': []},
 {'18500820': []},
 {'20871596': []},
 {'21068722': []},
 {'21568322': []},
 {'21964340': []},
 {'22137933': []},
 {'22645123': []},
 {'22136404': []},
 {'23095041': []},
 {'22464331': []},
 {'23517011': []},
 {'23530754': []},


In [None]:
def find_matching_groups_og(abstract, gene_symbol):
    """Find runs of consecutive words whose initials spell ``gene_symbol``.

    Hyphens are treated as word separators and the abstract is split on
    whitespace. Every window of ``len(gene_symbol)`` consecutive words is
    compared, case-insensitively, first letter against symbol letter.

    Args:
        abstract: Text to scan.
        gene_symbol: Symbol whose letters must match the word initials.

    Returns:
        A list of matching word groups, each joined by single spaces, in order
        of appearance. Empty when nothing matches or ``gene_symbol`` is empty.
    """
    # With an empty symbol every zero-width window would vacuously "match" and
    # the result would be a list of empty strings.
    if not gene_symbol:
        return []

    words = abstract.replace('-', ' ').split()
    gene_length = len(gene_symbol)
    matching_groups = []

    # Slide a window of gene_length words over the text.
    for i in range(len(words) - gene_length + 1):
        group = words[i:i + gene_length]
        if all(
            word[0].lower() == letter.lower()
            for word, letter in zip(group, gene_symbol)
        ):
            matching_groups.append(' '.join(group))

    return matching_groups

In [None]:
# Known-positive example: an abstract that spells out "chromosome-associated
# protein", used to check that the matcher finds the expansion for 'CAP'.
test_abstract = 'We describe a novel nuclear factor called mitotic chromosome-associated protein (MCAP), which belongs to the poorly understood BET subgroup of the bromodomain superfamily. Expression of the 200-kDa MCAP was linked to cell division, as it was induced by growth stimulation and repressed by growth inhibition. The most notable feature of MCAP was its association with chromosomes during mitosis, observed at a time when the majority of nuclear regulatory factors were released into the cytoplasm, coinciding with global cessation of transcription. Indicative of its predominant interaction with euchromatin, MCAP localized on mitotic chromosomes with exquisite specificity: (i) MCAP-chromosome association became evident subsequent to the initiation of histone H3 phosphorylation and early chromosomal condensation; and (ii) MCAP was absent from centromeres, the sites of heterochromatin. Supporting a role for MCAP in G(2)/M transition, microinjection of anti-MCAP antibody into HeLa cell nuclei completely inhibited the entry into mitosis, without abrogating the ongoing DNA replication. These results suggest that MCAP plays a role in a process governing chromosomal dynamics during mitosis.'

gene_symbol = 'CAP'

# NOTE(review): find_matching_groups is defined in a later cell of this
# notebook, so that cell has to be executed before this one.
find_matching_groups(test_abstract,gene_symbol)


['chromosome associated protein']

In [None]:
# NOTE(review): unfinished scratch cell. The loop body only creates an empty
# dict per abstract; nothing is matched and nothing is appended, so
# `possible_matches` stays an empty list after this cell runs.
possible_matches = []
for abstract in abstracts_to_check:
    possible_match = {}

['Translocation t(15;19)(q13;p13.1) defines a lethal midline carcinoma arising adjacent to respiratory tract in young people. To characterize molecular alterations responsible for the distinctly aggressive biological behavior of this cancer, we mapped the chromosome 15 and 19 translocation breakpoints by fluorescence in situ hybridization (FISH) and Southern blotting. To evaluate preliminarily the frequency, anatomical distribution, and histological features of t(15;19) cancer, we developed a FISH assay for paraffin sections. Our findings reveal a novel oncogenic mechanism in which the chromosome 19 translocation breakpoint interrupts the coding sequence of a bromodomain gene, BRD4. These studies implicate BRD4 as a potential partner in a t(15;19)-associated fusion oncogene. In addition, we localized the chromosome 15 breakpoint to a 9-kb region in each of two cases, thereby identifying several candidate oncogenes which might represent the BRD4 fusion partner. FISH evaluation of 13 ped

In [None]:
import spacy

# Loaded spaCy pipelines, keyed by model name, so each model is loaded once.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def find_matching_groups_spacy(abstract, gene_symbol):
    """Find runs of consecutive tokens whose initials spell ``gene_symbol``.

    The abstract is tokenized with the ``en_core_web_sm`` spaCy pipeline;
    punctuation and whitespace tokens are dropped. Every window of
    ``len(gene_symbol)`` consecutive remaining tokens is compared,
    case-insensitively, first letter against symbol letter.

    Args:
        abstract: Text to scan.
        gene_symbol: Symbol whose letters must match the token initials.

    Returns:
        A list of matching token groups, each joined by single spaces, in
        order of appearance. Empty when nothing matches or ``gene_symbol`` is
        empty.
    """
    # With an empty symbol every zero-width window would vacuously "match".
    if not gene_symbol:
        return []

    # Reuse the cached pipeline: loading the model on every call dominates the
    # run time when many abstracts are checked.
    doc = _get_spacy_pipeline()(abstract)
    words = [token.text for token in doc if not token.is_punct and not token.is_space]

    gene_length = len(gene_symbol)
    matching_groups = []

    # Slide a window of gene_length tokens over the text.
    for i in range(len(words) - gene_length + 1):
        group = words[i:i + gene_length]
        if all(
            word[0].lower() == letter.lower()
            for word, letter in zip(group, gene_symbol)
        ):
            matching_groups.append(' '.join(group))

    return matching_groups

In [None]:
find_matching_groups_spacy(test_abstract,gene_symbol)

['chromosome associated protein']

## Test Run w/ Anastasia Gene Lists

Given some sample gene lists, grab a UniProt identifier for each parent gene and use it to query for the alternative names and PMIDs related to that gene. Once the alternative names and PMIDs have been retrieved, use each PMID to query PubMed for the publication abstract. Then use the abstract and spaCy to do a quick first-letter check against the collision symbol (i.e., do any three consecutive words in the abstract start with C, A, P to match CAP?). More sophisticated methods can be employed later to check for the correct expansion.

In [55]:
import spacy
from tqdm import tqdm
import pandas as pd

def grab_uniprot_id(gene_symbol):
    """Look up the UniProt accession(s) for a gene symbol via the gene normalizer.

    Args:
        gene_symbol: Gene symbol to normalize (e.g. ``'BRD4'``).

    Returns:
        A list of accessions with the ``uniprot:`` prefix removed, taken from
        the ``associated_with`` extension of the response (possibly empty), or
        ``None`` when the response has no such extension or does not have the
        expected structure. Every "nothing usable" result is falsy, so callers
        can test it with ``if not result``.
    """
    # Let requests build the query string so the symbol is URL-encoded.
    r = requests.get(
        'https://normalize.cancervariants.org/gene/normalize',
        params={'q': gene_symbol},
        timeout=30,
    )
    uniprot_id = None
    try:
        for extension in r.json()['gene_descriptor']['extensions']:
            if extension['name'] == 'associated_with':
                uniprot_id = [
                    value.replace('uniprot:', '')
                    for value in extension['value']
                    if value.startswith('uniprot:')
                ]
    except (KeyError, TypeError, ValueError, AttributeError):
        # Unexpected or non-JSON response. Return a falsy value instead of a
        # truthy sentinel string, which callers would index as if it were a
        # list of accessions.
        return None
    return uniprot_id

def uniprot_request(uniprot_id):
    """Request the UniProtKB entry for ``uniprot_id`` as JSON.

    Returns the raw ``requests`` response; the status code is not checked and
    the body is not decoded here.
    """
    entry_url = f'https://rest.uniprot.org/uniprotkb/{uniprot_id}?format=json'
    return requests.get(entry_url)

def grab_alternative_names(uniprot_data):
    """Extract the alternative protein names from a UniProt response.

    Args:
        uniprot_data: Response-like object whose ``json()`` returns the
            decoded UniProtKB entry.

    Returns:
        A list with the ``fullName`` value of every entry under
        ``proteinDescription.alternativeNames``, or ``None`` when the entry
        has no alternative names or the payload lacks the expected structure.
    """
    try:
        alternative_names = uniprot_data.json()['proteinDescription']['alternativeNames']
        return [name['fullName']['value'] for name in alternative_names]
    except (KeyError, TypeError, ValueError):
        # Missing keys, an unexpected shape, or a body that is not JSON.
        # Anything else is a real error and is allowed to propagate.
        return None

def grab_pmids(uniprot_data):
    """Return the citation id of every reference in a UniProt response.

    ``uniprot_data`` must provide ``json()`` returning the decoded entry. The
    ids are returned in entry order and are not filtered, so callers have to
    drop any that are not PubMed ids themselves.
    """
    references = uniprot_data.json()['references']
    return [reference['citation']['id'] for reference in references]

def get_pubmed_abstract(pmid, api_key='YOUR_API_KEY'):
    """Fetch the abstract for a given PMID from PubMed.

    Args:
        pmid: PubMed identifier to look up via the E-utilities ``efetch`` endpoint.
        api_key: Optional NCBI API key. The default placeholder (or any falsy
            value) means "no key" and nothing is sent.

    Returns:
        The abstract text with all ``AbstractText`` sections joined by a single
        space, ``"No abstract found."`` when the record has no abstract text,
        or ``"Error: <status>"`` when the HTTP status is not 200.
    """
    from xml.etree import ElementTree

    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        'db': 'pubmed',
        'id': pmid,
        'retmode': 'xml',
    }
    # Forward the key only when the caller supplied a real one.
    if api_key and api_key != 'YOUR_API_KEY':
        params['api_key'] = api_key

    response = requests.get(url, params=params, timeout=30)

    if response.status_code != 200:
        return f"Error: {response.status_code}"

    # Parse the raw bytes so the XML declaration decides the encoding.
    root = ElementTree.fromstring(response.content)

    # Structured abstracts are split over several AbstractText elements, and a
    # section may contain inline markup; `.text` alone would return only the
    # first section, cut off at the first nested tag (or None).
    sections = [
        ''.join(section.itertext()).strip()
        for section in root.findall('.//Abstract/AbstractText')
    ]
    abstract = ' '.join(section for section in sections if section)
    return abstract if abstract else "No abstract found."

# Loaded spaCy pipelines, keyed by model name, so each model is loaded once.
_NLP_CACHE = {}


def _load_nlp(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def find_matching_groups(abstract, gene_symbol):
    """Find runs of consecutive tokens whose initials spell ``gene_symbol``.

    Hyphens in the abstract are replaced by spaces, the text is tokenized with
    the ``en_core_web_sm`` spaCy pipeline, and punctuation and whitespace
    tokens are dropped. Every window of ``len(gene_symbol)`` consecutive
    remaining tokens is compared, case-insensitively, first letter against
    symbol letter.

    Args:
        abstract: Text to scan. ``None`` or an empty string yields no matches.
        gene_symbol: Symbol whose letters must match the token initials.

    Returns:
        A list of matching token groups, each joined by single spaces, in
        order of appearance. Empty when nothing matches or either argument is
        empty.
    """
    # No text means nothing to scan; an empty symbol would make every
    # zero-width window vacuously "match".
    if not abstract or not gene_symbol:
        return []

    # Replace hyphens BEFORE tokenizing. Doing it after nlp() has already run
    # has no effect on the tokens that are actually matched.
    doc = _load_nlp()(abstract.replace('-', ' '))

    words = [token.text for token in doc if not token.is_punct and not token.is_space]

    gene_length = len(gene_symbol)
    matching_groups = []

    # Slide a window of gene_length tokens over the text.
    for i in range(len(words) - gene_length + 1):
        group = words[i:i + gene_length]
        if all(
            word[0].lower() == letter.lower()
            for word, letter in zip(group, gene_symbol)
        ):
            matching_groups.append(' '.join(group))

    return matching_groups

In [25]:
# Hand-curated sample of gene-alias collisions: one row per (parent symbol,
# collision symbol) pair with the expected expansion, the kind of association,
# and a curation-difficulty label. Built into `df`, which later cells extend
# with result columns.
# NOTE(review): 'Myeloproloferative' looks like a typo for 'Myeloproliferative';
# it is left as-is because it is data -- confirm with the curator.
# NOTE(review): the ALP expansions for CCL27 ('Alkaline Phosphatase') and ALPI
# ('Antileukoproteinase') may be mis-assigned -- confirm against the source list.
data = {'collision_symbol': ['CAP','CAP','MYM','ASP','ASP','ASP','ALP','ALP','ALP'],
        'parent_symbol': ['BRD4','LNPEP','ZMYM1','ASIP','TMPRSS11D','ASPM','ATHS','CCL27','ALPI'],
        'gene_alias_pair': ['BRD4-CAP','LNPEP-CAP','ZMYM1-MYM','ASIP-ASP','TMPRSS11D-ASP','ASPM-ASP','ATHS-ALP','CCL27-ALP','ALPI-ALP'],
        'correct_alias_expansion': ["Chromosome Associated Protein","Cystinyl Aminopeptidase","Myeloproloferative syndrome and mental retardation","Agouti Signaling Protein","Adrenal secretory serine protease","Drosophila abnormal spindle","Atherogenic Lipoprotein Phenotype","Alkaline Phosphatase","Antileukoproteinase"],
        'collision_association': ['Protein Product','Protein Product','Disease', 'Protein Product', 'Protein Product', 'Ortholog', 'Phenotype', 'Protein Product', 'Protein Product'],
        'curation_difficulty': ['easy','medium','hard','easy','medium','hard','easy','medium','hard']}
df = pd.DataFrame(data)
df

Unnamed: 0,collision_symbol,parent_symbol,gene_alias_pair,correct_alias_expansion,collision_association,curation_difficulty
0,CAP,BRD4,BRD4-CAP,Chromosome Associated Protein,Protein Product,easy
1,CAP,LNPEP,LNPEP-CAP,Cystinyl Aminopeptidase,Protein Product,medium
2,MYM,ZMYM1,ZMYM1-MYM,Myeloproloferative syndrome and mental retarda...,Disease,hard
3,ASP,ASIP,ASIP-ASP,Agouti Signaling Protein,Protein Product,easy
4,ASP,TMPRSS11D,TMPRSS11D-ASP,Adrenal secretory serine protease,Protein Product,medium
5,ASP,ASPM,ASPM-ASP,Drosophila abnormal spindle,Ortholog,hard
6,ALP,ATHS,ATHS-ALP,Atherogenic Lipoprotein Phenotype,Phenotype,easy
7,ALP,CCL27,CCL27-ALP,Alkaline Phosphatase,Protein Product,medium
8,ALP,ALPI,ALPI-ALP,Antileukoproteinase,Protein Product,hard


In [83]:
# Main 
import time

# Initialize the result columns and resolve a UniProt accession list per parent gene.
df['test_results_alt_names'] = None
df['test_results_pmids'] = None
df['uniprot_id'] = df['parent_symbol'].apply(grab_uniprot_id)

# For each parent gene: fetch the UniProt entry, record its alternative names,
# download the abstract of every cited PubMed article, and record the word
# groups in each abstract whose initials spell the collision symbol.
for index, row in df.iterrows():
    parent_symbol = row['parent_symbol']
    print(f'Retrieving uniprot ref data for {parent_symbol}')

    # Only a non-empty list of accessions is usable. None, an empty list, or
    # any non-list sentinel from the lookup means there is nothing to query
    # (indexing a sentinel string would send its first character to UniProt).
    uniprot_ids = row['uniprot_id']
    if not isinstance(uniprot_ids, list) or not uniprot_ids:
        continue

    # Only the first accession is queried. A separate name is used for the
    # response so the sample-data dict `data` is not overwritten.
    uniprot_response = uniprot_request(uniprot_ids[0])
    if uniprot_response.status_code != 200:
        # An error payload has no 'references' key and would crash grab_pmids.
        print(f'UniProt request failed for {parent_symbol}: {uniprot_response.status_code}')
        continue

    df.at[index, 'test_results_alt_names'] = grab_alternative_names(uniprot_response)
    pmids = grab_pmids(uniprot_response)

    # Grab Abstracts from PubMed
    print(f'Grabing abstracts for {parent_symbol}')
    abstracts_to_check = []
    for pmid in tqdm(pmids):
        # Ids starting with 'CI' are skipped rather than sent to PubMed.
        if pmid.startswith('CI'):
            continue
        time.sleep(1) # Pubmed will throw back "too many requests" error unless you space them out or find a way to batch them
        abstracts_to_check.append({pmid: get_pubmed_abstract(pmid)})

    # Check Abstracts using spaCy
    print(f'Checking abstracts for {parent_symbol}')
    possible_matches = [
        {
            pmid: find_matching_groups(abstract_text, row['collision_symbol'])
            for pmid, abstract_text in abstract_dict.items()
        }
        for abstract_dict in abstracts_to_check
    ]

    df.at[index, 'test_results_pmids'] = possible_matches

df

Retrieving uniprot ref data for BRD4
Checking abstracts for BRD4


100%|██████████| 58/58 [01:11<00:00,  1.24s/it]


Retrieving uniprot ref data for LNPEP
Checking abstracts for LNPEP


100%|██████████| 15/15 [00:18<00:00,  1.21s/it]


Retrieving uniprot ref data for ZMYM1
Checking abstracts for ZMYM1


100%|██████████| 5/5 [00:05<00:00,  1.05s/it]


Retrieving uniprot ref data for ASIP
Checking abstracts for ASIP


100%|██████████| 7/7 [00:08<00:00,  1.28s/it]


Retrieving uniprot ref data for TMPRSS11D
Checking abstracts for TMPRSS11D


100%|██████████| 5/5 [00:06<00:00,  1.28s/it]


Retrieving uniprot ref data for ASPM
Checking abstracts for ASPM


100%|██████████| 19/19 [00:22<00:00,  1.21s/it]


Retrieving uniprot ref data for ATHS
Retrieving uniprot ref data for CCL27
Checking abstracts for CCL27


100%|██████████| 6/6 [00:06<00:00,  1.06s/it]


Retrieving uniprot ref data for ALPI
Checking abstracts for ALPI


100%|██████████| 12/12 [00:14<00:00,  1.17s/it]


Unnamed: 0,collision_symbol,parent_symbol,gene_alias_pair,correct_alias_expansion,collision_association,curation_difficulty,test_results,uniprot_id,test_results_alt_names,test_results_pmids
0,CAP,BRD4,BRD4-CAP,Chromosome Associated Protein,Protein Product,easy,,[O60885],[Protein HUNK1],"[{'11733348': []}, {'15057824': []}, {'1548933..."
1,CAP,LNPEP,LNPEP-CAP,Cystinyl Aminopeptidase,Protein Product,medium,,[Q9UIQ6],"[Insulin-regulated membrane aminopeptidase, In...","[{'8550619': []}, {'9177475': []}, {'10759854'..."
2,MYM,ZMYM1,ZMYM1-MYM,Myeloproloferative syndrome and mental retarda...,Disease,hard,,[Q5SVZ6],,"[{'17974005': []}, {'16710414': []}, {'2575529..."
3,ASP,ASIP,ASIP-ASP,Agouti Signaling Protein,Protein Product,easy,,[P42127],[Agouti switch protein],"[{'7937887': []}, {'7757071': ['Agouti Signali..."
4,ASP,TMPRSS11D,TMPRSS11D-ASP,Adrenal secretory serine protease,Protein Product,medium,,[O60235],[Airway trypsin-like protease],"[{'9565616': []}, {'15489334': ['a saturation ..."
5,ASP,ASPM,ASPM-ASP,Drosophila abnormal spindle,Ortholog,hard,,[Q8IZT6],[Abnormal spindle protein homolog],"[{'12355089': []}, {'14704186': []}, {'1597272..."
6,ALP,ATHS,ATHS-ALP,Atherogenic Lipoprotein Phenotype,Phenotype,easy,,[],,
7,ALP,CCL27,CCL27-ALP,Alkaline Phosphatase,Protein Product,medium,,[Q9Y4X3],"[CC chemokine ILC, Cutaneous T-cell-attracting...","[{'10556532': []}, {'10588729': []}, {'1072569..."
8,ALP,ALPI,ALPI-ALP,Antileukoproteinase,Protein Product,hard,,[P09923],,"[{'3468508': []}, {'3469665': []}, {'2841341':..."


In [101]:
spot_check = 3

print(df['test_results_alt_names'][spot_check])

print(df['test_results_pmids'][spot_check])


['Agouti switch protein']
[{'7937887': []}, {'7757071': ['Agouti Signaling Protein']}, {'11780052': []}, {'15489334': ['a saturation point']}, {'15701517': ['agouti signaling protein']}, {'11833005': ['agouti signaling protein', 'agouti signaling protein']}, {'36536132': ['agouti signaling protein', 'a similar phenotype']}]
