In [29]:
import yaml
import json
from typing import Any, List, Dict
import pandas as pd

In [30]:
# Load the yamlmagic IPython extension.
# NOTE(review): the cells visible here only use %%writefile, not a yamlmagic
# cell magic — confirm this extension is still needed.
%load_ext yamlmagic

The yamlmagic extension is already loaded. To reload it, use:
  %reload_ext yamlmagic


### Sample Record

In [31]:
%%writefile alias-alias_collision_records/sample_collision_record.yaml

# Template for a collision record. Copy it, then fill in every '' placeholder.
# Allowed values are listed in comments so that the template itself is valid
# YAML and can be loaded with yaml.safe_load.

# The symbol shared by the genes listed under collision_group.
collision_symbol: ''
# collision_class — one of:
#   'Convergent Acronym', 'Gene Family', 'Protein Domain',
#   'Expired Gene Symbol', 'Protein Product', 'Protein Product Weight'
collision_class: ''
# collision_type — one of: 'alias-alias', 'alias-primary'
collision_type: ''
collision_group:
  - gene_symbol: ''
    ensg_id: 'ENSG'
    # What the collision symbol stands for in the context of this gene.
    collision_acronym_expansion: ''
    # collision_gene_relationship:
    #   if the symbol is the primary gene symbol: 'Primary gene symbol'
    #   if the symbol is an alias, one of:
    #     'Disease', 'Gene Family', 'Protein Domain', 'Expired Gene Symbol',
    #     'Protein Product', 'Phenotype'
    #   NOTE(review): records in this notebook also use 'Function' and
    #   'Ortholog' — confirm whether they belong in the allowed list.
    collision_gene_relationship: ''
    # One list item per supporting publication.
    collision_source:
      - PMID: ''
      - PMID: ''


Overwriting alias-alias_collision_records/sample_collision_record.yaml


### ASP

In [32]:
%%writefile alias-alias_collision_records/ASP_collision_record.yaml

# Collision record for the symbol 'ASP' (class 'Convergent Acronym',
# type 'alias-alias').
# Each collision_group item describes one gene that carries the symbol:
#   gene_symbol                  - the gene's symbol
#   ensg_id                      - ENSG-prefixed gene identifier
#   collision_acronym_expansion  - what 'ASP' stands for for this gene
#   collision_gene_relationship  - how the expansion relates to the gene
#   collision_source             - list of PMID references (presumably PubMed IDs)
collision_symbol: 'ASP'
collision_class: 'Convergent Acronym'
collision_type: 'alias-alias'
collision_group:
  - gene_symbol: 'ASIP'
    ensg_id: 'ENSG00000101440'
    collision_acronym_expansion: 'Agouti Signaling Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '11833005'
      - PMID: '7757071'
  - gene_symbol: 'ASPA'
    ensg_id: 'ENSG00000108381'
    collision_acronym_expansion: 'ASPartoacylase'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '9407392'
      - PMID: '10493853'
  - gene_symbol: 'ASPM'
    ensg_id: 'ENSG00000066279'
    collision_acronym_expansion: 'Drosophila Abnormal SPindle'
    collision_gene_relationship: 'Ortholog'
    collision_source:
      - PMID: '36980263'
      - PMID: '11283617'
  - gene_symbol: 'ATG5'
    ensg_id: 'ENSG00000057663'
    collision_acronym_expansion: 'Apoptosis-Specific Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '9563500'
      - PMID: '7796880'
  - gene_symbol: 'A1CF'
    ensg_id: 'ENSG00000148584'
    collision_acronym_expansion: 'APOBEC-1 Stimulating Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '10781591'
      - PMID: '15451168'
  - gene_symbol: 'C3'
    ensg_id: 'ENSG00000125730'
    collision_acronym_expansion: 'Acylation-Stimulating Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '18805911'
      - PMID: '15833747'
  - gene_symbol: 'ROPN1L'
    ensg_id: 'ENSG00000145491'
    collision_acronym_expansion: 'AKAP-associated Sperm Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '18421703'
      - PMID: '25600306'
  - gene_symbol: 'TMPRSS11D'
    ensg_id: 'ENSG00000153802'
    collision_acronym_expansion: 'Adrenal secretory Serine Protease'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '15762198'
      - PMID: '11741986'

Overwriting alias-alias_collision_records/ASP_collision_record.yaml


In [33]:
# Read the ASP record back from disk to confirm that it parses as YAML.
# The context manager closes the file handle as soon as parsing finishes
# instead of leaving it open until garbage collection; the explicit encoding
# keeps the read independent of the platform's default locale.
with open(
    'alias-alias_collision_records/ASP_collision_record.yaml',
    encoding='utf-8',
) as record_file:
    ASP_collision_record = yaml.safe_load(record_file)
print(ASP_collision_record)

{'collision_symbol': 'ASP', 'collision_class': 'Convergent Acronym', 'collision_type': 'alias-alias', 'collision_group': [{'gene_symbol': 'ASIP', 'ensg_id': 'ENSG00000101440', 'collision_acronym_expansion': 'Agouti Signaling Protein', 'collision_gene_relationship': 'Protein Product', 'collision_source': [{'PMID': '11833005'}, {'PMID': '7757071'}]}, {'gene_symbol': 'ASPA', 'ensg_id': 'ENSG00000108381', 'collision_acronym_expansion': 'ASPartoacylase', 'collision_gene_relationship': 'Function', 'collision_source': [{'PMID': '9407392'}, {'PMID': '10493853'}]}, {'gene_symbol': 'ASPM', 'ensg_id': 'ENSG00000066279', 'collision_acronym_expansion': 'Drosophila Abnormal SPindle', 'collision_gene_relationship': 'Ortholog', 'collision_source': [{'PMID': '36980263'}, {'PMID': '11283617'}]}, {'gene_symbol': 'ATG5', 'ensg_id': 'ENSG00000057663', 'collision_acronym_expansion': 'Apoptosis-Specific Protein', 'collision_gene_relationship': 'Protein Product', 'collision_source': [{'PMID': '9563500'}, {'PM

### PAP

In [34]:
%%writefile alias-alias_collision_records/PAP_collision_record.yaml

# Collision record for the symbol 'PAP' (class 'Convergent Acronym',
# type 'alias-alias'). Each collision_group item is one gene carrying the
# symbol, the expansion used for that gene, and the PMID references cited.
collision_symbol: 'PAP'
collision_class: 'Convergent Acronym'
collision_type: 'alias-alias'
collision_group:
  - gene_symbol: 'ACP3'
    ensg_id: 'ENSG00000014257'
    collision_acronym_expansion: 'Prostatic Acid Phosphatase'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '1375464'
      - PMID: '2842184'
  - gene_symbol: 'ASAP1'
    ensg_id: 'ENSG00000153317'
    collision_acronym_expansion: 'Putative Alternative Promoters'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '16344560'
  - gene_symbol: 'ASAP2'
    ensg_id: 'ENSG00000151693'
    collision_acronym_expansion: 'Paxillin-Associated Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '10749932'
  - gene_symbol: 'MRPS30'
    ensg_id: 'ENSG00000112996'
    collision_acronym_expansion: 'Prostatic Acid Phosphatase'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '20084276'
  - gene_symbol: 'PAPOLA'
    ensg_id: 'ENSG00000090060'
    collision_acronym_expansion: 'Poly(A) Polymerase'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '19224921'
      - PMID: '8302877'
  - gene_symbol: 'PDAP1'
    ensg_id: 'ENSG00000106244'
    collision_acronym_expansion: 'PDGF Associated Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '8780057'
      # This source item carries an extra free-text 'note' key alongside PMID.
      - PMID: '9570154'
        note: 'This publication uses PAP as Phosphatidic Acid Phosphatase. Which is it? Is it both?'
  - gene_symbol: 'REG3A'
    ensg_id: 'ENSG00000172016'
    collision_acronym_expansion: 'Pancreatitis-Associated Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '8188210'
      - PMID: '8076648'
  - gene_symbol: 'TUSC2'
    ensg_id: 'ENSG00000114383'
    collision_acronym_expansion: 'PDGFA-Associated Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '8780057'



Overwriting alias-alias_collision_records/PAP_collision_record.yaml


### U4

In [35]:
%%writefile alias-alias_collision_records/U4_collision_record.yaml

# Collision record for the symbol 'U4' (class 'Gene Family',
# type 'alias-alias'). Each collision_group item is one gene carrying the
# symbol, with its PMID references listed under collision_source.
collision_symbol: 'U4'
collision_class: 'Gene Family'
collision_type: 'alias-alias'
collision_group:
  - gene_symbol: 'RNU4-1'
    ensg_id: 'ENSG00000200795'
    collision_acronym_expansion: 'U-RNA 4'
    collision_gene_relationship: 'Gene Family'
    collision_source:
      - PMID: '3582982'
      - PMID: '26275778'
  - gene_symbol: 'RNU4-3P'
    # NOTE(review): placeholder for a missing identifier. Other records in
    # this notebook spell the placeholder 'NAN' — confirm which spelling
    # downstream consumers expect before normalizing.
    ensg_id: 'NaN'
    collision_acronym_expansion: 'U-RNA 4'
    collision_gene_relationship: 'Gene Family'
    collision_source:
      - PMID: '2582241'
  - gene_symbol: 'RNU4-4P'
    ensg_id: 'ENSG00000201458'
    collision_acronym_expansion: 'U-RNA 4'
    collision_gene_relationship: 'Gene Family'
    collision_source:
      - PMID: '2582241'
  - gene_symbol: 'RNU4-5P'
    ensg_id: 'ENSG00000272160'
    collision_acronym_expansion: 'U-RNA 4'
    collision_gene_relationship: 'Gene Family'
    collision_source:
      - PMID: '2582241'
  - gene_symbol: 'RNU4-6P'
    ensg_id: 'ENSG00000222736'
    collision_acronym_expansion: 'U-RNA 4'
    collision_gene_relationship: 'Gene Family'
    collision_source:
      - PMID: '2582241'
  - gene_symbol: 'RNU4-7P'
    ensg_id: 'ENSG00000201628'
    collision_acronym_expansion: 'U-RNA 4'
    collision_gene_relationship: 'Gene Family'
    collision_source:
      - PMID: '2583518'
  - gene_symbol: 'RNU4-8P'
    ensg_id: 'ENSG00000201806'
    collision_acronym_expansion: 'U-RNA 4'
    collision_gene_relationship: 'Gene Family'
    collision_source:
      - PMID: '2583518'

Overwriting alias-alias_collision_records/U4_collision_record.yaml


### CAP

In [36]:
%%writefile alias-alias_collision_records/CAP_collision_record.yaml

# Collision record for the symbol 'CAP' (class 'Convergent Acronym',
# type 'alias-alias'). Each collision_group item is one gene carrying the
# symbol, the expansion used for that gene, and the PMID references cited.
collision_symbol: 'CAP'
collision_class: 'Convergent Acronym'
collision_type: 'alias-alias'
collision_group:
  - gene_symbol: 'BRD4'
    ensg_id: 'ENSG00000141867'
    collision_acronym_expansion: 'Chromosome Associated Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '8849894'
  - gene_symbol: 'CAP1'
    ensg_id: 'ENSG00000131236'
    collision_acronym_expansion: 'Cyclase Associated Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '15311924'
      - PMID: '8761950'
  - gene_symbol: 'CTAA1'
    # NOTE(review): 'NAN' looks like a placeholder for a missing identifier;
    # it is a plain string here, not a YAML null — confirm intended sentinel.
    ensg_id: 'NAN'
    collision_acronym_expansion: 'Cataract, Anterior Polar'
    collision_gene_relationship: 'Disease'
    collision_source:
      - PMID: '1049206'
  - gene_symbol: 'HACD1'
    ensg_id: 'ENSG00000165996'
    collision_acronym_expansion: 'Cementum Attachment Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '25263524'
      - PMID: '22067203'
  - gene_symbol: 'LNPEP'
    ensg_id: 'ENSG00000113441'
    collision_acronym_expansion: 'Cystinyl AminoPeptidase'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '17692401'
      - PMID: '16113565'
  - gene_symbol: 'SERPINB6'
    ensg_id: 'ENSG00000124570'
    collision_acronym_expansion: 'Cytoplasmic Antiproteinase'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '8636153'
      - PMID: '8662739'
  - gene_symbol: 'SORBS1'
    ensg_id: 'ENSG00000095637'
    collision_acronym_expansion: 'C-Cbl Associated Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '11532984'
      - PMID: '12849814'

Overwriting alias-alias_collision_records/CAP_collision_record.yaml


### MYM

In [37]:
%%writefile alias-alias_collision_records/MYM_collision_record.yaml

# Collision record for the symbol 'MYM' (type 'alias-alias').
# All six collision_group items share the same expansion, relationship and
# single PMID reference; only gene_symbol and ensg_id differ.
# NOTE(review): collision_class is 'Protein Domain' while every entry's
# collision_gene_relationship is 'Disease' — confirm this is intended.
collision_symbol: 'MYM'
collision_class: 'Protein Domain'
collision_type: 'alias-alias'
collision_group:
  - gene_symbol: 'ZMYM1'
    ensg_id: 'ENSG00000197056'
    collision_acronym_expansion: 'MYeloproliferative syndrome and Mental retardation'
    collision_gene_relationship: 'Disease'
    collision_source:
      - PMID: '4136804'
  - gene_symbol: 'ZMYM2'
    ensg_id: 'ENSG00000121741'
    collision_acronym_expansion: 'MYeloproliferative syndrome and Mental retardation'
    collision_gene_relationship: 'Disease'
    collision_source:
      - PMID: '4136804'
  - gene_symbol: 'ZMYM3'
    ensg_id: 'ENSG00000147130'
    collision_acronym_expansion: 'MYeloproliferative syndrome and Mental retardation'
    collision_gene_relationship: 'Disease'
    collision_source:
      - PMID: '4136804'
  - gene_symbol: 'ZMYM4'
    ensg_id: 'ENSG00000146463'
    collision_acronym_expansion: 'MYeloproliferative syndrome and Mental retardation'
    collision_gene_relationship: 'Disease'
    collision_source:
      - PMID: '4136804'
  - gene_symbol: 'ZMYM5'
    ensg_id: 'ENSG00000132950'
    collision_acronym_expansion: 'MYeloproliferative syndrome and Mental retardation'
    collision_gene_relationship: 'Disease'
    collision_source:
      - PMID: '4136804'
  - gene_symbol: 'ZMYM6'
    ensg_id: 'ENSG00000163867'
    collision_acronym_expansion: 'MYeloproliferative syndrome and Mental retardation'
    collision_gene_relationship: 'Disease'
    collision_source:
      - PMID: '4136804'

Overwriting alias-alias_collision_records/MYM_collision_record.yaml


### F379

In [38]:
%%writefile alias-alias_collision_records/F379_collision_record.yaml

# Collision record for the symbol 'F379' (class 'Gene Family',
# type 'alias-alias'). No acronym expansion is recorded for these entries,
# so collision_acronym_expansion is left as an empty string.
# PMID references live under the collision_source key of each entry.
collision_symbol: 'F379'
collision_class: 'Gene Family'
collision_type: 'alias-alias'
collision_group:
  - gene_symbol: 'FAM138A'
    ensg_id: 'ENSG00000237613'
    collision_acronym_expansion: ''
    collision_gene_relationship: 'Expired Gene Symbol'
    collision_source:
      - PMID: '11779631'
  - gene_symbol: 'FAM138B'
    ensg_id: 'ENSG00000226516'
    collision_acronym_expansion: ''
    collision_gene_relationship: 'Expired Gene Symbol'
    collision_source:
      - PMID: '11779631'
      - PMID: '12421752'
  - gene_symbol: 'FAM138C'
    ensg_id: 'ENSG00000218839'
    collision_acronym_expansion: ''
    collision_gene_relationship: 'Expired Gene Symbol'
    collision_source:
      - PMID: '11779631'
  - gene_symbol: 'FAM138D'
    ensg_id: 'ENSG00000249054'
    collision_acronym_expansion: ''
    collision_gene_relationship: 'Expired Gene Symbol'
    collision_source:
      - PMID: '11779631'
  - gene_symbol: 'FAM138E'
    ensg_id: 'ENSG00000248893'
    collision_acronym_expansion: ''
    collision_gene_relationship: 'Expired Gene Symbol'
    collision_source:
      - PMID: '11779631'
      - PMID: '35387070'
  - gene_symbol: 'FAM138F'
    ensg_id: 'ENSG00000282591'
    collision_acronym_expansion: ''
    collision_gene_relationship: 'Expired Gene Symbol'
    collision_source:
      - PMID: '11779631'

Overwriting alias-alias_collision_records/F379_collision_record.yaml


### ALP1

In [39]:
%%writefile alias-alias_collision_records/ALP1_collision_record.yaml

# Collision record for the symbol 'ALP1' (class 'Convergent Acronym',
# type 'alias-alias').
# A gene with more than one expansion of the symbol is listed once per
# expansion, so every collision_group item keeps the same flat set of keys
# (gene_symbol, ensg_id, collision_acronym_expansion,
# collision_gene_relationship, collision_source).
collision_symbol: 'ALP1'
collision_class: 'Convergent Acronym'
collision_type: 'alias-alias'
collision_group:
  # ASRGL1, expansion 1 of 2
  - gene_symbol: 'ASRGL1'
    ensg_id: 'ENSG00000162174'
    collision_acronym_expansion: 'ALkaline Phosphatase'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '7602772'
      - PMID: '8102225'
  # ASRGL1, expansion 2 of 2
  - gene_symbol: 'ASRGL1'
    ensg_id: 'ENSG00000162174'
    collision_acronym_expansion: 'Asparaginase-Like Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '11984834'
  - gene_symbol: 'ASZ1'
    ensg_id: 'ENSG00000154438'
    collision_acronym_expansion: 'Ankyrin-Like Protein 1'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: ''  # TODO: no reference recorded yet

Overwriting alias-alias_collision_records/ALP1_collision_record.yaml


In [40]:
%%writefile alias-alias_collision_records/P40_collision_record.yaml

# Collision record for the symbol 'P40' (class 'Protein Product Weight',
# type 'alias-alias'). Every entry uses the same expansion and relationship;
# PMID references live under the collision_source key of each entry.
collision_symbol: 'P40'
collision_class: 'Protein Product Weight'
collision_type: 'alias-alias'
collision_group:
  - gene_symbol: 'ARHGEF2'
    ensg_id: 'ENSG00000116584'
    collision_acronym_expansion: 'Protein of 40 kDa'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '2466560'
  - gene_symbol: 'EBNA1BP2'
    ensg_id: 'ENSG00000117395'
    collision_acronym_expansion: 'Protein of 40 kDa'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '2879624'
      - PMID: '12549186'
  - gene_symbol: 'IL9'
    ensg_id: 'ENSG00000145839'
    collision_acronym_expansion: 'Protein of 40 kDa'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '1971295'
      - PMID: '2129501'
  - gene_symbol: 'PSMD7'
    ensg_id: 'ENSG00000103035'
    collision_acronym_expansion: 'Protein of 40 kDa'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '7755639'

Overwriting alias-alias_collision_records/P40_collision_record.yaml


### ALP

In [41]:
%%writefile alias-alias_collision_records/ALP_collision_record.yaml

# Collision record for the symbol 'ALP' (class 'Convergent Acronym',
# type 'alias-alias').
# A gene with more than one expansion of the symbol is listed once per
# expansion, so every collision_group item keeps the same flat set of keys.
# PMID references live under the collision_source key of each entry.
collision_symbol: 'ALP'
collision_class: 'Convergent Acronym'
collision_type: 'alias-alias'
collision_group:
  - gene_symbol: 'ALPP'
    ensg_id: 'ENSG00000163283'
    collision_acronym_expansion: 'ALkaline Phosphatase'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '3042787'
      - PMID: '37952310'
  # ASRGL1, expansion 1 of 2
  - gene_symbol: 'ASRGL1'
    ensg_id: 'ENSG00000162174'
    collision_acronym_expansion: 'ALkaline Phosphatase'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '7602772'
      - PMID: '8102225'
  # ASRGL1, expansion 2 of 2
  - gene_symbol: 'ASRGL1'
    ensg_id: 'ENSG00000162174'
    collision_acronym_expansion: 'Asparaginase-Like Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '11984834'
  - gene_symbol: 'ATHS'
    # NOTE(review): 'NAN' looks like a placeholder for a missing identifier;
    # it is a plain string here, not a YAML null — confirm intended sentinel.
    ensg_id: 'NAN'
    collision_acronym_expansion: 'Atherogenic Lipoprotein Phenotype'
    collision_gene_relationship: 'Phenotype'
    collision_source:
      - PMID: '1731344'
      - PMID: '19498345'
  - gene_symbol: 'ATRNL1'
    ensg_id: 'ENSG00000107518'
    collision_acronym_expansion: 'Attractin-Like Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '14531729'
      - PMID: '14656215'
  - gene_symbol: 'CCL27'
    ensg_id: 'ENSG00000213927'
    collision_acronym_expansion: 'ALkaline Phosphatase'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '20150512'
      - PMID: '20362069'
  - gene_symbol: 'NAT10'
    ensg_id: 'ENSG00000135372'
    collision_acronym_expansion: 'Acetyltransferase-Like Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '14592445'
      - PMID: '17631499'
  - gene_symbol: 'PDLIM3'
    ensg_id: 'ENSG00000154553'
    collision_acronym_expansion: 'Actinin-associated LIM Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '10063829'
      - PMID: '9334352'
  - gene_symbol: 'SLPI'
    ensg_id: 'ENSG00000124107'
    collision_acronym_expansion: 'AntiLeukoProteinase'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '18976018'
      - PMID: '15248236'


Overwriting alias-alias_collision_records/ALP_collision_record.yaml


### NAP1

In [42]:
%%writefile alias-alias_collision_records/NAP1_collision_record.yaml

# Collision record for the symbol 'NAP1' (class 'Convergent Acronym',
# type 'alias-alias').
# A gene with more than one expansion of the symbol is listed once per
# expansion, so every collision_group item keeps the same flat set of keys.
# Entries with no reference recorded yet carry an empty collision_source
# list so that the key is always present.
collision_symbol: 'NAP1'
collision_class: 'Convergent Acronym'
collision_type: 'alias-alias'
collision_group:
  - gene_symbol: 'ACOT8'
    ensg_id: 'ENSG00000101473'
    collision_acronym_expansion: 'Nef (Lentivirus Myristoylated Factor) Associated Protein 1'
    collision_gene_relationship: 'Protein Product'
    collision_source: []  # TODO: no reference recorded yet
  - gene_symbol: 'AZI2'
    ensg_id: 'ENSG00000163512'
    collision_acronym_expansion: 'NF-Kappa-B-Activating Kinase-Associated Protein 1'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '15611223'
      - PMID: '14560022'
  # CXCL8, expansion 1 of 4
  - gene_symbol: 'CXCL8'
    ensg_id: 'ENSG00000169429'
    collision_acronym_expansion: 'Neutrophil-Activating Peptide 1'
    collision_gene_relationship: 'Function'
    collision_source:
      - PMID: '17196571'
  # CXCL8, expansion 2 of 4
  - gene_symbol: 'CXCL8'
    ensg_id: 'ENSG00000169429'
    collision_acronym_expansion: 'NF-κB-Activating kinase-associated Protein 1'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '29290284'
  # CXCL8, expansion 3 of 4
  - gene_symbol: 'CXCL8'
    ensg_id: 'ENSG00000169429'
    collision_acronym_expansion: 'Nucleosome Assembly Protein 1'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '19339032'
  # CXCL8, expansion 4 of 4
  - gene_symbol: 'CXCL8'
    ensg_id: 'ENSG00000169429'
    collision_acronym_expansion: 'Nck-Associated Protein 1'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '18644894'
  - gene_symbol: 'NAA25'
    ensg_id: 'ENSG00000111300'
    collision_acronym_expansion: ''  # TODO: expansion not recorded yet
    collision_gene_relationship: ''  # TODO: relationship not recorded yet
    collision_source: []  # TODO: no reference recorded yet
  - gene_symbol: 'NAP1L1'
    ensg_id: 'ENSG00000187109'
    collision_acronym_expansion: 'Nucleosome Assembly Protein 1'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '8297347'
      - PMID: '20002496'
  - gene_symbol: 'NAPSA'
    ensg_id: 'ENSG00000131400'
    collision_acronym_expansion: 'Nck-Associated Protein'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '15048123'
  # NCKAP1, expansion 1 of 2
  - gene_symbol: 'NCKAP1'
    ensg_id: 'ENSG00000061676'
    collision_acronym_expansion: 'Nck-Associated Protein 1'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '30867003'
      - PMID: '9148763'
  # NCKAP1, expansion 2 of 2
  - gene_symbol: 'NCKAP1'
    ensg_id: 'ENSG00000061676'
    collision_acronym_expansion: 'NAK-associated protein 1'
    collision_gene_relationship: 'Protein Product'
    collision_source:
      - PMID: '14560022'
  - gene_symbol: 'TAB3'
    ensg_id: 'ENSG00000157625'
    collision_acronym_expansion: 'NF-Kappa-B-Activating Protein 1'
    collision_gene_relationship: 'Protein Product'
    collision_source: []  # TODO: no reference recorded yet
  - gene_symbol: 'TRMO'
    ensg_id: 'ENSG00000136932'
    collision_acronym_expansion: 'Nef-Associated Protein 1'
    collision_gene_relationship: 'Protein Product'
    collision_source: []  # TODO: no reference recorded yet

Overwriting alias-alias_collision_records/NAP1_collision_record.yaml
