# Notebook for parser development

In [1]:
## not for parser. for notebook only 

## CX: allows multiple lines of code to print from one code block
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Loading data

Current approach: loading all files into 1 pandas dataframe. Then I can...
1. check the duplicates situation (key columns vs all columns) and raise errors if need be
2. remove duplicates before generating documents
3. Do some tasks column-wise over all the data, rather than while iterating over rows

If I did the generator approach (load files 1 by 1, 1 row at a time), I'd have to modify how I do things:
1. Don't do this check/raise errors. But try to mitigate potential "duplicate" issues: 
  * Sort all delimited strings
  * Use a hash of all column values (when they're all strings) for `_id`. Want rows with all the same values to produce the same hash
2. Either leave to BioThings toolset to remove duplicates, or could use a set of `_id` hashes so far to check/not create duplicate docs.
3. Do the tasks on single rows/chunks (pandas [read_csv](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html#pandas.read_csv) has an iterator for rows/chunks! see iterator/chunksize parameters)


Notes:
* There are a few existing parsers that use `pandas` to load the entire raw data file at once: https://github.com/search?q=repo%3Abiothings%2Fpending.api%20pandas&type=code
* But there are other existing parsers that use `csv` to load the file **one row at a time** (generator): https://github.com/search?q=repo%3Abiothings%2Fpending.api+csv+reader&type=code

In [2]:
## put into parser: already done

## python 3.12.9
import pathlib       ## 1.0.1 
import pandas as pd  ## 2.2.3 

## don't put in parser. Just for this notebook
import glob
from pprint import pprint

## unsure on putting into parser: more for notebook viewing/debugging...
pd.options.display.max_columns = None

Originally written to explore 2025-02-28 release to FTP site 

In [3]:
## put into parser (format): DONE

base_file_path = pathlib.Path.home().joinpath("Desktop", "EBIgene2pheno_files", "From_FTP")

## pathlib's Path.glob produces a generator, vs glob.glob produces an array from cwd
glob.glob("*2025-02-28.csv.gz")
base_file_path.glob("*2025-02-28.csv.gz")

## using list works to check if paths matching pattern were actually found or not
all_file_paths = list(base_file_path.glob("*2025-02-28.csv.gz"))
all_file_paths

[]

<generator object Path.glob at 0x10adfa8a0>

[PosixPath('/Users/colleenxu/Desktop/EBIgene2pheno_files/From_FTP/CardiacG2P_2025-02-28.csv.gz'),
 PosixPath('/Users/colleenxu/Desktop/EBIgene2pheno_files/From_FTP/SkeletalG2P_2025-02-28.csv.gz'),
 PosixPath('/Users/colleenxu/Desktop/EBIgene2pheno_files/From_FTP/DDG2P_2025-02-28.csv.gz'),
 PosixPath('/Users/colleenxu/Desktop/EBIgene2pheno_files/From_FTP/SkinG2P_2025-02-28.csv.gz'),
 PosixPath('/Users/colleenxu/Desktop/EBIgene2pheno_files/From_FTP/Hearing_lossG2P_2025-02-28.csv.gz'),
 PosixPath('/Users/colleenxu/Desktop/EBIgene2pheno_files/From_FTP/CancerG2P_2025-02-28.csv.gz'),
 PosixPath('/Users/colleenxu/Desktop/EBIgene2pheno_files/From_FTP/EyeG2P_2025-02-28.csv.gz')]

In [4]:
## put into parser (format): DONE


## ingest all columns as str
df = pd.concat((pd.read_csv(f, dtype=str) for f in all_file_paths), ignore_index=True)

## make column names snake-case - usable with itertuples later
df.columns = df.columns.str.replace(" ", "_")

df["date_of_last_review"].info(memory_usage="deep")
print("\n")
## change this column to datetime, saves memory
df["date_of_last_review"] = pd.to_datetime(df["date_of_last_review"])
df["date_of_last_review"].info(memory_usage="deep")

<class 'pandas.core.series.Series'>
RangeIndex: 4714 entries, 0 to 4713
Series name: date_of_last_review
Non-Null Count  Dtype 
--------------  ----- 
4714 non-null   object
dtypes: object(1)
memory usage: 377.6 KB


<class 'pandas.core.series.Series'>
RangeIndex: 4714 entries, 0 to 4713
Series name: date_of_last_review
Non-Null Count  Dtype              
--------------  -----              
4714 non-null   datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1)
memory usage: 37.0 KB


In [None]:
## worked before (pandas 2.0.3 and python 3.11.4) to ingest column as datetime
## now it doesn't: ingests as object/str instead
# df = pd.concat((pd.read_csv(f, dtype=str, parse_dates=["date of last review"]) 
#                 for f in all_file_paths), ignore_index=True)

## the following also don't work
# df = pd.concat((pd.read_csv(f, dtype=str, parse_dates=["date of last review"], 
#                            date_format="%Y-%m-%d %H:%M:%S%:z") 
#                 for f in all_file_paths), ignore_index=True)
## throws an error
# df = pd.concat((pd.read_csv(f, dtype=str, parse_dates=[["date of last review"]], 
#                            date_format="%Y-%m-%d %H:%M:%S%:z") 
#                 for f in all_file_paths), ignore_index=True)
## throws an error
# df = pd.concat((pd.read_csv(f, dtype={"date of last review": pd.datetime64[ns, tz]})
#                 for f in all_file_paths), ignore_index=True)

In [5]:
df.shape
df.head()
df.info(memory_usage="deep")

(4714, 21)

Unnamed: 0,g2p_id,gene_symbol,gene_mim,hgnc_id,previous_gene_symbols,disease_name,disease_mim,disease_MONDO,allelic_requirement,cross_cutting_modifier,confidence,variant_consequence,variant_types,molecular_mechanism,molecular_mechanism_categorisation,molecular_mechanism_evidence,phenotypes,publications,panel,comments,date_of_last_review
0,G2P00124,KCNE1,176261,6240,ISK; JLNS2; LQT5; MINK,KCNE1-related Jervell and Lange-Nielsen syndrome,612347.0,,biallelic_autosomal,potential secondary finding,strong,altered gene product structure,missense_variant; inframe_deletion; stop_gaine...,undetermined,inferred,,HP:0000407; HP:0001657; HP:0000007; HP:0001279,30461122,DD; Cardiac,KCNE1-related JLNS is due to altered gene prod...,2024-04-05 12:05:01+00:00
1,G2P00841,PTPN11,176876,9644,BPTP3; NS1; PTP2C; SH-PTP2; SHP-2; SHP2,PTPN11-related Noonan syndrome with multiple l...,151100.0,,monoallelic_autosomal,,definitive,altered gene product structure,missense_variant; inframe_deletion; inframe_in...,undetermined,inferred,,HP:0000325; HP:0002996; HP:0000957; HP:0001709...,27484170; 26377839; 25917897; 25884655; 248207...,DD; Skin; Cardiac,Expert review done on 12/01/2022; Noonan syndr...,2025-01-21 14:56:43+00:00
2,G2P03247,DSC2,125645,3036,CDHF2; DSC3,DSC2-related arrhythmogenic right ventricular ...,,MONDO:0012506,monoallelic_autosomal,,definitive,decreased gene product level; altered gene pro...,inframe_deletion; splice_region_variant; misse...,undetermined,inferred,,,31028357; 23911551; 21636032; 33831308; 263105...,Cardiac,Expert review done on 05/01/2022; DSC2-related...,2024-03-20 09:36:09+00:00
3,G2P03248,DSC2,125645,3036,CDHF2; DSC3,DSC2-related arrhythmogenic right ventricular ...,,MONDO:0012506,biallelic_autosomal,,definitive,decreased gene product level; altered gene pro...,inframe_deletion; splice_region_variant; misse...,undetermined,inferred,,,31028357; 23911551; 21636032; 33831308; 263105...,Cardiac,Expert review done on 05/01/2022; DSC2-related...,2024-03-20 09:35:19+00:00
4,G2P03249,DSG2,125671,3049,CDHF5,DSG2-related arrhythmogenic right ventricular ...,,MONDO:0012434,monoallelic_autosomal,,definitive,decreased gene product level; altered gene pro...,inframe_deletion; missense_variant; stop_gaine...,undetermined,inferred,,,21636032; 33831308; 33917638; 34400560; 240707...,Cardiac,Expert review done on 05/01/2022; DSG2-related...,2024-03-20 09:40:18+00:00


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4714 entries, 0 to 4713
Data columns (total 21 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   g2p_id                              4714 non-null   object             
 1   gene_symbol                         4714 non-null   object             
 2   gene_mim                            4712 non-null   object             
 3   hgnc_id                             4714 non-null   object             
 4   previous_gene_symbols               4241 non-null   object             
 5   disease_name                        4714 non-null   object             
 6   disease_mim                         3574 non-null   object             
 7   disease_MONDO                       638 non-null    object             
 8   allelic_requirement                 4714 non-null   object             
 9   cross_cutting_modifier              629 n

## Checking, removing duplicates

In [6]:
## put into parser (format): DONE

## This is a data-quality check to make sure drop_duplicates actually removes all duplicates. 
## Based on exploring the data, the column subset below should uniquely define one 
##   record's data/row.
## If the de-deplicated data using this column set == de-duplicated data using the whole
##   dataset, then everything is fine and the parser can proceed with de-duplication.
## Else, the data needs to be explored and the parser probably needs adjustments.
## Many column values are delimited strings, and my concern is that these values could
##   differ (only in list order) for the "same data" in different files.

n_duplicates_column_combo = df[df.duplicated(subset=["g2p_id", "gene_symbol", "disease_name", "allelic_requirement", 
                                "molecular_mechanism"], keep=False)].shape

n_duplicates_all_columns = df[df.duplicated(keep=False)].shape

## for testing
# n_duplicates_all_columns = (1, 1)


if n_duplicates_column_combo != n_duplicates_all_columns: 
    raise AssertionError("The data format has changed, and record de-duplication may not work as-expected. " \
                          "Double-check the data and what columns uniquely define one record")

In [7]:
## put into parser (format): DONE

## drop duplicates
df.drop_duplicates(inplace=True, ignore_index=True)

In [8]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3647 entries, 0 to 3646
Data columns (total 21 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   g2p_id                              3647 non-null   object             
 1   gene_symbol                         3647 non-null   object             
 2   gene_mim                            3645 non-null   object             
 3   hgnc_id                             3647 non-null   object             
 4   previous_gene_symbols               3277 non-null   object             
 5   disease_name                        3647 non-null   object             
 6   disease_mim                         2570 non-null   object             
 7   disease_MONDO                       561 non-null    object             
 8   allelic_requirement                 3647 non-null   object             
 9   cross_cutting_modifier              451 n

## Column-level transforms

In [None]:
df_diseasemim = df.copy()

## done to preserve NA
df_diseasemim["disease_mim"] = [i if pd.isna(i) \
                                else "OMIM:" + i if i.isnumeric() \
                                else i \
                                for i in df_diseasemim["disease_mim"]]

df_diseasemim["disease_mim"] = df_diseasemim["disease_mim"].str.replace("Orphanet", "orphanet")

In [None]:
df_diseasemim[df_diseasemim["disease_mim"].str.contains("OMIM:", na=False)].shape

df_diseasemim[df_diseasemim["disease_mim"].str.contains("orphanet:", na=False)].shape

## add up row count. If == num non-null in info above, you're good 
## right now 2570 == 2570, so good

In [9]:
## put into parser (format): DONE

## COLUMN-LEVEL TRANSFORMS

## adding Translator/biolink prefixes to IDs
df["gene_mim"] = "OMIM:" + df["gene_mim"]
df["hgnc_id"] = "HGNC:" + df["hgnc_id"]
df["disease_mim"] = df["disease_mim"].str.replace("Orphanet", "orphanet")
## done to preserve NA
df["disease_mim"] = [i if pd.isna(i) \
                     else "OMIM:" + i if i.isnumeric() \
                     else i \
                     for i in df["disease_mim"]]

## strip whitespace
df["disease_name"] = df["disease_name"].str.strip()
df["comments"] = df["comments"].str.strip()

## create new columns
## UI really wants resource website urls like this. May need to adjust over time as website changes
df["g2p_record_url"] = "https://www.ebi.ac.uk/gene2phenotype/lgd/" +  df["g2p_id"]

## replace panel keywords with full names shown on G2P website for single record
## keeping "Hearing loss" as-is, changing all other values
df["panel"] = df["panel"].str.replace("DD", "Developmental disorders")
df["panel"] = df["panel"].str.replace("Cancer", "Cancer disorders")
df["panel"] = df["panel"].str.replace("Cardiac", "Cardiac disorders")
df["panel"] = df["panel"].str.replace("Eye", "Eye disorders")
df["panel"] = df["panel"].str.replace("Skeletal", "Skeletal disorders")
df["panel"] = df["panel"].str.replace("Skin", "Skin disorders")

In [10]:
## checking on column-level transforms

df.head()
# df["g2p record url"].unique()[0:100]

# df[df["disease mim"].str.contains("orphanet", na=False)]  ## 9 rows, so that's correct
# df[df["panel"].str.contains("Hearing", na=False)]

Unnamed: 0,g2p_id,gene_symbol,gene_mim,hgnc_id,previous_gene_symbols,disease_name,disease_mim,disease_MONDO,allelic_requirement,cross_cutting_modifier,confidence,variant_consequence,variant_types,molecular_mechanism,molecular_mechanism_categorisation,molecular_mechanism_evidence,phenotypes,publications,panel,comments,date_of_last_review,g2p_record_url
0,G2P00124,KCNE1,OMIM:176261,HGNC:6240,ISK; JLNS2; LQT5; MINK,KCNE1-related Jervell and Lange-Nielsen syndrome,OMIM:612347,,biallelic_autosomal,potential secondary finding,strong,altered gene product structure,missense_variant; inframe_deletion; stop_gaine...,undetermined,inferred,,HP:0000407; HP:0001657; HP:0000007; HP:0001279,30461122,Developmental disorders; Cardiac disorders,KCNE1-related JLNS is due to altered gene prod...,2024-04-05 12:05:01+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P00124
1,G2P00841,PTPN11,OMIM:176876,HGNC:9644,BPTP3; NS1; PTP2C; SH-PTP2; SHP-2; SHP2,PTPN11-related Noonan syndrome with multiple l...,OMIM:151100,,monoallelic_autosomal,,definitive,altered gene product structure,missense_variant; inframe_deletion; inframe_in...,undetermined,inferred,,HP:0000325; HP:0002996; HP:0000957; HP:0001709...,27484170; 26377839; 25917897; 25884655; 248207...,Developmental disorders; Skin disorders; Cardi...,Expert review done on 12/01/2022; Noonan syndr...,2025-01-21 14:56:43+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P00841
2,G2P03247,DSC2,OMIM:125645,HGNC:3036,CDHF2; DSC3,DSC2-related arrhythmogenic right ventricular ...,,MONDO:0012506,monoallelic_autosomal,,definitive,decreased gene product level; altered gene pro...,inframe_deletion; splice_region_variant; misse...,undetermined,inferred,,,31028357; 23911551; 21636032; 33831308; 263105...,Cardiac disorders,Expert review done on 05/01/2022; DSC2-related...,2024-03-20 09:36:09+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P03247
3,G2P03248,DSC2,OMIM:125645,HGNC:3036,CDHF2; DSC3,DSC2-related arrhythmogenic right ventricular ...,,MONDO:0012506,biallelic_autosomal,,definitive,decreased gene product level; altered gene pro...,inframe_deletion; splice_region_variant; misse...,undetermined,inferred,,,31028357; 23911551; 21636032; 33831308; 263105...,Cardiac disorders,Expert review done on 05/01/2022; DSC2-related...,2024-03-20 09:35:19+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P03248
4,G2P03249,DSG2,OMIM:125671,HGNC:3049,CDHF5,DSG2-related arrhythmogenic right ventricular ...,,MONDO:0012434,monoallelic_autosomal,,definitive,decreased gene product level; altered gene pro...,inframe_deletion; missense_variant; stop_gaine...,undetermined,inferred,,,21636032; 33831308; 33917638; 34400560; 240707...,Cardiac disorders,Expert review done on 05/01/2022; DSG2-related...,2024-03-20 09:40:18+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P03249


## Generating documents

In [None]:
for row in df.itertuples(index=False):
    if pd.notna(row.previous_gene_symbols):
        [i.strip() for i in row.previous_gene_symbols.split(";")]
        break

In [None]:
## put into parser (format) -> DONE. 
##   don't save in array, yield each document instead

## GENERATING DOCS, saving in array
documents = []

## using itertuples because it's faster, preserves datatypes
for row in df.itertuples(index=False):
    ## simple assignments
    document = {
        "_id": row.g2p_id,
        "subject": {
            "hgnc_symbol": row.gene_symbol,
            "hgnc": row.hgnc_id,
            "type": "Gene"
        },
        "association": {
            "g2p_record_id": row.g2p_id,
            "g2p_record_url": row.g2p_record_url,
            "allelic_requirement": row.allelic_requirement,
            "confidence": row.confidence,
            "molecular_mechanism": row.molecular_mechanism,
            "molecular_mechanism_categorisation": row.molecular_mechanism_categorisation,
            "g2p_panels": [i.strip() for i in row.panel.split(";")],
            "date_of_last_review": str(row.date_of_last_review)
        },
        "object": {
            "name": row.disease_name,
            "type": "Disease"
        }
    }    
    ## only create field if value is not NA
    ##   if value is NA, list comprehension with split won't work
    ## Gene
    if pd.notna(row.gene_mim):
        document["subject"]["omim"] = row.gene_mim
    if pd.notna(row.previous_gene_symbols):
        document["subject"]["previous_gene_symbols"] = \
        [i.strip() for i in row.previous_gene_symbols.split(";")]
        
    ## Association
    if pd.notna(row.cross_cutting_modifier):
        document["association"]["cross_cutting_modifiers"] = [i.strip() for i in row.cross_cutting_modifier.split(";")]
    if pd.notna(row.variant_consequence):
        document["association"]["variant_consequences"] = [i.strip() for i in row.variant_consequence.split(";")]
    if pd.notna(row.variant_types):
        document["association"]["variant_types"] = [i.strip() for i in row.variant_types.split(";")]
    ## uses diff delimiters, could do more parsing
    if pd.notna(row.molecular_mechanism_evidence):
        document["association"]["molecular_mechanism_evidence"] = [i.strip() for i in row.molecular_mechanism_evidence.split("&")]
    if pd.notna(row.phenotypes):
        document["association"]["phenotypes"] = [i.strip() for i in row.phenotypes.split(";")]
    if pd.notna(row.publications):
        document["association"]["pmids"] = [i.strip() for i in row.publications.split(";")]
    if pd.notna(row.comments):
        document["association"]["curator_comments"] = row.comments
     
    ## Disease
    ## disease_mim: create field depending on whether OMIM or orphanet    
    if pd.notna(row.disease_mim):
        if row.disease_mim.startswith("orphanet"):
            document["object"]["orphanet"] = row.disease_mim
        elif row.disease_mim.startswith("OMIM"):
            document["object"]["omim"] = row.disease_mim
    if pd.notna(row.disease_MONDO):
        document["object"]["mondo"] = row.disease_MONDO
    
    documents.append(document)

## Checking documents

In [83]:
df[df["panel"].str.contains("Hearing", na=False)]

## look for single values
# df[~ df["publications"].str.contains(";", na=True)]


# df[df["publications"].isna()]
# df[df["cross_cutting_modifier"].notna()]

Unnamed: 0,g2p_id,gene_symbol,gene_mim,hgnc_id,previous_gene_symbols,disease_name,disease_mim,disease_MONDO,allelic_requirement,cross_cutting_modifier,confidence,variant_consequence,variant_types,molecular_mechanism,molecular_mechanism_categorisation,molecular_mechanism_evidence,phenotypes,publications,panel,comments,date_of_last_review,g2p_record_url
3159,G2P03582,MYO6,OMIM:600970,HGNC:7605,DFNA22; DFNB37; KIAA0389,MYO6-related nonsyndromic genetic hearing loss,,,biallelic_autosomal,,definitive,decreased gene product level; altered gene pro...,missense_variant; stop_gained; frameshift_vari...,undetermined,inferred,,,18348273; 23485424; 25999546; 12687499; 24105371,Hearing loss,,2024-11-28 14:52:17+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P03582
3160,G2P03583,MYO6,OMIM:600970,HGNC:7605,DFNA22; DFNB37; KIAA0389,MYO6-related nonsyndromic genetic hearing loss,,,monoallelic_autosomal,,definitive,decreased gene product level; altered gene pro...,missense_variant; stop_gained; frameshift_vari...,undetermined,inferred,,,18348273; 23485424; 25999546; 24105371,Hearing loss,,2024-11-28 14:47:17+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P03583


In [None]:
pprint(documents[3642])

# documents[416]

## Parser notes

Fine to use raise/assert in parser (raise is technically better programming behavior: https://realpython.com/python-assert-statement/#understanding-common-pitfalls-of-assert)


My notes on parser:
* adding prefixes to gene/disease IDs is good for pre-NodeNorming steps
* keeping diff gene/disease ID namespaces as separate fields right now is good for current BTE/x-bte-annotation system


My notes on syntax:
* use `yield` when you want to "return" within a "for loop" (return only happen once, then exit for-loop/function execution)
  * that's what it's used in main execution, when you're iterating over csv rows to generate documents
* use `yield from {function}` to get the data from a generator (created by `yield` being used the function)

## Pre-NodeNorming

Notes

Querying NodeNorm: better to send unique values from entire column (can do in large batches) -> generate mapping dict to use
* Querying NodeNorm 1-by-1 or 1 row at a time is too slow, and would involve sending duplicate IDs (unless saved dict is kept outside loop and checked) 

Not going to use NameResolver. Not optimistic this would work anyways. My manual process of getting "better" disease IDs is to use the gene IDs, find the diseases they're linked to in OMIM and Monarch, and seeing if those match the data's disease name / phenotypes / pmids. 

In [11]:
## put into parser (format): DONE

import requests

## from BioThings annotator code: for interoperability between diff Python versions
# try:
#     from itertools import batched  # new in Python 3.12
# except ImportError:
#     from itertools import islice

#     def batched(iterable, n):
#         # batched('ABCDEFG', 3) → ABC DEF G
#         if n < 1:
#             raise ValueError("n must be at least one")
#         iterator = iter(iterable)
#         while batch := tuple(islice(iterator, n)):
#             yield batch

## doing to test that this works
from itertools import islice

def batched(iterable, n):
    # batched('ABCDEFG', 3) → ABC DEF G
    if n < 1:
        raise ValueError("n must be at least one")
    iterator = iter(iterable)
    while batch := tuple(islice(iterator, n)):
        yield batch

nodenorm_url = "https://nodenorm.ci.transltr.io/get_normalized_nodes"

In [12]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3647 entries, 0 to 3646
Data columns (total 22 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   g2p_id                              3647 non-null   object             
 1   gene_symbol                         3647 non-null   object             
 2   gene_mim                            3645 non-null   object             
 3   hgnc_id                             3647 non-null   object             
 4   previous_gene_symbols               3277 non-null   object             
 5   disease_name                        3647 non-null   object             
 6   disease_mim                         2570 non-null   object             
 7   disease_MONDO                       561 non-null    object             
 8   allelic_requirement                 3647 non-null   object             
 9   cross_cutting_modifier              451 n

In [13]:
df.head()

Unnamed: 0,g2p_id,gene_symbol,gene_mim,hgnc_id,previous_gene_symbols,disease_name,disease_mim,disease_MONDO,allelic_requirement,cross_cutting_modifier,confidence,variant_consequence,variant_types,molecular_mechanism,molecular_mechanism_categorisation,molecular_mechanism_evidence,phenotypes,publications,panel,comments,date_of_last_review,g2p_record_url
0,G2P00124,KCNE1,OMIM:176261,HGNC:6240,ISK; JLNS2; LQT5; MINK,KCNE1-related Jervell and Lange-Nielsen syndrome,OMIM:612347,,biallelic_autosomal,potential secondary finding,strong,altered gene product structure,missense_variant; inframe_deletion; stop_gaine...,undetermined,inferred,,HP:0000407; HP:0001657; HP:0000007; HP:0001279,30461122,Developmental disorders; Cardiac disorders,KCNE1-related JLNS is due to altered gene prod...,2024-04-05 12:05:01+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P00124
1,G2P00841,PTPN11,OMIM:176876,HGNC:9644,BPTP3; NS1; PTP2C; SH-PTP2; SHP-2; SHP2,PTPN11-related Noonan syndrome with multiple l...,OMIM:151100,,monoallelic_autosomal,,definitive,altered gene product structure,missense_variant; inframe_deletion; inframe_in...,undetermined,inferred,,HP:0000325; HP:0002996; HP:0000957; HP:0001709...,27484170; 26377839; 25917897; 25884655; 248207...,Developmental disorders; Skin disorders; Cardi...,Expert review done on 12/01/2022; Noonan syndr...,2025-01-21 14:56:43+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P00841
2,G2P03247,DSC2,OMIM:125645,HGNC:3036,CDHF2; DSC3,DSC2-related arrhythmogenic right ventricular ...,,MONDO:0012506,monoallelic_autosomal,,definitive,decreased gene product level; altered gene pro...,inframe_deletion; splice_region_variant; misse...,undetermined,inferred,,,31028357; 23911551; 21636032; 33831308; 263105...,Cardiac disorders,Expert review done on 05/01/2022; DSC2-related...,2024-03-20 09:36:09+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P03247
3,G2P03248,DSC2,OMIM:125645,HGNC:3036,CDHF2; DSC3,DSC2-related arrhythmogenic right ventricular ...,,MONDO:0012506,biallelic_autosomal,,definitive,decreased gene product level; altered gene pro...,inframe_deletion; splice_region_variant; misse...,undetermined,inferred,,,31028357; 23911551; 21636032; 33831308; 263105...,Cardiac disorders,Expert review done on 05/01/2022; DSC2-related...,2024-03-20 09:35:19+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P03248
4,G2P03249,DSG2,OMIM:125671,HGNC:3049,CDHF5,DSG2-related arrhythmogenic right ventricular ...,,MONDO:0012434,monoallelic_autosomal,,definitive,decreased gene product level; altered gene pro...,inframe_deletion; missense_variant; stop_gaine...,undetermined,inferred,,,21636032; 33831308; 33917638; 34400560; 240707...,Cardiac disorders,Expert review done on 05/01/2022; DSG2-related...,2024-03-20 09:40:18+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P03249


### Genes

Method #2: build a mapping dict from unique values

#### HGNC IDs

In [14]:
## put into parser (format): DONE

gene_hgnc_curies = df["hgnc_id"].dropna().unique()
len(gene_hgnc_curies)

2991

In [15]:
## put into parser (format): 

gene_nodenorm_mapping = {}

## larger batches are quicker
for batch in batched(gene_hgnc_curies, 1000):
    ## returns tuples
    req_body = {
        "curies": list(batch),
        "conflate": True,
    }
    r = requests.post(nodenorm_url, json=req_body)
    response = r.json()
    
    temp = {
        k: {"primary_id": v["id"]["identifier"],
            "primary_label": v["id"]["label"],
            "type": v["type"][0]} 
        for k,v in response.items()
    }
    gene_nodenorm_mapping.update(temp)

In [16]:
len(gene_nodenorm_mapping)

2991

#### OMIM IDs

In [17]:
gene_omim_curies = df["gene_mim"].dropna().unique()
len(gene_omim_curies)

2989

In [18]:
gene_IDs_no_label = []
gene_IDs_wrong_category = []
gene_IDs_not_recognized = []

In [19]:
## larger batches are quicker
for batch in batched(gene_omim_curies, 1000):
    ## returns tuples
    req_body = {
        "curies": list(batch),
        "conflate": True,
    }
    r = requests.post(nodenorm_url, json=req_body)
    response = r.json()
    
    ## not doing dict comprehension. allows easier review, logic writing
    for k,v in response.items():
        try:
            ## if NodeNorm did not recognize ID, v will be None
            if v is not None:
                ## some IDs aren't Gene, throw those mappings out 
                if v["type"][0] == "biolink:Gene":
                    ## also throw out mapping if no primary label found
                    if v["id"].get("label"):
                        temp = {
                            k: {"primary_id": v["id"]["identifier"],
                                "primary_label": v["id"]["label"]
                               }
                        }
                        gene_nodenorm_mapping.update(temp)
                    else:
                        gene_IDs_no_label.append(k)
                        print(f"{k}: NodeNorm didn't find primary label. Not keeping this mapping.")
                else:
                    gene_IDs_wrong_category.append(k)
                    print(f'{k}: NodeNorm found different category {v["type"][0]}. Not keeping this mapping.')
            else:
                gene_IDs_not_recognized.append(k)
                print(f"{k}: NodeNorm didn't recognize this ID")
        except:
            print(f'Encountered an error processing the NodeNorm response.')
            print(f'NodeNorm response key: {k}')
            print(f'NodeNorm response value: {v}')

In [20]:
## from looking at 2025-03-28 data

response["OMIM:621003"]

KeyError: 'OMIM:621003'

In [21]:
len(gene_nodenorm_mapping)

5980

#### Comparing the two

In [22]:
## are all the NodeNorm main category Gene?
for k, v in gene_nodenorm_mapping.items():
    if v["type"] != "biolink:Gene":
        print(f'{k}: NodeNorm found different category {v["type"]}')
        
## nothing prints, so we're good there

KeyError: 'type'

In [23]:
## look for differences in primary ID found if row has both IDs
for row in df[["gene_mim", "hgnc_id"]].itertuples(index=False):
    if pd.notna(row.gene_mim) and pd.notna(row.hgnc_id):
        if gene_nodenorm_mapping.get(row.gene_mim) and \
        gene_nodenorm_mapping.get(row.hgnc_id):
            if gene_nodenorm_mapping[row.gene_mim]["primary_id"] != \
            gene_nodenorm_mapping[row.hgnc_id]["primary_id"]:
                print(row)

## nothing prints, so there are no mismatches

In [24]:
## look for differences in name between NodeNormed and original data

for row in df[["gene_symbol", "hgnc_id"]].itertuples(index=False):
    if row.gene_symbol != gene_nodenorm_mapping[row.hgnc_id]["primary_label"]:
        print(f"G2P name {row.gene_symbol}, ID {row.hgnc_id}")
        print(f'NodeNorm name {gene_nodenorm_mapping[row.hgnc_id]["primary_label"]}, ID {gene_nodenorm_mapping[row.hgnc_id]["primary_id"]}')
        print("\n")
        
## mismatched names
## NodeNorm is correct for CENPJ -> CPAP and CCDC103 -> DNAAF19
## something is odd for the others (mitochondrial genes) -> messaged NodeNorm

G2P name MT-TP, ID HGNC:7494
NodeNorm name TRNP, ID NCBIGene:4571


G2P name CENPJ, ID HGNC:17272
NodeNorm name CPAP, ID NCBIGene:55835


G2P name CCDC103, ID HGNC:32700
NodeNorm name DNAAF19, ID NCBIGene:388389


G2P name MT-TL1, ID HGNC:7490
NodeNorm name TRNL1, ID NCBIGene:4567


G2P name MT-ND1, ID HGNC:7455
NodeNorm name ND1, ID NCBIGene:4535


G2P name MT-ND4, ID HGNC:7459
NodeNorm name ND4, ID NCBIGene:4538


G2P name MT-ATP6, ID HGNC:7414
NodeNorm name ATP6, ID NCBIGene:4508


G2P name MT-ND5, ID HGNC:7461
NodeNorm name ND5, ID NCBIGene:4540


G2P name MT-ND6, ID HGNC:7462
NodeNorm name ND6, ID NCBIGene:4541




Notes: 
* All IDs were resolved by NodeNorm
* All IDs were resolved as Genes (main category) 
* When rows had both OMIM gene and HGNC IDs, there were no mismatches in resolved NodeNorm entity/primary ID

Plans:
* only use HGNC ID column:
  * no missing values
  * no NodeNorm mismatches using it vs OMIM
  * if there were mismatches, I was going to prefer this mapping over OMIM's anyways
* use mapping dict to create two new columns: gene_nodenorm_id, gene_nodenorm_label

In [None]:
## put into parser (format): 

df["gene_nodenorm_id"] = [gene_nodenorm_mapping[i]["primary_id"] for i in df["hgnc_id"]]
df["gene_nodenorm_label"] = [gene_nodenorm_mapping[i]["primary_label"] for i in df["hgnc_id"]]

In [None]:
## look at the gene ID/name info

gene_mapping_df = df[["gene_symbol", "gene_mim", "hgnc_id", "gene_nodenorm_id", "gene_nodenorm_label"]].copy()
gene_mapping_df[gene_mapping_df["gene_mim"].isna()]

In [None]:
## all NodeNormed IDs NCBIGene?
df[~ df["gene_nodenorm_id"].str.contains("NCBIGene:")]

## empty df: yes

### Diseases

#### OMIM/orphanet

In [25]:
## put into parser (format): 

disease_omim_orpha_curies = df["disease_mim"].dropna().unique()
len(disease_omim_orpha_curies)

2401

In [27]:
## put into parser (format): 

disease_nodenorm_mapping = {}
IDs_not_recognized = []
IDs_wrong_category = []
IDs_no_label = []


## larger batches are quicker
for batch in batched(disease_omim_orpha_curies, 1000):
    ## returns tuples
    req_body = {
        "curies": list(batch),
        "conflate": True,
    }
    r = requests.post(nodenorm_url, json=req_body)
    response = r.json()
    
    ## not doing dict comprehension. allows easier review, logic writing
    for k,v in response.items():
        try:
            ## if NodeNorm did not recognize ID, v will be None
            if v is not None:
                ## some IDs aren't Diseases, throw those mappings out 
                if v["type"][0] == "biolink:Disease":
                    ## also throw out mapping if no primary label found
                    if v["id"].get("label"):
                        temp = {
                            k: {"primary_id": v["id"]["identifier"],
                                "primary_label": v["id"]["label"]
                               }
                        }
                        disease_nodenorm_mapping.update(temp)
                    else:
                        IDs_no_label.append(k)
#                         print(f"{k}: NodeNorm didn't find primary label. Not keeping this mapping.")
                else:
                    IDs_wrong_category.append(k)
#                     print(f'{k}: NodeNorm found different category {v["type"][0]}. Not keeping this mapping.')
            else:
                IDs_not_recognized.append(k)
#                 print(f"{k}: NodeNorm didn't recognize this ID")
        except:
            print(f'Encountered an error processing the NodeNorm response.')
            print(f'NodeNorm response key: {k}')
            print(f'NodeNorm response value: {v}')
## old code: dict comprehension
#         temp = {
#             k: {"primary_id": v["id"]["identifier"],
#                 "primary_label": v["id"]["label"]} 
#             for k,v in response.items()
#             if v is not None if v["type"][0] == "biolink:Disease"
#         }
#         disease_nodenorm_mapping.update(temp)
#     except:
#         print(k)
#         pprint(v)

#### Reviewing OMIM NodeNorm mapping issues

In [28]:
len(disease_nodenorm_mapping)

print("Node Norm mapping issues:\n")
print(f"IDs not recognized: {len(IDs_not_recognized)}")
print(f"IDs wrong category: {len(IDs_wrong_category)}")
print(f"IDs no label: {len(IDs_no_label)}")

sum([len(IDs_not_recognized), len(IDs_wrong_category), len(IDs_no_label)])

print(f"Diff between unique values and mappings: {len(disease_omim_orpha_curies) - len(disease_nodenorm_mapping)}")

2333

Node Norm mapping issues:

IDs not recognized: 39
IDs wrong category: 26
IDs no label: 3


68

Diff between unique values and mappings: 68


39 cases where NodeNorm didn't recognize/resolve ID. I checked a few:
* 5: ID has been replaced/moved to a diff ID (OMIM:607236, OMIM:608890, OMIM:613180, OMIM:300706, OMIM:300141) -> TELL EBI GENE2PHENO
* 3: ID doesn't exist (OMIM:249163, OMIM:319029, OMIM:237145) -> TELL EBI GENE2PHENO
* 1: NodeNorm error - this is a valid disease ID that it should recognize (OMIM:133700) -> messaged NodeNorm
* 1: valid ID, but it doesn't seem to be a disease. There may be better IDs out there (OMIM:601884) -> TELL EBI GENE2PHENO, messaged NodeNorm

26 cases where NodeNorm category was something else (was always Gene instead). I checked all: 
* 25: NodeNorm is correct, this is a gene -> TELL EBI GENE2PHENO
* 1: NodeNorm error - this is a valid disease ID (OMIM:188400) -> messaged NodeNorm

3 cases where NodeNorm didn't have a primary label. I checked all:
* 2: NodeNorm error - these are valid disease IDs with labels (OMIM:620987, OMIM:620964) -> messaged NodeNorm
* 1: valid ID, but it doesn't seem to be a disease. EBI gene2pheno shouldn't use, not sure it should be in NodeNorm -> TELL EBI GENE2PHENO, messaged NodeNorm

In [29]:
# IDs_not_recognized[5:10]
# df[df["disease_mim"] == "OMIM:613180"]

# "OMIM:188400" in IDs_wrong_category

IDs_no_label

['OMIM:300129', 'OMIM:620987', 'OMIM:620964']

No orphanet IDs were flagged but I checked all 9 anyways. The mappings look fine. 

In [None]:
df[df["disease_mim"].str.contains("orphanet", na=False)]

In [None]:
disease_nodenorm_mapping["orphanet:1332"]

Decided not to try using MONDO mappings when OMIM mapping failed. There's only a few cases where IDs have MONDO

IDs_not_recognized (39): none have MONDO
IDs_wrong_category (26): only 2 have MONDO
IDs_no_label (3): none have MONDO

In [146]:
df[df["disease_mim"].isin(IDs_not_recognized) & df["disease_MONDO"].notna()]

Unnamed: 0,g2p_id,gene_symbol,gene_mim,hgnc_id,previous_gene_symbols,disease_name,disease_mim,disease_MONDO,allelic_requirement,cross_cutting_modifier,confidence,variant_consequence,variant_types,molecular_mechanism,molecular_mechanism_categorisation,molecular_mechanism_evidence,phenotypes,publications,panel,comments,date_of_last_review,g2p_record_url


In [None]:
## put into parser (format): 

## using this kind of logic to create columns for function

# [disease_nodenorm_mapping[i]["primary_label"] if disease_nodenorm_mapping.get(i) 
#  else pd.NA for i in df["disease_mim"]]

#### MONDO

In [33]:
## put into parser (format): 

disease_mondo_curies = df["disease_MONDO"].dropna().unique()
len(disease_mondo_curies)

380

In [34]:
## put into parser (format): 

## larger batches are quicker
for batch in batched(disease_mondo_curies, 1000):
    ## returns tuples
    req_body = {
        "curies": list(batch),
        "conflate": True,
    }
    r = requests.post(nodenorm_url, json=req_body)
    response = r.json()
    
    ## not doing dict comprehension. allows easier review, logic writing
    for k,v in response.items():
        try:
            ## if NodeNorm did not resolve ID, v will be None
            if v is not None:
                ## some IDs aren't Diseases o_0, throw them out
                ## also throw out mapping if no primary label found
                if v["type"][0] == "biolink:Disease":
                    if v["id"].get("label"):
                        temp = {
                            k: {"primary_id": v["id"]["identifier"],
                                "primary_label": v["id"]["label"]
                               }
                        }
                        disease_nodenorm_mapping.update(temp)
                    else:
                        print(f"{k}: NodeNorm didn't find primary label")
                else:
                    print(f'{k}: NodeNorm category is {v["type"][0]}')
            else:
                print(f"{k}: NodeNorm didn't resolve")
        except:
            print(k)
            pprint(v)
## old code: dict comprehension
#         temp = {
#             k: {"primary_id": v["id"]["identifier"],
#                 "primary_label": v["id"]["label"]} 
#             for k,v in response.items()
#             if v is not None if v["type"][0] == "biolink:Disease"
#         }
#         disease_nodenorm_mapping.update(temp)
#     except:
#         print(k)
#         pprint(v)

In [35]:
len(disease_nodenorm_mapping)

2713

#### Mismatched NodeNorm mappings

In [36]:
## look for differences in NodeNorm primary ID found if row has both IDs

mismatch_tuples = []

for row in df[["disease_mim", "disease_MONDO"]].itertuples(index=False):
    if pd.notna(row.disease_mim) and pd.notna(row.disease_MONDO):
        if disease_nodenorm_mapping.get(row.disease_mim):
            if disease_nodenorm_mapping[row.disease_mim]["primary_id"] != \
            disease_nodenorm_mapping[row.disease_MONDO]["primary_id"]:
                mismatch_tuples.append((row.disease_mim, row.disease_MONDO))
                
len(mismatch_tuples)
## so...there's some mismatches

22

In [37]:
mismatch_tuples[21]

('OMIM:175800', 'MONDO:0006602')

In [None]:
disease_nodenorm_mapping["OMIM:175800"]
disease_nodenorm_mapping["MONDO:0006602"]

In [None]:
df[df["disease_mim"] == "OMIM:175800"]
df[df["disease_MONDO"] == "MONDO:0006602"]

##### Reviewing the 22 mismatched pairs:


**19: OMIM's mapping is better**

*6: Mondo ID is related but wrong* -> TELL EBI GENE2PHENO
* 'OMIM:243310', 'MONDO:0013812': omim is correct syndrome 1, but mondo is syndrome 2 (diff gene)
* 'OMIM:613575', 'MONDO:0044314': omim is correct 55, but mondo is 78 (diff gene)
* 'OMIM:101000', 'MONDO:0008075': omim is correct type of schwannomatosis (NF2/type 2), vs mondo is a sibling. 
  * NodeNorm should map omim to MONDO:0007039 but isn't -> messaged NodeNorm
* 'OMIM:613987', **'MONDO:0009136'**: omim is correct recessive 2, but mondo is recessive 1 (mondo's Monarch page does link to gene NHP2. But corresponding OMIM:224230 doesn't)
  * NodeNorm should map omim to MONDO:0013519 but isn't -> messaged NodeNorm  
* 'OMIM:613988', 'MONDO:0009136': omim is correct recessive 3, but mondo is recessive 1 (diff gene)
  * NodeNorm should map omim to MONDO:0013520 but isn't -> messaged NodeNorm
* 'OMIM:616353', 'MONDO:0009136': omim is correct recessive 6, but mondo is recessive 1 (diff gene)
  * NodeNorm should map omim to MONDO:0014600 but isn't -> messaged NodeNorm

*13: Mondo ID is too general* -> TELL EBI GENE2PHENO?
* 'OMIM:300696', 'MONDO:0010680': omim actually maps to MONDO:0010401, child of the mondo (can see on Monarch website)
* 'OMIM:304120', 'MONDO:0019027': omim actually maps to MONDO:0010571 (syndrome type 2), child of the mondo (syndrome) (can see on Monarch website)
* 'OMIM:610019', 'MONDO:0005129': omim actually maps to MONDO:0012395 (cataract 18), child of the mondo (cataract) (can see on Monarch website)
* 'OMIM:611726', 'MONDO:0016295': omim actually maps to MONDO:0012721, child of the mondo  (can see on Monarch website)
* 'OMIM:602668', 'MONDO:0016107': omim actually maps to MONDO:0011266 (type 2), child of the mondo (can see on Monarch website)
* 'OMIM:203200', 'MONDO:0018910': omim actually maps to MONDO:0008746 (type 2), child of the mondo (can see on Monarch website)
* 'OMIM:614328', 'MONDO:0017411': omim actually maps to MONDO:0013693 (1), child of the mondo (can see on Monarch website)
* 'OMIM:175800', 'MONDO:0006602': omim actually maps to MONDO:0008290 (1, mibelli type), grandchild of the mondo (can see on Monarch website)
* 'OMIM:614073', **'MONDO:0019312'**: omim actually maps to MONDO:0013556 (syndrome 4), child of the mondo (syndrome) (can see on Monarch website)
* 'OMIM:614074', 'MONDO:0019312': omim actually maps to MONDO:0013557 (syndrome 5), child of the mondo (syndrome) (can see on Monarch website)
* 'OMIM:614075', 'MONDO:0019312': omim actually maps to MONDO:0013558 (syndrome 6), child of the mondo (syndrome) (can see on Monarch website)
* 'OMIM:614076', 'MONDO:0019312': omim actually maps to MONDO:0013559 (syndrome 7), child of the mondo (syndrome) (can see on Monarch website)
* 'OMIM:614077', 'MONDO:0019312': omim actually maps to MONDO:0013560 (syndrome 8), child of the mondo (syndrome) (can see on Monarch website)


**1: MONDO's mapping is better**
* omim ID is slightly off -> TELL EBI GENE2PHENO?
  * 'OMIM:613723', 'MONDO:0009181': mondo matches the disease name and phenotypes listed in the record better than the omim 


**1: Unsure**
* 'OMIM:158350', 'MONDO:0017623': omim is for Cowden syndrome 1, mondo is for PTEN hamartoma tumor syndrome. These are very similar, so I'm not sure which one is better. -> TELL EBI GENE2PHENO?
  * There's also another record w/ just the OMIM ID. I think the two rows should be merged. -> TELL EBI GENE2PHENO?


**1: NodeNorm error -> messaged NodeNorm**
* 'OMIM:224230', 'MONDO:0009136': both are recessive 1, NodeNorm should map to same entity

**Other rows reviewed:**
* 'OMIM:614583', 'MONDO:0013812': map to same entity, both correct

#### Reviewing MONDO IDs for reliability: when there isn't a OMIM ID to use

In [139]:
df_disease_mondo_only = df[df["disease_mim"].isna() & df["disease_MONDO"].notna()].copy()

mondo_only = df_disease_mondo_only["disease_MONDO"].dropna().unique()

In [142]:
df_disease_mondo_only.shape

len(mondo_only)

(444, 22)

277

In [137]:
df_disease_mondo_only[df_disease_mondo_only["disease_MONDO"] == mondo_only[250]]

Unnamed: 0,g2p_id,gene_symbol,gene_mim,hgnc_id,previous_gene_symbols,disease_name,disease_mim,disease_MONDO,allelic_requirement,cross_cutting_modifier,confidence,variant_consequence,variant_types,molecular_mechanism,molecular_mechanism_categorisation,molecular_mechanism_evidence,phenotypes,publications,panel,comments,date_of_last_review,g2p_record_url
3433,G2P02161,MAFB,OMIM:608968,HGNC:6408,KRML,MAFB-related Duane retraction syndrome,,MONDO:0007473,monoallelic_autosomal,,strong,absent gene product,,loss of function,inferred,,,,Eye disorders,,2017-08-29 09:09:13+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P02161
3643,G2P03089,MAFB,OMIM:608968,HGNC:6408,KRML,MAFB-related Duane retraction syndrome,,MONDO:0007473,monoallelic_autosomal,restricted mutation set,strong,altered gene product structure,,dominant negative,inferred,,,,Eye disorders,,2021-01-12 15:01:40+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P03089


In [133]:
df_disease_mondo_only[df_disease_mondo_only["panel"].str.contains("Skeletal", na=False)]

Unnamed: 0,g2p_id,gene_symbol,gene_mim,hgnc_id,previous_gene_symbols,disease_name,disease_mim,disease_MONDO,allelic_requirement,cross_cutting_modifier,confidence,variant_consequence,variant_types,molecular_mechanism,molecular_mechanism_categorisation,molecular_mechanism_evidence,phenotypes,publications,panel,comments,date_of_last_review,g2p_record_url
82,G2P00021,COL1A1,OMIM:120150,HGNC:2197,OI4,COL1A1-related osteogenesis imperfecta spectrum,,,monoallelic_autosomal,restricted mutation set,definitive,altered gene product structure,,dominant negative,inferred,,HP:0003502; HP:0000707; HP:0000325; HP:0000923...,9295084; 3082886; 18409203; 2295701; 1988452; ...,Developmental disorders; Skin disorders; Skele...,,2025-01-15 11:51:09+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P00021
218,G2P00872,PIK3CA,OMIM:171834,HGNC:8975,PI3K,PIK3CA-related overgrowth spectrum disorder wi...,,,monoallelic_autosomal,typically mosaic; restricted mutation set,definitive,altered gene product structure,,gain of function,inferred,,HP:0008678; HP:0010301; HP:0009748; HP:0001355...,22658544; 22729224,Developmental disorders; Skeletal disorders,,2024-12-11 11:40:22+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P00872
399,G2P02554,PRRX1,OMIM:167420,HGNC:9142,PHOX1; PMX1,PRRX1-related craniosynostosis,,,monoallelic_autosomal,,moderate,decreased gene product level; altered gene pro...,frameshift_variant; missense_variant; stop_gained,undetermined,inferred,,HP:0001363,37154149,Developmental disorders; Skeletal disorders,,2024-03-22 10:30:40+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P02554
400,G2P02609,MYH3,OMIM:160720,HGNC:7573,HEMHC; MYHC-EMB; MYHSE1; SMHCE,MYH3-related spondylocarpotarsal synostosis sy...,,,biallelic_autosomal,,strong,absent gene product,,loss of function,inferred,,,29805041,Developmental disorders; Skeletal disorders,,2025-01-28 18:17:32+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P02609
413,G2P03497,POP1,OMIM:602486,HGNC:30129,,POP1-related anauxetic dysplasia,,MONDO:0011773,biallelic_autosomal,,definitive,decreased gene product level,,undetermined,inferred,,,21455487; 27380734; 28067412,Skeletal disorders,,2023-12-20 09:04:04+00:00,https://www.ebi.ac.uk/gene2phenotype/lgd/G2P03497


Look at database subset that doesn't have disease OMIM ID: 
there are 277 unique MONDO IDs for 444 rows.

Reviewed 29 unique MONDO IDs (>10%), from all panels (only 1 skeletal, no hearing) -> (3 + 0-240 sets of 10 idx, plus skeletal)

Reviewed 37 rows: 6 were wrong (~16%) 

18 Great:
* "MONDO:0012506" for "DSC2-related arrhythmogenic right ventricular cardiomyopathy"
* "MONDO:0011001" for "SCN5A-related Brugada syndrome"
* "MONDO:0013262" for "MYH7-related dilated cardiomyopathy"
* "MONDO:0013369" for "TNNI3-related hypertrophic cardiomyopathy"
* "MONDO:0010946" for "PRKAG2-related cardiomyopathy"
* "MONDO:0014143" for "RIT1-related Noonan syndrome"
* "MONDO:0010015" for "PXDN-related anterior segment dysgenesis with sclerocornea"
* "MONDO:0014214" for "DYNC2I1-related short-rib polydactyly"
* "MONDO:0013522" for "TINF2-related dyskeratosis congenita"
* "MONDO:0032876" for "WASF1-related intellectual disability with seizures"
* "MONDO:0859164" for "UNC45A-related osteootohepatoenteric syndrome"
* "MONDO:0018772" for "SLC30A7-related Joubert syndrome": using general term is fine since there isn't any established subtype of Joubert syndrome for this gene
* "MONDO:0010215" for "ERCC4-related xeroderma pigmentosum, group F"
* "MONDO:0009735" for "SPINK5-related Netherton syndrome"
* "MONDO:0007808" for "KRT1-related ichthyosis hystrix, Curth-Macklin type"
* "MONDO:0007566" for "TGFBR1-related multiple self-healing squamous epithelioma"
* "MONDO:0008285" for "PDGFRA-related gastrointestinal stromal tumor/GIST-plus syndrome, somatic or familial"
* "MONDO:0010912" for "TUBB3-related fibrosis of extraocular muscles, congenital"



5 Okay (using general term is fine):  
* "MONDO:0005129" for 3 other rows "WDR87-related congenital cataract", "AKR1E2-", "MFSD6L-": couldn't find better mappings. 
* "MONDO:0015469" for "DHRS3 related craniosynostosis": couldn't find better mapping
* "MONDO:0024676" (childhood kidney Wilms tumor) for "CTR9-related Wilms tumour", "TRIM28-": couldn't find better mapping. TRIM28 is correlated to parent term (kidney Wilms tumor). 


4 Unsure:
* "MONDO:0005129" for "CYP51A1-related congenital cataract": mondo is cataract, which is not wrong but kinda general. MONDO:0033853 seems better (correlated with gene, matches phenotypes, orphanet ref uses one of the ref papers) -> TELL EBI GENE2PHENO?
* "MONDO:0018869" for "TMTC3-related cobblestone lissencephaly": while the mondo (cobblestone lissencephaly) sounds correct, it isn't linked to this gene. VS another sibling disease is linked to the gene, matches phenotypes, uses same paper: MONDO:0014992/OMIM:617255 (lissencephaly 8)
* "MONDO:0100100" for "SELENON-related myopathy": while mondo has exact name match, it's not directly linked to gene. Instead, its child disease is directly linked to gene MONDO:0011271/OMIM:602771 (rigid spine muscular dystrophy 1)
* "MONDO:0020367" for "MYOC-related juvenile open angle glaucoma": while mondo is almost-exact name match, it's not directly linked to gene. Instead, its child disease is directly linked to gene MONDO:0007664/OMIM:137750 (glaucoma 1, open angle, A) 



4 MONDO is too general:
* "MONDO:0020341" (periventricular nodular heterotopia) for "ERMARD-related periventricular heterotopia". The ERMARD-specific version is a child term: MONDO:0014240/OMIM:615544 (type 6)
* "MONDO:0018965" (Alport syndrome) for "COL4A5-related Alport syndrome". The COL4A5-specific version is a child term: MONDO:0010520/OMIM:301050  (X-linked)
* "MONDO:0024676" for "REST-related Wilms tumour": The REST-specific version is a related term: MONDO:0014779/OMIM:616806 (type 6)
* "MONDO:0011773" for "POP1-related anauxetic dysplasia": the POP1-specific version is a child term: MONDO:0054561/OMIM:617396 (type 2)


6 MONDO is related but wrong:
* "MONDO:0009136" for "RTEL1-related dyskeratosis congenita" (two rows): mondo is recessive 1, which is wrong. Should be recessive 5 MONDO:0014076/OMIM:615190 (old/synonym name is dominant 4) -> TELL EBI GENE2PHENO?
* "MONDO:0044314" for 4 rows "CLN3-related retinal dystrophy", "GUCA1B-", "PRPS1-", "SNRNP200-": mondo is type 78 (specifically for ARHGEF18), which is wrong. Should instead be:
  * CLN3 and PRPS1: a more general term like MONDO:0004580 (retinal degeneration) -> MONDO:0019118 (inherited retinal dystrophy) -> MONDO:0019200 (retinitis pigmentosa)
  * GUCA1B: type 48, MONDO:0013447
  * SNRNP200: type 33, MONDO:0012477
* "MONDO:0013522" for "TERC-related dyskeratosis congenita": mondo is for type 3 (specifically for TINF2, see above in "Great" section). Should be type 1 MONDO:0007485/OMIM:127550. (Monarch's page of type 1 includes TINF2 and TERT too, but OMIM page only includes TERC)

#### Disease decisions so far

* use OMIM IDs
  * NodeNorm mapping failed for a few
* if row/record has both IDs, use OMIM ID




Larger picture:

* OMIM: small percentage affected. (2401 - 9 orphanet = 2392 unique OMIM disease IDs)
  * ~2.8% = 68/2392 NodeNorm mapping failed 
  * plus 1 used but unreliable ("MONDO's mapping is better" mismatch categories)

Current plan:

* use logic in omim section for printing issues with mapping, only saving some mappings
* only create nodenorm mapping if it existed. Saving input ID that was used for mapping.


In [None]:
df[(df["disease_mim"].isna()) and (df['disease_MONDO'].notna())]

### older code chunks

#### Method 1 (row-wise, will send duplicates)

In [None]:
node_normed_gene_id = []
node_normed_gene_name = []
tally = 0

for row in df.itertuples(index=False):
    ## collect non-NA gene IDs for a row
    gene_curies = [i for i in [row.gene_mim, row.hgnc_id] if pd.notna(i)]
    
    ## gene_curies aren't empty
    if gene_curies:
        req_body = {
            "curies": gene_curies,
            "conflate": True
        }
        
        r = requests.post(nodenorm_url, json=req_body)
        response = r.json()
        
        if len(gene_curies) == 2:
            if response[gene_curies[0]]["id"]["identifier"] != \
            response[gene_curies[1]]["id"]["identifier"]:
                print(gene_curies)
            ## take HGNC ID always
            node_normed_gene_id.append(response[gene_curies[1]]["id"]["identifier"])
            node_normed_gene_name.append(response[gene_curies[1]]["id"]["label"])
        else:
            ## HGNC ID should be first
            node_normed_gene_id.append(response[gene_curies[0]]["id"]["identifier"])
            node_normed_gene_name.append(response[gene_curies[0]]["id"]["label"])
    
    tally += 1
    if tally % 10 == 0:
        print(tally)

In [None]:
## collect non-NA gene IDs for a row

row_gene_curies = [i for i in [df.loc[0, "gene_mim"], df.loc[0, "hgnc_id"]] if pd.notna(i)]

In [None]:
row_gene_curies

In [None]:
## aka not empty
if row_curies:
    parameters = {
        ## use gene curies from 1 row
        "curie": row_gene_curies,
        "conflate": True
    }
    r = requests.get(nodenorm_url, params=parameters)
    response = r.json()

In [None]:
response.keys()

In [None]:
if response[row_gene_curies[0]]["id"]["identifier"] != \
   response[row_gene_curies[1]]["id"]["identifier"]:
    print(row_gene_curies)

In [None]:
## NodeNormed primary/canonical ID and name
response["OMIM:176261"]["id"]
response["HGNC:6240"]["id"]

In [None]:
if response[row_omim]:
    print("yay")

In [None]:
## doesn't include NA by default
df["gene_symbol"].nunique()
df["gene_mim"].nunique()
df["hgnc_id"].nunique()

## ...so some repetitiveness to doing things row-wise. 