# OMIM annotation of constrained genes

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Column names for genemap2.txt, in file order. The names are supplied
# explicitly because the file is read with header=None.
GM_COLUMNS = (
    "chr",
    "start",
    "end",
    "cyto_loc",
    "calc_cyto_loc",
    "mim",
    "symbol",
    "gene_name",
    "approved_symbol",
    "entrez",
    "ensg",
    "comment",
    "phenotype_string",
    "mouse",
)


def read_gm(path, names=None):
    """Read a genemap2.txt file into memory.

    Parameters
    ----------
    path : str or path-like
        Location of the tab-separated genemap2.txt file.
    names : sequence of str, optional
        Column names to assign to the table. Defaults to ``GM_COLUMNS`` so the
        function no longer depends on a ``gm_columns`` global being defined
        before it is called.

    Returns
    -------
    pandas.DataFrame
        One row per data line in the file. Text following a ``#`` is treated
        as a comment by the parser and is not read.
    """
    if names is None:
        names = GM_COLUMNS
    return pd.read_csv(
        path,
        sep="\t",
        comment="#",
        header=None,
        names=list(names),
    )

In [3]:
# A single row can list several phenotypes in one ";"-separated string:
# split the string and give every gene-phenotype pair its own row.
# NOTE(review): `gm` must already hold the table loaded with read_gm();
# no visible cell assigns it, so confirm where it is created.
gm["phenotype_string"] = gm.phenotype_string.str.split(";")
gm = gm.explode("phenotype_string", ignore_index=True)
gm = gm.dropna(subset="phenotype_string")  # Drop entries with no associated phenotype

print(f"Number of gene-phenotype pairs: {len(gm)}")

Number of gene-phenotype pairs: 8628


In [4]:
# Long form: "<phenotype>, <6-digit number> (<digit>)" with an optional ", <inheritance>" tail.
re_long = r"^(.*),\s(\d{6})\s\((\d)\)(|, (.*))$"
# Short form: "<phenotype>(<digit>)" with the same optional tail, but no 6-digit number.
re_short = r"^(.*)\((\d)\)(|, (.*))$"


def match_re(string, _re=re_long):
    """Match a phenotype string against a pattern, ignoring surrounding whitespace.

    Parameters
    ----------
    string : str
        Raw phenotype string; leading and trailing whitespace is stripped
        before matching.
    _re : str
        Regular expression to apply. Defaults to ``re_long``.

    Returns
    -------
    re.Match or None
        The match object, or ``None`` when the pattern does not match.
    """
    pattern = re.compile(_re)
    trimmed = string.strip()
    return pattern.match(trimmed)

In [5]:
# Try the long pattern on every phenotype string. Strings that do not match
# yield None and are dropped, leaving one match object per remaining row.
pheno_long = (
    gm["phenotype_string"]
    .apply(match_re, _re=re_long)
    .dropna()
    .to_frame()
)

print(f"Number of long phenotype entries: {len(pheno_long)}")

Number of long phenotype entries: 8516


In [6]:
# Unpack the match objects: group 1 is the phenotype name, group 5 the
# inheritance text (None when the entry has no inheritance tail).
long_matches = pheno_long["phenotype_string"]
pheno_long["phenotype"] = long_matches.apply(lambda m: m.group(1))
inheritance_text = long_matches.apply(lambda m: m.group(5))
# An entry may list several modes separated by ", "; give each its own row.
pheno_long["inheritance"] = inheritance_text.str.split(", ")
pheno_long = pheno_long.drop(columns="phenotype_string").explode("inheritance")

n_missing_inheritance = pheno_long["inheritance"].isna().sum()
print(f"Number of long phenotypes with distinct inheritance: {len(pheno_long)}")
print(
    f"Number of phenotypes lacking an inheritance annotation: {n_missing_inheritance}"
)

Number of long phenotypes with distinct inheritance: 8883
Number of phenotypes lacking an inheritance annotation: 1309


In [7]:
# Every phenotype string the long pattern did not match is parsed with the
# short pattern instead.
pheno_short = (
    gm.loc[~gm.index.isin(pheno_long.index), "phenotype_string"]
    .apply(match_re, _re=re_short)
    .to_frame()
)

print(f"Number of short phenotype entries: {len(pheno_short)}")

# Group 1 of the short pattern keeps the whitespace that precedes "(n)", so
# strip it to get a clean phenotype name.
pheno_short["phenotype"] = pheno_short.phenotype_string.apply(
    lambda x: x.group(1).strip()
)
# Group 4 is the inheritance text itself. Group 3 is the whole optional tail
# and would include the leading ", " separator whenever an inheritance is
# present. Missing inheritance is stored as NaN.
pheno_short["inheritance"] = pheno_short.phenotype_string.apply(
    lambda x: x.group(4) or np.nan
)
pheno_short = pheno_short.drop("phenotype_string", axis=1)

print(
    f"Number of short entries lacking an inheritance annotation {pheno_short.inheritance.isna().sum()}"
)

# Any remaining empty strings (e.g. an empty phenotype name) become NaN.
pheno_short = pheno_short.replace("", np.nan)

Number of short phenotype entries: 112
Number of short entries lacking an inheritance annotation 112


Entries with short phenotype strings (in `pheno_short`, above) are always missing an inheritance annotation. This is true even where the inheritance is explicitly given in the phenotype name (e.g. "Deafness, autosomal recessive" is a phenotype, but has no associated inheritance annotation).

This is a striking limitation of the OMIM data.

In [8]:
# Stack the long- and short-format phenotype tables row-wise; index labels are kept as they are.
pheno = pd.concat(objs=[pheno_long, pheno_short], axis=0)

In [9]:
# The first character of a phenotype name marks its category; count each one.
prefix_labels = {"[": "non-disease", "{": "susceptibility", "?": "provisional"}
for prefix, label in prefix_labels.items():
    n_with_prefix = pheno["phenotype"].str.startswith(prefix).sum()
    print(f"Number of {label} phenotypes {n_with_prefix}")

print("\nValue counts of inheritance modes:")
pheno["inheritance"].value_counts()

Number of non-disease phenotypes 159
Number of susceptibility phenotypes 794
Number of provisional phenotypes 645

Value counts of inheritance modes:


Autosomal recessive          3610
Autosomal dominant           2758
X-linked recessive            220
X-linked                       76
Somatic mutation               74
X-linked dominant              70
Multifactorial                 52
Digenic dominant               18
Digenic recessive              17
Isolated cases                  8
Somatic mosaicism               5
?Autosomal dominant             4
Y-linked                        3
Pseudoautosomal dominant        2
Pseudoautosomal recessive       2
Mitochondrial                   1
Name: inheritance, dtype: int64

There are more gene-phenotype entries annotated as autosomal recessive (3610) than as autosomal dominant (2758).

The X-linked disorders are split between recessive (220), dominant (70), and entries labelled only "X-linked" (76).

There is only one disorder with mitochondrial inheritance!

In [None]:
def main():
    """Sketch of an entry point: set the genemap2 path and column names, then load the table.

    NOTE(review): this function appears to be unfinished.
    - ``gm_columns`` is assigned as a local variable here and is never passed
      to ``read_gm``.
    - ``DataFrame.pipe`` requires a callable as its first argument, so calling
      it with no arguments raises ``TypeError``.
    - Nothing is returned, and ``main`` is not called in the visible notebook.
    """
    gm_path = "../data/genemap2.txt"
    # Column names, in file order, for the table that read_gm parses with header=None.
    gm_columns = [
        "chr",
        "start",
        "end",
        "cyto_loc",
        "calc_cyto_loc",
        "mim",
        "symbol",
        "gene_name",
        "approved_symbol",
        "entrez",
        "ensg",
        "comment",
        "phenotype_string",
        "mouse",
    ]
    # TODO(review): supply the function that should be piped, or drop the call.
    read_gm(gm_path).pipe()