# OMIM annotation of constrained genes

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Column names for genemap2.txt. The file is read with header=None, so these
# names are assigned to the columns positionally, in this order.
gm_columns = [
    "chr", "start", "end", "cl", "ccl",
    "mim", "symbol", "gene_name", "approved_symbol",
    "entrez", "ensg", "comment", "phenotype", "mouse",
]

# Read the tab-separated gene map, keeping only the columns named in `usecols`.
# NOTE(review): per the pandas docs, comment="#" skips lines that start with
# "#" but also truncates a data line at any "#" found inside a field --
# confirm that no retained field in genemap2.txt contains "#".
gm = pd.read_csv(
    "../data/genemap2.txt",
    names=gm_columns,
    header=None,
    usecols=["ensg", "approved_symbol", "mim", "phenotype"],
    sep="\t",
    comment="#",
)

In [3]:
# A single "phenotype" cell may hold several ";"-separated entries: split the
# cell and give every entry its own row (explode renumbers the index 0..n-1).
# Rows with a missing `ensg` or `phenotype` are then removed; dropna keeps the
# surviving index labels, so the index may contain gaps afterwards.
gm = (
    gm.assign(phenotype=gm["phenotype"].str.split(";"))
    .explode("phenotype", ignore_index=True)
    .dropna(subset=["ensg", "phenotype"])
)

In [4]:
# "Long" phenotype entry: "<text>, <six digits> (<digit>)[, <trailing text>]"
#   group 1 = text before the six-digit number
#   group 2 = the six-digit number
#   group 3 = the single digit in parentheses
#   group 4 = "" or ", <trailing text>";  group 5 = the trailing text (or None)
re_long = r"^(.*),\s(\d{6})\s\((\d)\)(|, (.*))$"
# "Short" phenotype entry, without the six-digit number:
#   group 1 = text before the parenthesised digit
#   group 2 = the single digit in parentheses
#   group 3 = "" or ", <trailing text>";  group 4 = the trailing text (or None)
re_short = r"^(.*)\((\d)\)(|, (.*))$"


def match_re(string, _re):
    """Match a whitespace-stripped string against a regular expression.

    Parameters
    ----------
    string : str
        Text to test; leading and trailing whitespace is removed first.
    _re : str
        Pattern handed to ``re.match`` (anchored at the start of the text).

    Returns
    -------
    re.Match or None
        The match object, or ``None`` when the pattern does not match.
    """
    return re.match(_re, string.strip())

In [5]:
# Match every phenotype entry against the long pattern. Entries that do not
# match yield None and are dropped, so the remaining index labels identify the
# rows of `gm` that matched. The "phenotype" column holds the match objects;
# "phen" is capture group 1 and "inheritance" is capture group 5 (None when
# the entry has no trailing text).
pheno_long = (
    gm["phenotype"]
    .apply(match_re, _re=re_long)
    .dropna()
    .to_frame()
    .assign(
        phen=lambda df: df["phenotype"].apply(lambda m: m.group(1)),
        inheritance=lambda df: df["phenotype"].apply(lambda m: m.group(5)),
    )
)

In [6]:
# Entries that did not match the long pattern are retried with the short one.
# As on the long path, entries that match neither pattern yield None and are
# dropped, so calling .group() below cannot fail on a missing match.
pheno_short = (
    gm.loc[~gm.index.isin(pheno_long.index), "phenotype"]
    .apply(match_re, _re=re_short)
    .dropna()
    .to_frame()
)
# Fix: these columns were previously re-assigned on `pheno_long` (copy-paste),
# leaving `pheno_short` without "phen"/"inheritance".
# re_short has one capture group fewer than re_long, so the optional trailing
# text is group 4 (there is no group 5). Group 1 is stripped because re_short
# does not consume the whitespace before the parenthesised digit.
pheno_short["phen"] = pheno_short.phenotype.apply(lambda x: x.group(1).strip())
pheno_short["inheritance"] = pheno_short.phenotype.apply(lambda x: x.group(4))

print(f"Number of gene-phenotype pairs: {len(gm)}")
print(f"Number of long phenotype entries: {len(pheno_long)}")
print(f"Number of short phenotype entries: {len(pheno_short)}")

Number of gene-phenotype pairs: 7420
Number of long phenotype entries: 7315
Number of short phenotype entries: 105
