In [54]:
"""Create regional constraint TSV ready for export to Excel."""

import pandas as pd

import src

# Input tables, all tab-separated. The paths are relative, so they resolve
# against the current working directory of the kernel/process.
FILE_REGIONAL_CONSTRAINT = "data/final/regional_nonsense_constraint.tsv"
FILE_GENE_IDS = "data/interim/gene_ids.tsv"
FILE_SHET = "data/raw/genebayes_shet_estimates.tsv"
FILE_OMIM = "data/interim/genemap2_simple.tsv"
# Destination of the wide, one-row-per-transcript table produced by main().
FILE_OUT = "data/final/regional_nonsense_constraint_for_excel_wide.tsv"

# Logger object exposed by the project-local `src` package.
logger = src.logger


def read_constraint(path=FILE_REGIONAL_CONSTRAINT):
    """Load the regional constraint table from a tab-separated file.

    Logs the number of rows and the number of distinct ``enst`` values.

    Args:
        path: Location of the TSV file; defaults to FILE_REGIONAL_CONSTRAINT.

    Returns:
        pandas.DataFrame holding the file contents unchanged.
    """
    constraint = pd.read_csv(path, sep="\t")

    n_entries = len(constraint)
    logger.info(f"Constraint entries: {n_entries}")

    n_transcripts = constraint.enst.nunique()
    logger.info(f"Unique ENSTs in constraint: {n_transcripts}")

    return constraint


def read_gene_ids(path=FILE_GENE_IDS):
    """Load the gene identifier table from a tab-separated file.

    The columns are labelled ``ensg``, ``enst`` and ``symbol`` via the
    ``names`` argument of ``read_csv``. Logs the table shape and the number
    of distinct ``ensg`` and ``symbol`` values.

    Args:
        path: Location of the TSV file; defaults to FILE_GENE_IDS.

    Returns:
        pandas.DataFrame with columns ``ensg``, ``enst``, ``symbol``.
    """
    column_names = ["ensg", "enst", "symbol"]
    gene_ids = pd.read_csv(path, sep="\t", names=column_names)

    table_shape = gene_ids.shape
    logger.info(f"Gene IDs shape: {table_shape}")

    n_genes = gene_ids.ensg.nunique()
    logger.info(f"Unique ENSGs: {n_genes}")

    n_symbols = gene_ids.symbol.nunique()
    logger.info(f"Unique symbols: {n_symbols}")

    return gene_ids


def read_shet(path=FILE_SHET):
    """Load per-gene shet estimates, keeping only the posterior mean.

    Reads the ``ensg`` and ``post_mean`` columns from a tab-separated file
    and exposes ``post_mean`` under the name ``shet_post_mean``. Logs the
    table shape and the number of distinct ``ensg`` values.

    Args:
        path: Location of the TSV file; defaults to FILE_SHET.

    Returns:
        pandas.DataFrame with columns ``ensg`` and ``shet_post_mean``, in
        that order.
    """
    df = pd.read_csv(path, sep="\t", usecols=["ensg", "post_mean"])

    # Rename by label rather than relabelling by position: `usecols` selects
    # columns but does not control their order (the file's order is kept), so
    # a positional relabel would silently swap the two columns whenever
    # `post_mean` precedes `ensg` in the file.
    df = df.rename(columns={"post_mean": "shet_post_mean"})

    # Pin the output column order regardless of the order in the file.
    df = df[["ensg", "shet_post_mean"]]

    logger.info(f"shet shape: {df.shape}")
    logger.info(f"Unique ENSGs in shet: {df.ensg.nunique()}")
    return df


def read_omim(path=FILE_OMIM):
    """Load the OMIM annotation table from a tab-separated file.

    Logs the table shape and the number of distinct ``ensg`` values.

    Args:
        path: Location of the TSV file; defaults to FILE_OMIM.

    Returns:
        pandas.DataFrame holding the file contents unchanged.
    """
    omim = pd.read_csv(path, sep="\t")

    table_shape = omim.shape
    logger.info(f"OMIM data shape: {table_shape}")

    n_genes = omim.ensg.nunique()
    logger.info(f"Unique ENSGs in OMIM data: {n_genes}")

    return omim


def implode_omim(df):
    """Collapse OMIM annotations to a single row per ``ensg``.

    Within each ``ensg`` group the ``phenotype`` values are concatenated
    with ``";"`` into ``omim_phenotype`` and the ``inheritance`` values into
    ``omim_inheritance``. Logs the resulting shape and the number of
    distinct ``ensg`` values.

    Args:
        df: DataFrame with ``ensg``, ``phenotype`` and ``inheritance``
            columns. The values are passed to ``str.join``, which only
            accepts strings.

    Returns:
        pandas.DataFrame with columns ``ensg``, ``omim_phenotype`` and
        ``omim_inheritance``.
    """
    by_gene = df.groupby("ensg")
    joined = by_gene.agg(
        omim_phenotype=("phenotype", ";".join),
        omim_inheritance=("inheritance", ";".join),
    )
    # Move ``ensg`` back from the group index into an ordinary column.
    imploded = joined.reset_index()

    logger.info(f"Shape: {imploded.shape}")
    logger.info(f"Unique ENSGs: {imploded.ensg.nunique()}")

    return imploded


def merge_gene_ids(left, right):
    """Inner-join the constraint table with the gene identifier table.

    No ``on`` argument is given, so pandas joins on the columns the two
    frames have in common. ``validate="many_to_one"`` makes pandas raise if
    the join keys are not unique in ``right``.

    Logs the merged shape, the number of distinct ``enst`` values, the count
    of rows duplicated on (``enst``, ``region``), and the symbols that appear
    more than once for the same ``region``.

    Args:
        left: DataFrame of constraint results; must end up with ``enst``,
            ``region`` and ``symbol`` columns after the merge.
        right: DataFrame of gene identifiers, unique on the join keys.

    Returns:
        pandas.DataFrame containing only rows matched in both inputs.
    """
    df = left.merge(right, how="inner", validate="many_to_one")
    logger.info(f"Merged data shape: {df.shape}")
    logger.info(f"Unique ENSTs after merge: {df.enst.nunique()}")
    # The check is keyed on the transcript ID, so the label says ENST
    # (it previously said ENSG, which did not match the columns tested).
    logger.info(
        f"Duplicates by ENST and region: {df.duplicated(['enst','region']).sum()}"
    )
    # A symbol repeated within one region means that symbol is attached to
    # more than one row (and hence more than one identifier) in the merge.
    logger.info(
        f"Symbols with multiple ENSGs: "
        f"{df[df.duplicated(['symbol','region'])].symbol.unique()}"
    )
    return df


def merge_shet(left, right):
    """Left-join shet scores onto the annotated constraint table.

    No ``on`` argument is given, so pandas joins on the columns the two
    frames have in common. ``validate="many_to_one"`` makes pandas raise if
    the join keys are not unique in ``right``. Rows of ``left`` without a
    match are kept with a missing ``shet_post_mean``.

    Logs the merged shape, the number of distinct ``ensg`` values, and how
    many distinct ``ensg`` values carry a non-missing ``shet_post_mean``.

    Args:
        left: DataFrame containing an ``ensg`` column.
        right: DataFrame with ``shet_post_mean``, unique on the join keys.

    Returns:
        pandas.DataFrame with every row of ``left`` plus the shet column.
    """
    merged = left.merge(right, how="left", validate="many_to_one")

    logger.info(f"Merged data shape: {merged.shape}")
    logger.info(f"Unique ENSGs after merge: {merged.ensg.nunique()}")

    # Keep one row per gene, then count the genes with a non-missing score.
    one_row_per_gene = merged[["ensg", "shet_post_mean"]].drop_duplicates("ensg")
    n_scored = one_row_per_gene.shet_post_mean.count()
    logger.info(
        f"ENSGs with shet scores: "
        f"{n_scored}"
    )

    return merged


def merge_omim(left, right):
    """Left-join the per-gene OMIM annotations onto the table.

    No ``on`` argument is given, so pandas joins on the columns the two
    frames have in common. ``validate="many_to_one"`` makes pandas raise if
    the join keys are not unique in ``right``. Rows of ``left`` without a
    match are kept with missing OMIM columns.

    Logs the merged shape, the number of distinct ``ensg`` values, and the
    number of distinct (``ensg``, ``omim_inheritance``) pairs whose
    ``omim_inheritance`` is not missing.

    Args:
        left: DataFrame containing an ``ensg`` column.
        right: DataFrame with ``omim_inheritance``, unique on the join keys.

    Returns:
        pandas.DataFrame with every row of ``left`` plus the OMIM columns.
    """
    merged = left.merge(right, how="left", validate="many_to_one")

    logger.info(f"Merged data shape: {merged.shape}")
    logger.info(f"Unique ENSGs after merge: {merged.ensg.nunique()}")

    # De-duplicate on both columns, then count the non-missing annotations.
    distinct_pairs = merged[["ensg", "omim_inheritance"]].drop_duplicates()
    n_annotated = distinct_pairs.omim_inheritance.count()
    logger.info(
        f"ENSGs with OMIM annotations: {n_annotated}"
    )

    return merged


def reorder_columns(df):
    """Select the export columns in their final order.

    Any column of ``df`` that is not listed here is dropped; a listed column
    that is missing from ``df`` makes pandas raise ``KeyError``.

    Args:
        df: DataFrame containing at least the columns listed below.

    Returns:
        pandas.DataFrame restricted to, and ordered by, the listed columns.
    """
    identifiers = ["symbol", "ensg", "enst", "region", "csq"]
    counts_and_proportions = ["n_pos", "n_obs", "n_exp", "prop_obs", "prop_exp"]
    regional_statistics = ["oe", "oe_ci_hi", "p", "fdr_p", "syn_p", "constraint"]
    gene_level_annotations = [
        "pli",
        "loeuf",
        "gnomad_flags",
        "shet_post_mean",
        "omim_phenotype",
        "omim_inheritance",
    ]

    export_order = (
        identifiers
        + counts_and_proportions
        + regional_statistics
        + gene_level_annotations
    )
    return df[export_order]


def pivot_regions(df):
    """Reshape the long per-region table into one row per transcript.

    Steps, in order:
      1. Convert ``region`` to an ordered categorical.
      2. Pivot with the gene/transcript-level columns as the row index and
         ``region`` as the column key; every remaining column becomes a
         value column under each region.
      3. Swap the two column levels.
      4. Sort the columns by the ``region`` level, descending, using a
         stable sort.
      5. Flatten the two-level column index with ``src.flatten_columns``
         (project helper, not shown here) and restore the row index as
         ordinary columns.

    Args:
        df: Long-format DataFrame with a ``region`` column and the index
            columns listed below.

    Returns:
        pandas.DataFrame in wide format.
    """
    row_keys = [
        "symbol",
        "ensg",
        "enst",
        "csq",
        "pli",
        "loeuf",
        "gnomad_flags",
        "shet_post_mean",
        "omim_phenotype",
        "omim_inheritance",
    ]

    categorical_region = df.assign(
        region=lambda x: pd.Categorical(x["region"], ordered=True)
    )
    wide = categorical_region.pivot(index=row_keys, columns="region")
    region_outermost = wide.swaplevel(axis=1)
    region_sorted = region_outermost.sort_values(
        "region", axis=1, kind="stable", ascending=False
    )
    flattened = src.flatten_columns(region_sorted)

    return flattened.reset_index()


def write_out(df, path=FILE_OUT):
    """Write the table to a tab-separated file and pass it through.

    Args:
        df: DataFrame to write; the index is not written.
        path: Destination file; defaults to FILE_OUT.

    Returns:
        The same ``df`` that was passed in, so the call can sit at the end
        of a ``.pipe`` chain.
    """
    # Log the path actually written to, not the module default, so the
    # message stays correct when a caller overrides `path`.
    logger.info(f"Writing to {path}")
    df.to_csv(path, sep="\t", index=False)
    return df


def main():
    """Run as script.

    Reads the four input tables, joins gene identifiers, shet scores and
    per-gene OMIM annotations onto the constraint table, reshapes the result
    to wide format and writes it to the default output path.

    Returns:
        The wide pandas.DataFrame that was written.
    """
    # Load inputs; OMIM is collapsed to one row per gene before merging.
    constraint = read_constraint()
    gene_ids = read_gene_ids()
    shet = read_shet()
    omim = implode_omim(read_omim())

    # Annotate the constraint table step by step.
    annotated = merge_gene_ids(constraint, gene_ids)
    annotated = merge_shet(annotated, shet)
    annotated = merge_omim(annotated, omim)

    # Fix the column order, go wide, then persist.
    ordered = reorder_columns(annotated)
    wide = pivot_regions(ordered)

    return write_out(wide)

In [55]:
# Run the pipeline; as the cell's last expression, the returned DataFrame is
# rendered as the cell output.
main()

Unnamed: 0,symbol,ensg,enst,csq,pli,loeuf,gnomad_flags,shet_post_mean,omim_phenotype,omim_inheritance,...,distal_nmd_n_obs,distal_nmd_n_exp,distal_nmd_prop_obs,distal_nmd_prop_exp,distal_nmd_oe,distal_nmd_oe_ci_hi,distal_nmd_p,distal_nmd_fdr_p,distal_nmd_syn_p,distal_nmd_constraint
0,A1BG,ENSG00000121410,ENST00000263100,stop_gained,1.712900e-16,1.342,[],0.001046,,,...,1.0,0.891221,0.200000,0.178244,1.122056,3.688245,0.788634,0.933139,0.094062,unconstrained
1,A1CF,ENSG00000148584,ENST00000373997,stop_gained,7.432800e-10,0.825,[],0.017065,,,...,3.0,3.636467,0.142857,0.173165,0.824977,1.901140,0.493509,0.760591,0.499340,unconstrained
2,A2M,ENSG00000175899,ENST00000318602,stop_gained,9.133100e-20,0.766,[],0.008673,,,...,1.0,0.924889,0.142857,0.132127,1.081211,3.940930,0.770878,0.924931,0.627012,unconstrained
3,A2ML1,ENSG00000166535,ENST00000299698,stop_gained,1.573400e-40,0.954,[],0.000688,,,...,0.0,1.138807,0.000000,0.162687,0.000000,2.140086,0.254166,0.541479,0.762868,indeterminate
4,A3GALT2,ENSG00000184389,ENST00000442999,stop_gained,1.007500e-06,1.530,"[""outlier_mis""]",0.001287,,,...,27.0,17.133257,0.355263,0.225438,1.575883,2.019578,0.997801,1.000000,0.996520,unconstrained
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18364,ZYG11A,ENSG00000203995,ENST00000371528,stop_gained,7.166000e-14,1.013,[],0.001005,,,...,3.0,6.040575,0.083333,0.167794,0.496641,1.200998,0.108653,0.328732,0.476621,unconstrained
18365,ZYG11B,ENSG00000162378,ENST00000294353,stop_gained,6.554500e-01,0.537,[],0.136814,,,...,4.0,6.125010,0.108108,0.165541,0.653060,1.392665,0.225969,0.508262,0.605044,unconstrained
18366,ZYX,ENSG00000159840,ENST00000322764,stop_gained,1.554800e-08,0.864,[],0.008858,,,...,3.0,2.460270,0.166667,0.136682,1.219378,2.755886,0.794000,0.935394,0.001771,unconstrained
18367,ZZEF1,ENSG00000074755,ENST00000381638,stop_gained,2.819800e-09,0.517,[],0.055615,,,...,0.0,2.354559,0.000000,0.196213,0.000000,1.125929,0.061346,0.227600,0.929483,indeterminate
