# Label constrained regions
This script labels the NMD regions within each transcript with a constraint annotation

In [1]:
import numpy as np
import pandas as pd

In [2]:
def read_constraint_stats(path):
    """Load the per-region constraint statistics from a tab-separated file.

    Only the columns used later in this notebook are read:
    region, enst, csq, n_obs, oe, z, p and fdr_p.

    Parameters
    ----------
    path : str or path-like
        Location of the tab-separated statistics file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the input file, restricted to the columns above.
    """

    wanted_columns = ["region", "enst", "csq", "n_obs", "oe", "z", "p", "fdr_p"]

    return pd.read_csv(path, sep="\t", usecols=wanted_columns)

In [3]:
def pivot_constraint_stats(df):
    """Reshape the long-format statistics so each (region, enst) pair is one row.

    The values in the "csq" column become column labels, giving one column per
    consequence / statistic combination (for example the synonymous "z" and the
    nonsense "oe" of the same transcript region end up side by side).

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with "region", "enst", "csq" and the statistic
        columns "n_obs", "oe", "z", "p" and "fdr_p".

    Returns
    -------
    pandas.DataFrame
        Wide frame with two-level columns ordered (csq, statistic); "region" and
        "enst" are restored as ordinary columns.
    """

    statistic_columns = ["n_obs", "oe", "z", "p", "fdr_p"]

    wide = df.pivot(
        index=["region", "enst"],
        columns="csq",
        values=statistic_columns,
    )

    # pivot() yields (statistic, csq) column labels; swap them to (csq, statistic)
    wide = wide.swaplevel(axis=1)

    # Turn the (region, enst) index back into regular columns
    return wide.reset_index(drop=False)

In [4]:
def flatten_multiindex_column(df, sep="_"):
    """Collapse multi-level column labels into single strings.

    The labels of each level are joined with ``sep``; any leading or trailing
    ``sep`` characters are then stripped (so a label whose second level is the
    empty string becomes just the first level).

    Note that the columns of ``df`` are replaced in place and the same object
    is returned.
    """

    flat_names = []
    for level_labels in df.columns.values:
        joined = sep.join(level_labels)
        flat_names.append(joined.strip(sep))

    df.columns = flat_names

    return df

In [5]:
def keep_constraint_columns(df):
    """Select the subset of columns used for the constraint labelling step.

    Returns a frame holding, in this order: region, enst, nonsense_n_obs,
    nonsense_oe, synonymous_z, nonsense_p and nonsense_fdr_p. A ``KeyError`` is
    raised by pandas if any of them is absent from ``df``.
    """

    identifier_columns = ["region", "enst"]
    statistic_columns = [
        "nonsense_n_obs",
        "nonsense_oe",
        "synonymous_z",
        "nonsense_p",
        "nonsense_fdr_p",
    ]

    return df[identifier_columns + statistic_columns]

In [6]:
def filter_constrained_regions(
    df,
    max_nonsense_oe=0.35,
    min_synonymous_z=-1,
    max_nonsense_fdr_p=0.05,
    min_nonsense_p=0.05,
    min_nonsense_n_obs=1,
):
    """Label each region / transcript as "constrained" or "unconstrained".

    A row is labelled "constrained" when all of the following hold:
    ``nonsense_oe < max_nonsense_oe``, ``synonymous_z > min_synonymous_z`` and
    ``nonsense_fdr_p < max_nonsense_fdr_p``.

    A row is labelled "unconstrained" when both of the following hold:
    ``nonsense_p >= min_nonsense_p`` and ``nonsense_n_obs >= min_nonsense_n_obs``.

    Rows meeting neither set of criteria (including rows whose statistics are
    missing) receive no label and are dropped. A summary of label counts per
    region is printed.

    The input frame is not modified; labels are built on a separate Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "region", "enst", "nonsense_n_obs", "nonsense_oe",
        "synonymous_z", "nonsense_p" and "nonsense_fdr_p".
    max_nonsense_oe, min_synonymous_z, max_nonsense_fdr_p : float
        Thresholds for the "constrained" label (defaults 0.35, -1, 0.05).
    min_nonsense_p, min_nonsense_n_obs : float
        Thresholds for the "unconstrained" label (defaults 0.05, 1).

    Returns
    -------
    pandas.DataFrame
        Columns "region", "enst" and "constraint", containing only labelled
        rows; the original row index is preserved.
    """

    # Comparisons against NaN evaluate to False, so rows with missing
    # statistics never satisfy either set of criteria.
    is_constrained = (
        (df["nonsense_oe"] < max_nonsense_oe)
        & (df["synonymous_z"] > min_synonymous_z)
        & (df["nonsense_fdr_p"] < max_nonsense_fdr_p)
    )
    is_unconstrained = (df["nonsense_p"] >= min_nonsense_p) & (
        df["nonsense_n_obs"] >= min_nonsense_n_obs
    )

    # Build the labels separately instead of writing a new column into `df`,
    # which may be a slice of another frame (SettingWithCopyWarning) and
    # belongs to the caller.
    labels = pd.Series(index=df.index, dtype="object")
    labels.loc[is_constrained] = "constrained"
    # Applied last, so "unconstrained" wins if a row ever satisfies both sets
    labels.loc[is_unconstrained] = "unconstrained"

    # Keep the identifier columns plus the label; drop rows without a
    # definitive constraint annotation
    labelled = df[["region", "enst"]].assign(constraint=labels).dropna()

    # Print summary statistics
    print("Constrained region value counts:")
    print(labelled.groupby("constraint").region.value_counts())

    return labelled

In [7]:
def write_constraint_labels_to_output(df, path):
    """Write the labelled regions to ``path`` as a tab-separated file.

    The row index is not written. The input frame is returned unchanged so the
    function can sit at the end of a ``.pipe`` chain.
    """

    df.to_csv(path_or_buf=path, sep="\t", index=False)

    return df

In [8]:
if __name__ == "__main__":
    in_file_path = "../outputs/expected_variants_all_regions_no_cpg_stats.tsv"
    out_file_path = "../outputs/constrained_regions_labels.tsv"

    # Load the long-format statistics
    df = read_constraint_stats(in_file_path)

    # Reshape to one row per (region, enst) with flat "<csq>_<statistic>" columns
    df = pivot_constraint_stats(df)
    df = flatten_multiindex_column(df)

    # Keep the statistics needed for labelling, then assign the labels
    df = keep_constraint_columns(df)
    df = filter_constrained_regions(df)

    # Save the labelled regions
    df = write_constraint_labels_to_output(df, out_file_path)

Constrained region value counts:
constraint     region        
constrained    transcript         3538
               nmd_target         2380
               distal_nmd          548
               long_exon           388
unconstrained  distal_nmd        11233
               start_proximal    10933
               transcript         8880
               nmd_target         7300
               long_exon          1519
Name: region, dtype: int64


## Missing transcripts and regions
Some transcripts and regions are missing a constraint annotation. There are three possibilities:
- The region does not exist (e.g. transcripts without a long exon >400nt in length)
- No qualifying variants were identified in UKB, likely due to capture / coverage issues. These transcripts are excluded from the analysis.
- The region does not meet the criteria to be classified as either constrained or unconstrained; it is intermediate between the two categories.