# Label constrained regions
This notebook labels the NMD regions within each transcript with a constraint annotation ("constrained" or "unconstrained").

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the per-region constraint statistics (long format: one row per consequence)
stats_columns = ["region", "enst", "csq", "n_obs", "oe", "z", "p", "fdr_p"]
stats_long = pd.read_csv(
    "../outputs/expected_variants_all_regions_no_cpg_stats.tsv",
    sep="\t",
    usecols=stats_columns,
)

# Reshape to wide format: one row per (region, transcript), one column per
# (statistic, consequence) pair. Statistics for several consequences are needed
# side by side later on, e.g. synonymous Z-scores when filtering on nonsense O/E.
stats_wide = stats_long.pivot(
    index=["region", "enst"],
    columns="csq",
    values=["n_obs", "oe", "z", "p", "fdr_p"],
)

# Put the consequence on the outer column level, then turn the
# (region, enst) index back into ordinary columns.
df = stats_wide.swaplevel(axis=1).reset_index(drop=False)
df.head(3)

csq,region,enst,missense,nonsense,synonymous,missense,nonsense,synonymous,missense,nonsense,synonymous,missense,nonsense,synonymous,missense,nonsense,synonymous
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,n_obs,n_obs,n_obs,oe,oe,oe,z,z,z,p,p,p,fdr_p,fdr_p,fdr_p
0,distal_nmd,ENST00000000233,19.0,0.0,13.0,0.701366,0.0,1.008534,-1.639154,-1.731266,0.031739,0.050591,0.041702,0.51266,0.127855,0.145396,0.981847
1,distal_nmd,ENST00000000412,49.0,0.0,12.0,1.339896,0.0,0.819672,2.165101,-1.523933,-0.740937,0.98481,0.063763,0.229366,1.0,0.184955,0.772735
2,distal_nmd,ENST00000000442,56.0,0.0,28.0,0.869565,0.0,0.896287,-1.104962,-1.698114,-0.623648,0.134588,0.044743,0.266429,0.271585,0.15148,0.808838


In [4]:
# Find constrained and unconstrained regions
#
# NOTE: this cell overwrites `df` (flattened columns, then a reduced frame), so
# the loading cell must be re-run before this cell can be run again.

## Thresholds used to assign the constraint labels
MAX_NONSENSE_OE = 0.35  # "constrained": nonsense O/E must be below this
MIN_SYNONYMOUS_Z = -1  # "constrained": synonymous Z-score must be above this
ALPHA = 0.05  # cut-off applied to the nonsense P value and FDR-adjusted P value
MIN_NONSENSE_OBS = 1  # "unconstrained": at least this many observed nonsense variants

## The columns are a multi-index which need to be merged.
## Index columns have an empty second level, hence the strip("_").
df.columns = ["_".join(x).strip("_") for x in df.columns.values]

## Keep only the relevant columns.
## An explicit copy is taken so that the .loc assignments below write to an
## independent frame and do not raise SettingWithCopyWarning.
df = df[
    [
        "region",
        "enst",
        "nonsense_n_obs",
        "nonsense_oe",
        "synonymous_z",
        "nonsense_p",
        "nonsense_fdr_p",
    ]
].copy()

## Filter for constrained and unconstrained regions / transcripts.
## Comparisons against NaN evaluate to False, so rows with missing statistics
## match neither set of criteria.
low_nonsense_oe = df["nonsense_oe"] < MAX_NONSENSE_OE
synonymous_not_depleted = df["synonymous_z"] > MIN_SYNONYMOUS_Z
nonsense_fdr_significant = df["nonsense_fdr_p"] < ALPHA

nonsense_not_significant = df["nonsense_p"] >= ALPHA
has_nonsense_obs = df["nonsense_n_obs"] >= MIN_NONSENSE_OBS

is_constrained = low_nonsense_oe & synonymous_not_depleted & nonsense_fdr_significant
is_unconstrained = nonsense_not_significant & has_nonsense_obs

## "unconstrained" is assigned second, so it would win if a row matched both masks.
df.loc[is_constrained, "constraint"] = "constrained"
df.loc[is_unconstrained, "constraint"] = "unconstrained"

## Drop irrelevant columns, and regions which lack definitive constraint annotations
df = df[["region", "enst", "constraint"]].dropna()

df.head(3)

Unnamed: 0,region,enst,constraint
3,distal_nmd,ENST00000001008,unconstrained
5,distal_nmd,ENST00000002125,unconstrained
6,distal_nmd,ENST00000002165,unconstrained


## Missing transcripts and regions
Some transcripts and regions are missing a constraint annotation. There are three possibilities:
- The region does not exist (e.g. transcripts without a long exon >400nt in length)
- No qualifying variants were identified in UKB, likely due to capture / coverage issues. These transcripts are excluded from the analysis.
- The region does not meet the criteria to be classified as either constrained or unconstrained; they are intermediate between the two categories.

In [15]:
# Write the labelled regions to disk as a tab-separated file, without the row index
output_path = "../outputs/constrained_regions_labels.tsv"
df.to_csv(output_path, sep="\t", index=False)