# Region lengths
This script produces summary statistics describing the length and count of NMD regions, stratified by constraint.

## Imports

In [1]:
import numpy as np
import pandas as pd

## Read data

In [2]:
# Constraint data
constraint = pd.read_csv(
    "../outputs/constrained_regions_labels.tsv", sep="\t", dtype="category"
)

In [3]:
# NMD annotations
nmd = pd.read_csv(
    "../outputs/nmd_annotations.tsv",
    sep="\t",
    usecols=["transcript_id", "nmd_definitive"],
    dtype="category",
).set_axis(["enst", "region"], axis=1)

# Get the size of each region
nmd = nmd.groupby(["enst", "region"])["region"].count().rename("size").reset_index()

# Drop regions with size 0
nmd = nmd[nmd["size"] != 0]

# Get the size of the whole CDS
cds = (
    nmd.groupby("enst")["size"]
    .sum()
    .reset_index()
    .assign(region="transcript")
    .astype({"region": "category"})
)

# Append CDS data to NMD data
nmd = pd.concat([nmd, cds])

## Combine annotations

In [4]:
# Merge NMD and constraint annotations
df = nmd.merge(constraint, how="left").astype(
    {"enst": str, "region": str, "constraint": str}
)

# Fill NaN values in constraint column
df["constraint"] = df["constraint"].replace("nan", "indeterminate")

## Summary statistics
Produce summary statistics and write to output

### Region sizes

In [5]:
region_size = df.copy()
region_size.to_csv("../outputs/stats_region_size.tsv", sep="\t", index=False)
region_size.sample(5)

Unnamed: 0,enst,region,size,constraint
27011,ENST00000334181,start_proximal,150,unconstrained
45687,ENST00000421673,start_proximal,150,unconstrained
74533,ENST00000362057,transcript,2721,indeterminate
50431,ENST00000616417,distal_nmd,92,indeterminate
45976,ENST00000682019,distal_nmd,222,unconstrained


In [6]:
# Create a multi index for later re-indexing
index = pd.MultiIndex.from_product(
    [df["region"].unique(), df["constraint"].unique()], names=["region", "constraint"]
)

### Overall genomic footprint

In [7]:
region_footprint = (
    df.groupby(
        ["region", "constraint"],
    )["size"]
    .sum()
    .rename("size")
    .reindex(index)
    .reset_index()
).fillna(0)
region_footprint.to_csv("../outputs/stats_region_footprint.tsv", sep="\t", index=False)
region_footprint

Unnamed: 0,region,constraint,size
0,distal_nmd,unconstrained,4115545.0
1,distal_nmd,indeterminate,2657958.0
2,distal_nmd,constrained,745153.0
3,nmd_target,unconstrained,5942643.0
4,nmd_target,indeterminate,10435976.0
5,nmd_target,constrained,4584703.0
6,start_proximal,unconstrained,1639294.0
7,start_proximal,indeterminate,1350201.0
8,start_proximal,constrained,0.0
9,long_exon,unconstrained,796707.0


### Region counts

In [8]:
region_count = (
    df.groupby(["region", "constraint"])["size"]
    .count()
    .rename("n")
    .reindex(index)
    .reset_index()
    .fillna(0)
)
region_count.to_csv("../outputs/stats_region_count.tsv", sep="\t", index=False)
region_count

Unnamed: 0,region,constraint,n
0,distal_nmd,unconstrained,11233.0
1,distal_nmd,indeterminate,7739.0
2,distal_nmd,constrained,548.0
3,nmd_target,unconstrained,7300.0
4,nmd_target,indeterminate,7119.0
5,nmd_target,constrained,2380.0
6,start_proximal,unconstrained,10933.0
7,start_proximal,indeterminate,9031.0
8,start_proximal,constrained,0.0
9,long_exon,unconstrained,1519.0
