# Region lengths
This script produces summary statistics describing the length and count of NMD regions, stratified by constraint.

## Imports

In [1]:
import numpy as np
import pandas as pd

## Read data

In [2]:
# Load the per-region constraint labels.
# Every column is read as a pandas categorical.
constraint = pd.read_csv(
    filepath_or_buffer="../outputs/constrained_regions_labels.tsv",
    dtype="category",
    sep="\t",
)

In [3]:
# NMD annotations: keep only the transcript ID and its NMD region label.
# Columns are renamed by *name* rather than by position, because
# `usecols` returns columns in file order, not in the order listed here.
nmd_sites = pd.read_csv(
    "../outputs/nmd_annotations.tsv",
    sep="\t",
    usecols=["transcript_id", "nmd_definitive"],
    dtype="category",
).rename(columns={"transcript_id": "enst", "nmd_definitive": "region"})

# Size of each region = number of annotation rows per (transcript, region)
# (presumably one row per CDS position -- confirm against the annotation step).
# `observed=True` restricts the result to combinations that actually occur,
# so no zero-sized regions are produced and the outcome does not depend on
# the pandas-version-specific default for categorical group keys.
region_sizes = (
    nmd_sites.groupby(["enst", "region"], observed=True)
    .size()
    .reset_index(name="size")
)

# Size of the whole CDS = sum of the region sizes within each transcript.
# Transcripts without any labelled row are absent here (no zero-length entry).
cds_sizes = (
    region_sizes.groupby("enst", observed=True)["size"]
    .sum()
    .reset_index()
    .assign(region="transcript")
    .astype({"region": "category"})
)

# Append the per-transcript CDS rows to the per-region rows
nmd = pd.concat([region_sizes, cds_sizes])

## Combine annotations

In [4]:
# Merge NMD and constraint annotations. No join keys are given, so pandas
# joins on every column name the two frames share; the left join keeps every
# NMD row, leaving `constraint` missing where there is no matching label.
merged = nmd.merge(constraint, how="left")

# Label missing constraints *before* casting to str. Matching the literal
# string "nan" after the cast only works on pandas versions where
# `astype(str)` stringifies missing values; filling first works everywhere.
constraint_labels = merged["constraint"].astype(object)
constraint_labels = constraint_labels.where(
    constraint_labels.notna(), "indeterminate"
)

df = merged.astype({"enst": str, "region": str}).assign(
    constraint=constraint_labels.astype(str)
)

## Summary statistics
Produce summary statistics and write them to the `../outputs/` directory.

### Region sizes

In [5]:
# Per-transcript, per-region sizes with their constraint label, written as-is.
region_size = df.copy()
region_size.to_csv("../outputs/stats_region_size.tsv", sep="\t", index=False)

# Show a few rows; the fixed seed keeps the preview identical across re-runs.
region_size.sample(5, random_state=0)

Unnamed: 0,enst,region,size,constraint
31180,ENST00000642050,distal_nmd,170,unconstrained
14426,ENST00000412566,start_proximal,150,unconstrained
46710,ENST00000543494,nmd_target,18,indeterminate
21831,ENST00000396958,start_proximal,150,unconstrained
66356,ENST00000611815,transcript,2121,indeterminate


### Overall genomic footprint

In [6]:
# Total size summed over all rows for each (region, constraint) pair
footprint_by_group = df.groupby(["region", "constraint"])["size"].sum()
region_footprint = footprint_by_group.reset_index(name="size")

region_footprint.to_csv(
    "../outputs/stats_region_footprint.tsv",
    sep="\t",
    index=False,
)
region_footprint

Unnamed: 0,region,constraint,size
0,distal_nmd,constrained,745153
1,distal_nmd,indeterminate,2657958
2,distal_nmd,unconstrained,4115545
3,long_exon,constrained,725644
4,long_exon,indeterminate,1184147
5,long_exon,unconstrained,796707
6,nmd_target,constrained,4584703
7,nmd_target,indeterminate,10435976
8,nmd_target,unconstrained,5942643
9,start_proximal,indeterminate,1350201


### Region counts

In [7]:
# Number of rows with a non-missing size for each (region, constraint) pair
count_by_group = df.groupby(["region", "constraint"])["size"].count()
region_count = count_by_group.reset_index(name="n")

region_count.to_csv(
    "../outputs/stats_region_count.tsv",
    sep="\t",
    index=False,
)
region_count

Unnamed: 0,region,constraint,n
0,distal_nmd,constrained,548
1,distal_nmd,indeterminate,7739
2,distal_nmd,unconstrained,11233
3,long_exon,constrained,388
4,long_exon,indeterminate,1733
5,long_exon,unconstrained,1519
6,nmd_target,constrained,2380
7,nmd_target,indeterminate,7119
8,nmd_target,unconstrained,7300
9,start_proximal,indeterminate,9031
