# Merge annotations
Merge our NMD annotations with external annotations (phyloP, pext, and HMC) and with the regional constraint annotations.

## Preliminaries

In [None]:
%%bash
# Download the input tables from UKB RAP into ../outputs/:
# NMD annotations, phyloP scores, pext scores, HMC scores, and gene/transcript IDs.
#
# `set -e` aborts the cell on the first failed download. Without it the cell only
# reports the exit status of the last command, so an earlier failure would go
# unnoticed and a stale or missing file would be picked up further down.
#
# NOTE(review): ../outputs/expected_variants_all_regions_no_cpg_stats.tsv is read
# later in this notebook but is not downloaded here -- presumably it is produced
# locally by an earlier step; confirm.
set -e
dx download -f -o ../outputs/ outputs/nmd_annotations.tsv
dx download -f -o ../outputs/ outputs/phylop_all_sites.tsv
dx download -f -o ../outputs/ outputs/pext_38.bed
dx download -f -o ../outputs/ outputs/hmc_38.tsv
dx download -f -o ../outputs/ outputs/gene_ids.tsv

In [None]:
# Import relevant modules
import numpy as np
import pandas as pd

## Get annotations

### NMD annotations

In [None]:
# Load the per-site NMD annotations: coordinates, transcript and the
# definitive NMD call are the only columns read from the file
nmd = pd.read_csv(
    "../outputs/nmd_annotations.tsv",
    sep="\t",
    usecols=["chr", "pos", "transcript_id", "nmd_definitive"],
)

# Switch to the column names used by the other tables in this notebook
# ("enst" for the transcript, "region" for the NMD call)
nmd = nmd.rename(columns={"transcript_id": "enst", "nmd_definitive": "region"})

# Report how many sites were loaded
print(f"Sites with an NMD annotation: {len(nmd)}")

### phyloP annotations

In [None]:
# Load the phyloP scores; all columns are kept. The file is tab-separated,
# which is the default delimiter of read_table.
phylop = pd.read_table("../outputs/phylop_all_sites.tsv")

# Report how many sites were loaded
print(f"Sites with a phyloP annotation: {len(phylop)}")

### pext annotations

In [None]:
# Load the pext scores from the headerless BED file. The "start" column is
# named only so the remaining columns line up; it is not loaded.
pext = pd.read_csv(
    "../outputs/pext_38.bed",
    sep="\t",
    header=None,
    names=["chr", "start", "end", "ensg", "pext"],
    usecols=["chr", "end", "ensg", "pext"],
)

# The interval end is used as the site position
pext = pext.rename(columns={"end": "pos"})

# First collapse rows that are exact repeats of each other ...
pext = pext.drop_duplicates()

# ... then discard every site / gene pair that still has more than one row,
# i.e. pairs with conflicting pext values (keep=False retains none of them)
pext = pext.drop_duplicates(subset=["chr", "pos", "ensg"], keep=False)

print(f"Valid pext annotations: {len(pext)}")

In [None]:
# Load the gene / transcript identifier table. The file's own header row is
# replaced by our column names, and the third ("hgnc") column is not loaded.
ids = pd.read_csv(
    "../outputs/gene_ids.tsv",
    sep="\t",
    header=0,
    names=["ensg", "enst", "hgnc"],
    usecols=["ensg", "enst"],
)

# Keep only the text before the first "." in each identifier (this removes a
# version suffix if one is present), then drop repeated gene / transcript pairs
ids = ids.assign(
    ensg=ids["ensg"].str.split(".").str[0],
    enst=ids["enst"].str.split(".").str[0],
).drop_duplicates()

# Attach a transcript ID to each pext row through the shared gene ID. The inner
# join drops pext rows whose gene is absent from the identifier table; the gene
# ID itself is not needed afterwards.
pext = pext.merge(ids, how="inner", on="ensg").drop(columns="ensg")
print(f"Valid pext annotations in genes with a MANE transcript: {len(pext)}")

### HMC annotations

In [None]:
# Load the headerless HMC table (chromosome, position, score)
hmc = pd.read_csv(
    "../outputs/hmc_38.tsv",
    sep="\t",
    header=None,
    names=["chr", "pos", "hmc"],
)

# Sort so that the lowest HMC score (most constrained) at each site comes first,
# then keep only that first row per site
hmc = hmc.sort_values(by=["chr", "pos", "hmc"]).drop_duplicates(
    subset=["chr", "pos"], keep="first"
)

print(f"Number of HMC annotations: {len(hmc)}")

### Constraint annotations

In [None]:
# Load the constraint statistics in long format: one set of statistics per
# region / transcript / consequence
constraint = pd.read_csv(
    "../outputs/expected_variants_all_regions_no_cpg_stats.tsv",
    sep="\t",
    usecols=["region", "enst", "csq", "n_obs", "oe", "z", "p", "fdr_p"],
)

# Reshape to wide format so that every consequence's statistics sit on the same
# row (the synonymous Z-score, for example, is needed for later filtering).
# pivot raises if a region / transcript / consequence combination is repeated.
constraint = constraint.pivot(
    index=["region", "enst"],
    columns="csq",
    values=["n_obs", "oe", "z", "p", "fdr_p"],
)

# Order the two column levels as (consequence, statistic) and turn the
# region / transcript index back into ordinary columns
constraint = constraint.swaplevel(0, 1, axis=1).reset_index()

In [None]:
# Find constrained and unconstrained regions

## The columns are a two-level MultiIndex (consequence, statistic) which needs to
## be flattened into single "<consequence>_<statistic>" names. The "region" and
## "enst" columns have an empty second level, so the trailing "_" is stripped.
constraint.columns = [
    "_".join(levels).strip("_") for levels in constraint.columns.values
]

## Keep only the relevant columns. The explicit copy makes this an independent
## frame, so adding the "constraint" column below does not raise pandas'
## SettingWithCopyWarning.
constraint = constraint[
    [
        "region",
        "enst",
        "nonsense_n_obs",
        "nonsense_oe",
        "synonymous_z",
        "nonsense_p",
        "nonsense_fdr_p",
    ]
].copy()

## Filter for constrained and unconstrained regions / transcripts
## Constrained: nonsense_oe below 0.35, synonymous_z above -1, and
## nonsense_fdr_p below 0.05 (all three must hold)
m1 = constraint["nonsense_oe"] < 0.35
m2 = constraint["synonymous_z"] > -1
m3 = constraint["nonsense_fdr_p"] < 0.05

## Unconstrained: nonsense_p of at least 0.05 and at least one observed
## nonsense variant (both must hold)
m4 = constraint["nonsense_p"] >= 0.05
m5 = constraint["nonsense_n_obs"] >= 1

## Rows matching neither definition are left without a label (missing value).
## The "unconstrained" label is written second, so it would win if a row ever
## matched both definitions.
constraint.loc[m1 & m2 & m3, "constraint"] = "constrained"
constraint.loc[m4 & m5, "constraint"] = "unconstrained"

## Drop irrelevant columns
constraint = constraint[["region", "enst", "constraint"]]

## Print the counts of constrained and unconstrained regions
print(constraint.groupby(["region"])["constraint"].value_counts())

## Merge annotations

In [None]:
# Each merge below is a left join on the columns the two frames share, so every
# NMD-annotated site is kept and unmatched sites get a missing value. The row
# count is printed after each step to show whether a merge changed it.

# Add phyloP scores to the NMD annotations
df = pd.merge(nmd, phylop, how="left")
print(f"Sites after merging NMD and phyloP annotations: {len(df)}")
print(f"Sites with a phyloP annotation: {len(df) - df.phylop.isna().sum()}")

# Add pext scores
df = pd.merge(df, pext, how="left")
print(f"Sites after merging pext annotations: {len(df)}")
print(f"Sites with a pext annotation: {len(df) - df.pext.isna().sum()}")

# Add HMC scores
df = pd.merge(df, hmc, how="left")
print(f"Sites after merging with HMC annotation: {len(df)}")
print(f"Sites with an HMC annotation: {len(df) - df.hmc.isna().sum()}")

In [None]:
# In order to get transcript-level statistics, append a second copy of every row
# with its "region" annotation overwritten by "transcript".
# `assign` already returns a new frame, so no explicit copy is needed, and the
# temporary is not bound to `_`, which IPython uses for the last cell output.
# NOTE: this cell is not idempotent -- re-running it doubles the rows again.
df = pd.concat([df, df.assign(region="transcript")])

In [None]:
# Attach the constraint label to each site by joining on the columns the two
# frames share. The inner join drops sites with no matching constraint row.
df = pd.merge(df, constraint, how="inner")

## Write to output

In [None]:
# Save the merged annotations as a tab-separated file, without the row index
df.to_csv(
    path_or_buf="../outputs/orthogonal_annotations.tsv",
    index=False,
    sep="\t",
)