# Conservation statistics

In [1]:
%%bash
# Fetch the two annotation tables read by this notebook into ../outputs/.
for remote_path in outputs/nmd_annotations.tsv outputs/phylop_all_sites.tsv; do
    dx download -f -o ../outputs/ "$remote_path"
done

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the per-site NMD annotations, reading only the four columns used below.
nmd = pd.read_table(
    "../outputs/nmd_annotations.tsv",
    usecols=["chr", "pos", "transcript_id", "nmd_definitive"],
)
# Shorter column names: these are the names the later cells group and join on.
nmd = nmd.rename(columns={"transcript_id": "enst", "nmd_definitive": "region"})

In [3]:
# Load the phyloP score table (tab-separated, all columns).
phylop = pd.read_table("../outputs/phylop_all_sites.tsv")

In [4]:
# Report the number of rows in each input table before they are merged.
print(
    f"Sites with an NMD annotation: {len(nmd)}",
    f"Sites with a phyloP annotation: {len(phylop)}",
    sep="\n",
)

Sites with an NMD annotation: 34177971
Sites with a phyloP annotation: 33461142


## Regional conservation 

In [5]:
# Inner-join the two tables. No `on=` is given, so pandas joins on every column
# name the two frames have in common; only rows matched in both are kept.
# NOTE(review): the recorded run shows more merged rows than phyloP rows, so the
# join is not one-to-one -- confirm that this row expansion is intended.
df = pd.merge(nmd, phylop, how="inner")
print(f"Sites after merging NMD and phyloP annotations: {len(df)}")

Sites after merging NMD and phyloP annotations: 34055400


In [49]:
# Flag every site as conserved (1) or not (0) using a phyloP cutoff of 2.27.
# The flag is stored on `df` because the per-transcript cell below reuses it.
df["fracCdsCons"] = np.where(df["phylop"] >= 2.27, 1, 0)

# Per-region summary: fraction of conserved sites, number of sites, the
# standard error of that proportion, and the 95% half-width (1.96 * se).
stats = (
    df.groupby("region")
    .agg(fracCdsCons=("fracCdsCons", "mean"), n=("pos", "count"))
    .assign(
        se=lambda s: np.sqrt((s["fracCdsCons"] * (1 - s["fracCdsCons"])) / s["n"]),
        ci95=lambda s: 1.96 * s["se"],
    )
)

# The same four quantities computed over all merged sites at once.
cons = df["fracCdsCons"].mean()
n = df["fracCdsCons"].count()
se = np.sqrt((cons * (1 - cons)) / n)
ci = 1.96 * se

# One-row frame labelled "transcript", with the same columns as `stats`.
transcript = pd.DataFrame(
    {"fracCdsCons": cons, "n": n, "se": se, "ci95": ci},
    index=["transcript"],
)

# Append the overall row beneath the per-region rows and display the table.
stats = pd.concat([stats, transcript])
stats

Unnamed: 0,fracCdsCons,n,se,ci95
distal_nmd,0.494952,7474963,0.000183,0.000358
long_exon,0.460485,2703867,0.000303,0.000594
nmd_target,0.625169,20893422,0.000106,0.000208
start_proximal,0.547126,2983148,0.000288,0.000565
transcript,0.576675,34055400,8.5e-05,0.000166


In [50]:
# `stats` keeps the region labels (and the "transcript" summary row label) in
# its index, so the index has to be written out: with index=False the saved
# rows would carry no label at all. The index column is named "region".
stats.to_csv("../outputs/phylop_stats_region.tsv", sep="\t", index_label="region")

## Regional conservation, stratified by constraint

In [55]:
# Read the per-(region, transcript, consequence) statistics and pivot them to
# wide format: one row per (region, enst), one column per consequence/statistic.
# NOTE(review): this input file is not fetched by the `dx download` cell at the
# top of the notebook -- confirm it is already present in ../outputs/.
constraint = (
    pd.read_csv(
        "../outputs/expected_variants_all_regions_no_cpg_stats.tsv",
        sep="\t",
        usecols=["region", "enst", "csq", "n_obs", "oe", "z", "p", "fdr_p"],
    )
    .pivot(
        index=["region", "enst"],
        columns="csq",
        values=["n_obs", "oe", "z", "p", "fdr_p"],
    )
    # Put the consequence first in each column tuple: (csq, statistic).
    .swaplevel(
        axis=1,
    )
    .reset_index(
        drop=False,
    )
)
# Flatten the two-level column labels to "<csq>_<statistic>". The former index
# columns have an empty second level, so the trailing "_" is stripped off.
constraint.columns = ["_".join(x).strip("_") for x in constraint.columns.values]

# Keep only the columns used for classification. The explicit .copy() makes
# this an independent frame, so the .loc assignments below write to it directly
# instead of to a selection of the wide table (avoids SettingWithCopyWarning).
constraint = constraint[
    [
        "region",
        "enst",
        "nonsense_n_obs",
        "nonsense_oe",
        "synonymous_z",
        "nonsense_p",
        "nonsense_fdr_p",
    ]
].copy()

# "constrained": all three of these conditions hold.
m1 = constraint["nonsense_oe"] < 0.35
m2 = constraint["synonymous_z"] > -1
m3 = constraint["nonsense_fdr_p"] < 0.05

# "unconstrained": both of these conditions hold.
m4 = constraint["nonsense_p"] >= 0.05
m5 = constraint["nonsense_n_obs"] >= 1

# Rows matching neither rule keep a missing label and are dropped below.
constraint.loc[m1 & m2 & m3, "constraint"] = "constrained"
constraint.loc[m4 & m5, "constraint"] = "unconstrained"

constraint = constraint[["region", "enst", "constraint"]].dropna()

# Number of labelled transcripts per region and constraint class.
print(constraint.groupby(["region"])["constraint"].value_counts())

constraint.head(3)

region          constraint   
distal_nmd      unconstrained    11233
                constrained        548
long_exon       unconstrained     1519
                constrained        388
nmd_target      unconstrained     7300
                constrained       2380
start_proximal  unconstrained    10933
transcript      unconstrained     8880
                constrained       3538
Name: constraint, dtype: int64


Unnamed: 0,region,enst,constraint
3,distal_nmd,ENST00000001008,unconstrained
5,distal_nmd,ENST00000002125,unconstrained
6,distal_nmd,ENST00000002165,unconstrained


In [73]:
# Get conservation per region and transcript

## Regions
# One row per (transcript, region): fraction of conserved sites and site count.
region = (
    df.groupby(["enst", "region"])
    .agg(fracCdsCons=("fracCdsCons", "mean"), n=("pos", "count"))
    .reset_index()
)

# Attach the constraint label; (enst, region) pairs without a label are dropped.
region = pd.merge(region, constraint, how="inner", on=["enst", "region"])

# NOTE(review): iterating over a GroupBy yields (key, sub-frame) tuples. Unpack
# them (`for key, grp in ...`) rather than assigning into the tuple itself.

for g in region.groupby(["region","constraint"]):
    

TypeError: 'tuple' object does not support item assignment