# CADD scores
Summary statistics for CADD scores in constrained regions

## Script
Run the cells below as a script

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the local `src` package take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Imports
import numpy as np
import pandas as pd

from src import statistics_for_plots
from src import constants as C

In [3]:
# Module constants
# Column dtypes passed to pandas.read_csv. Note that "pos" is not listed in
# _USECOLS below, so only the "cadd_phred" entry applies to a loaded column.
_DTYPE = dict(pos=np.int32, cadd_phred=np.float16)
# Columns to read from the annotated CADD table
_USECOLS = [
    "csq",
    "cadd_phred",
    "region",
    "constraint",
]

In [4]:
def get_cadd_stats(df, groupby=["constraint", "region"]):
    """Get summary stats of CADD scores by region and constraint.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cadd_phred`` column plus every column named in
        ``groupby``.
    groupby : list of str, optional
        Columns to group by. The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        One row per group, with columns ``n`` (non-null count), ``mean``,
        ``std`` (sample standard deviation, ddof=1), ``sem`` (standard error
        of the mean), and ``ci_l`` / ``ci_r``, the bounds of a
        normal-approximation 95% confidence interval around the mean.
    """
    # Two-sided 95% critical value of the standard normal distribution
    z_95 = 1.96

    # Every aggregation is named by string so pandas uses its own groupby
    # implementation. "std" pins the sample standard deviation (ddof=1),
    # consistent with "sem". Passing the callable np.std to groupby.agg is
    # deprecated; once pandas applies the callable directly, numpy's default
    # ddof=0 would be used instead.
    stats = df.groupby(groupby)["cadd_phred"].agg(
        n="count", mean="mean", std="std", sem="sem"
    )

    # 95% confidence interval: mean +/- z * sem
    margin = z_95 * stats["sem"]
    return stats.assign(ci_l=stats["mean"] - margin, ci_r=stats["mean"] + margin)

In [5]:
# Load data
# (for a quick trial run, pass e.g. nrows=1000000 to read_csv)
df = pd.read_csv(
    "data/interim/cadd_scores_coding_annotated.tsv",
    sep="\t",
    dtype=_DTYPE,
    usecols=_USECOLS,
    low_memory=False,
)
# Discard rows that have no constraint annotation
df = df.dropna(subset="constraint")
# In the "region" column, rename "distal_nmd" to "distal"
df = df.replace({"region": "distal_nmd"}, value="distal")
df.head(3)

Unnamed: 0,csq,cadd_phred,region,constraint
3366,missense_variant,18.1875,nmd_target,unconstrained
3367,missense_variant,18.6875,nmd_target,unconstrained
3368,missense_variant,20.40625,nmd_target,unconstrained


In [6]:
# CADD statistics by constraint and region, one table per consequence,
# unpacked in the order: synonymous, missense, nonsense (stop gained)
syn, mis, stop = (
    get_cadd_stats(df[df["csq"] == consequence].copy())
    for consequence in ("synonymous_variant", "missense_variant", "stop_gained")
)

# Variants across the whole transcript: group by consequence instead of region,
# then name the index levels ("constraint", "region") like the regional tables
whole_transcript = get_cadd_stats(df, groupby=["constraint", "csq"]).rename_axis(
    ["constraint", "region"]
)

In [7]:
# Concatenate the regional and transcript-level data
def cat_transcript_data(df, transcript_stats=None):
    """Append whole-transcript statistics to a table of regional statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Regional statistics table.
    transcript_stats : pandas.DataFrame, optional
        Rows to append. When omitted, the notebook-level ``whole_transcript``
        table is used, so existing one-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of both tables, sorted by index.
    """
    if transcript_stats is None:
        # Fall back to the notebook global rather than requiring the argument
        transcript_stats = whole_transcript
    return pd.concat([df, transcript_stats], axis=0).sort_index()


def sort_region(df, **kwargs):
    """Pass a statistics table through ``statistics_for_plots.sort_region_column``.

    The index is reset to columns before the call. Keyword arguments are
    forwarded to the helper; ``categories`` and ``labels`` default to
    ``C.MAPS_CONSEQUENCES`` and ``C.MAPS_LABELS`` when the caller does not
    supply them. The result is re-indexed on ("constraint", "region") and
    sorted by that index.
    """
    # Fill in project defaults only for options the caller left out
    defaults = (("categories", C.MAPS_CONSEQUENCES), ("labels", C.MAPS_LABELS))
    for option, fallback in defaults:
        if option not in kwargs:
            kwargs[option] = fallback

    flat = df.reset_index()
    arranged = statistics_for_plots.sort_region_column(flat, **kwargs)
    return arranged.set_index(["constraint", "region"]).sort_index()


# NOTE: this rebinds syn/mis/stop in place, so running the cell twice appends
# the whole-transcript rows a second time. Re-run the cell that creates
# syn/mis/stop before re-running this one.
syn, mis, stop = (
    sort_region(cat_transcript_data(regional)) for regional in (syn, mis, stop)
)

In [8]:
# Write to output
# The loop variable is deliberately not called `df`: a `for` target rebinds the
# notebook-level name, which would replace the loaded CADD data frame with the
# last statistics table and break any re-run of the earlier cells.
for csq_stats, csq in zip([syn, mis, stop], ["synonymous", "missense", "nonsense"]):
    csq_stats.to_csv(f"data/statistics/cadd_{csq}.tsv", sep="\t")

## Summary statistics
Show summary statistics and t-tests (constrained vs. unconstrained) for CADD scores of synonymous, missense, and nonsense variants.

In [9]:
# Synonymous variants: CADD statistics by constraint and region, with the
# whole-transcript rows appended
syn

Unnamed: 0_level_0,Unnamed: 1_level_0,n,mean,std,sem,ci_l,ci_r
constraint,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
constrained,Nonsense (Distal),437959,6.195996,4.165403,0.006294,6.18366,6.208333
constrained,Nonsense (Long exon),379705,4.944524,3.905828,0.006339,4.932101,4.956948
constrained,Nonsense (Start proximal),12163,10.709746,4.21587,0.038227,10.634822,10.784671
constrained,Nonsense (NMD target),1648881,7.66736,4.355969,0.003392,7.660711,7.674009
constrained,Nonsense (Whole transcript),441371,39.310417,4.979606,0.007495,39.295726,39.325108
constrained,Missense,7889480,22.895041,6.499066,0.002314,22.890505,22.899576
constrained,Synonymous,2478708,7.005214,4.386016,0.002786,6.999754,7.010674
unconstrained,Nonsense (Distal),2045408,4.811914,3.922324,0.002743,4.806539,4.817289
unconstrained,Nonsense (Long exon),379749,4.558789,3.953905,0.006416,4.546214,4.571365
unconstrained,Nonsense (Start proximal),119332,7.758475,4.747952,0.013744,7.731536,7.785414


In [10]:
# Constrained vs unconstrained comparison on the synonymous table (helper from
# src.statistics_for_plots; the recorded output is a Ttest_indResult per region)
statistics_for_plots.test_constrained_vs_unconstrained(syn)

Ttest_indResult(statistic=region
Nonsense (Distal)               201.592444
Nonsense (Long exon)             42.768502
Nonsense (Start proximal)        72.651077
Nonsense (NMD target)           404.431802
Nonsense (Whole transcript)     291.751347
Missense                       1290.463904
Synonymous                      445.691884
Name: n, dtype: float64, pvalue=region
Nonsense (Distal)              0.0
Nonsense (Long exon)           0.0
Nonsense (Start proximal)      0.0
Nonsense (NMD target)          0.0
Nonsense (Whole transcript)    0.0
Missense                       0.0
Synonymous                     0.0
Name: n, dtype: float64)

In [11]:
# Missense variants: CADD statistics by constraint and region, with the
# whole-transcript rows appended
mis

Unnamed: 0_level_0,Unnamed: 1_level_0,n,mean,std,sem,ci_l,ci_r
constraint,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
constrained,Nonsense (Distal),1379154,21.931675,6.812855,0.005801,21.920304,21.943045
constrained,Nonsense (Long exon),1168713,19.032591,7.664145,0.007089,19.018696,19.046486
constrained,Nonsense (Start proximal),40381,24.761478,5.5103,0.027421,24.707733,24.815224
constrained,Nonsense (NMD target),5301232,23.982969,5.732646,0.00249,23.978089,23.987849
constrained,Nonsense (Whole transcript),441371,39.310417,4.979606,0.007495,39.295726,39.325108
constrained,Missense,7889480,22.895041,6.499066,0.002314,22.890505,22.899576
constrained,Synonymous,2478708,7.005214,4.386016,0.002786,6.999754,7.010674
unconstrained,Nonsense (Distal),6439827,17.814287,8.414864,0.003316,17.807788,17.820786
unconstrained,Nonsense (Long exon),1164575,16.259029,8.692609,0.008055,16.243242,16.274817
unconstrained,Nonsense (Start proximal),381902,19.712873,8.086061,0.013085,19.687228,19.738519


In [12]:
# Constrained vs unconstrained comparison on the missense table (helper from
# src.statistics_for_plots; the recorded output is a Ttest_indResult per region)
statistics_for_plots.test_constrained_vs_unconstrained(mis)

Ttest_indResult(statistic=region
Nonsense (Distal)               616.182696
Nonsense (Long exon)            258.475356
Nonsense (Start proximal)       166.165135
Nonsense (NMD target)          1146.938876
Nonsense (Whole transcript)     291.751347
Missense                       1290.463904
Synonymous                      445.691884
Name: n, dtype: float64, pvalue=region
Nonsense (Distal)              0.0
Nonsense (Long exon)           0.0
Nonsense (Start proximal)      0.0
Nonsense (NMD target)          0.0
Nonsense (Whole transcript)    0.0
Missense                       0.0
Synonymous                     0.0
Name: n, dtype: float64)

In [13]:
# Nonsense (stop-gained) variants: CADD statistics by constraint and region,
# with the whole-transcript rows appended
stop

Unnamed: 0_level_0,Unnamed: 1_level_0,n,mean,std,sem,ci_l,ci_r
constraint,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
constrained,Nonsense (Distal),73283,38.026928,4.119901,0.015219,37.997099,38.056757
constrained,Nonsense (Long exon),62381,36.392929,3.686023,0.014758,36.364003,36.421855
constrained,Nonsense (Start proximal),3029,39.773754,4.705481,0.085498,39.606179,39.94133
constrained,Nonsense (NMD target),302678,40.217819,5.103816,0.009277,40.199636,40.236002
constrained,Nonsense (Whole transcript),441371,39.310417,4.979606,0.007495,39.295726,39.325108
constrained,Missense,7889480,22.895041,6.499066,0.002314,22.890505,22.899576
constrained,Synonymous,2478708,7.005214,4.386016,0.002786,6.999754,7.010674
unconstrained,Nonsense (Distal),364721,35.473591,5.348079,0.008856,35.456234,35.490948
unconstrained,Nonsense (Long exon),62005,34.363693,4.701548,0.018881,34.326686,34.4007
unconstrained,Nonsense (Start proximal),28840,36.80854,5.511323,0.032453,36.744932,36.872149


In [14]:
# Constrained vs unconstrained comparison on the nonsense table (helper from
# src.statistics_for_plots; the recorded output is a Ttest_indResult per region)
statistics_for_plots.test_constrained_vs_unconstrained(stop)

Ttest_indResult(statistic=region
Nonsense (Distal)               145.010820
Nonsense (Long exon)             84.676598
Nonsense (Start proximal)        32.424492
Nonsense (NMD target)           232.901300
Nonsense (Whole transcript)     291.751347
Missense                       1290.463904
Synonymous                      445.691884
Name: n, dtype: float64, pvalue=region
Nonsense (Distal)               0.000000e+00
Nonsense (Long exon)            0.000000e+00
Nonsense (Start proximal)      9.939165e-205
Nonsense (NMD target)           0.000000e+00
Nonsense (Whole transcript)     0.000000e+00
Missense                        0.000000e+00
Synonymous                      0.000000e+00
Name: n, dtype: float64)