# CADD scores
Summary statistics for CADD scores in constrained versus unconstrained regions, by region and variant consequence

In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
# Imports
import numpy as np
import pandas as pd

from src import statistics_for_plots
from src import constants as C

In [16]:
# Module constants: column dtypes and column subset handed to pd.read_csv.
# NOTE: "pos" has a dtype entry but is not listed in _USECOLS.
# NOTE: cadd_phred is stored as float16; the loaded values shown in the
# notebook output (e.g. 18.1875, 20.40625) reflect that reduced precision.
_DTYPE = {
    "pos": np.int32,
    "cadd_phred": np.float16,
}
_USECOLS = [
    "csq",
    "cadd_phred",
    "region",
    "constraint",
]

In [36]:
def get_cadd_stats(df, groupby=None):
    """Get summary stats of CADD scores by region and constraint.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cadd_phred`` column plus every column named in
        ``groupby``.
    groupby : str or list of str, optional
        Grouping key(s) passed to ``DataFrame.groupby``. Defaults to
        ``["constraint", "region"]``.

    Returns
    -------
    pandas.DataFrame
        One row per group, indexed by the grouping keys, with columns
        ``n`` (non-null count), ``mean``, ``std``, ``sem``, ``ci_l`` and
        ``ci_r`` (mean -/+ 1.96 * sem).
    """
    # Avoid a mutable default argument; resolve the default at call time.
    if groupby is None:
        groupby = ["constraint", "region"]

    # Multiplier applied to the standard error for the 95% confidence interval
    z_95 = 1.96

    # Use the string alias "std" rather than np.std: pandas currently remaps
    # np.std to its own groupby std (ddof=1) but warns that the callable will
    # be applied directly in future, which would switch to ddof=0 and no
    # longer agree with "sem" (ddof=1).
    stats = df.groupby(groupby)["cadd_phred"].agg(
        n="count", mean="mean", std="std", sem="sem"
    )

    # 95% confidence interval bounds around the group mean
    margin = z_95 * stats["sem"]
    return stats.assign(ci_l=stats["mean"] - margin, ci_r=stats["mean"] + margin)

In [18]:
# Load the annotated CADD scores, keeping only the columns in _USECOLS,
# then discard rows that have no constraint annotation.
# (For a quick trial run, pass e.g. nrows=10000 to read_csv.)
df = (
    pd.read_csv(
        "data/interim/cadd_scores_coding_annotated.tsv",
        sep="\t",
        usecols=_USECOLS,
        dtype=_DTYPE,
        low_memory=False,
    )
    .dropna(subset="constraint")
)
df.head(3)

  df = pd.read_csv(


Unnamed: 0,csq,cadd_phred,region,constraint
3366,missense_variant,18.1875,nmd_target,unconstrained
3367,missense_variant,18.6875,nmd_target,unconstrained
3368,missense_variant,20.40625,nmd_target,unconstrained


In [49]:
# Synonymous, missense, and nonsense by region.
# Boolean indexing already returns a new DataFrame and get_cadd_stats only
# aggregates its input, so the subsets are passed on directly instead of
# taking an extra .copy() of each multi-million-row selection.
syn, mis, stop = (
    get_cadd_stats(df[df["csq"] == csq])
    for csq in ("synonymous_variant", "missense_variant", "stop_gained")
)

# Variants across the whole transcript: group by consequence instead of region.
# The second index level is renamed from "csq" to "region" - presumably so this
# table has the same index names as syn/mis/stop for downstream helpers
# (TODO confirm against statistics_for_plots).
whole_transcript = df.pipe(get_cadd_stats, groupby=["constraint", "csq"]).rename_axis(
    ["constraint", "region"]
)

In [51]:
# Display CADD summary statistics for synonymous variants, by constraint and region
syn

Unnamed: 0_level_0,Unnamed: 1_level_0,n,mean,std,sem,ci_l,ci_r
constraint,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
constrained,distal_nmd,437959,6.195996,4.165403,0.006294,6.18366,6.208333
constrained,long_exon,379705,4.944524,3.905828,0.006339,4.932101,4.956948
constrained,nmd_target,1648881,7.66736,4.355969,0.003392,7.660711,7.674009
constrained,start_proximal,12163,10.709746,4.21587,0.038227,10.634822,10.784671
unconstrained,distal_nmd,2045408,4.811914,3.922324,0.002743,4.806539,4.817289
unconstrained,long_exon,379749,4.558789,3.953905,0.006416,4.546214,4.571365
unconstrained,nmd_target,3506345,5.996559,4.415186,0.002358,5.991937,6.00118
unconstrained,start_proximal,119332,7.758475,4.747952,0.013744,7.731536,7.785414


In [55]:
# Test constrained vs unconstrained for synonymous variants.
# NOTE(review): the helper is defined in src.statistics_for_plots (not shown here);
# the recorded output is a Ttest_indResult with one statistic and p-value per region.
statistics_for_plots.test_constrained_vs_unconstrained(syn)

Ttest_indResult(statistic=region
distal_nmd        201.592444
long_exon          42.768502
nmd_target        404.431802
start_proximal     72.651077
Name: n, dtype: float64, pvalue=region
distal_nmd        0.0
long_exon         0.0
nmd_target        0.0
start_proximal    0.0
Name: n, dtype: float64)

In [52]:
# Display CADD summary statistics for missense variants, by constraint and region
mis

Unnamed: 0_level_0,Unnamed: 1_level_0,n,mean,std,sem,ci_l,ci_r
constraint,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
constrained,distal_nmd,1379154,21.931675,6.812855,0.005801,21.920304,21.943045
constrained,long_exon,1168713,19.032591,7.664145,0.007089,19.018696,19.046486
constrained,nmd_target,5301232,23.982969,5.732646,0.00249,23.978089,23.987849
constrained,start_proximal,40381,24.761478,5.5103,0.027421,24.707733,24.815224
unconstrained,distal_nmd,6439827,17.814287,8.414864,0.003316,17.807788,17.820786
unconstrained,long_exon,1164575,16.259029,8.692609,0.008055,16.243242,16.274817
unconstrained,nmd_target,11132334,19.989874,8.119354,0.002433,19.985104,19.994644
unconstrained,start_proximal,381902,19.712873,8.086061,0.013085,19.687228,19.738519


In [56]:
# Test constrained vs unconstrained for missense variants.
# NOTE(review): the helper is defined in src.statistics_for_plots (not shown here);
# the recorded output is a Ttest_indResult with one statistic and p-value per region.
statistics_for_plots.test_constrained_vs_unconstrained(mis)

Ttest_indResult(statistic=region
distal_nmd         616.182696
long_exon          258.475356
nmd_target        1146.938876
start_proximal     166.165135
Name: n, dtype: float64, pvalue=region
distal_nmd        0.0
long_exon         0.0
nmd_target        0.0
start_proximal    0.0
Name: n, dtype: float64)

In [53]:
# Display CADD summary statistics for stop-gained variants, by constraint and region
stop

Unnamed: 0_level_0,Unnamed: 1_level_0,n,mean,std,sem,ci_l,ci_r
constraint,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
constrained,distal_nmd,73283,38.026928,4.119901,0.015219,37.997099,38.056757
constrained,long_exon,62381,36.392929,3.686023,0.014758,36.364003,36.421855
constrained,nmd_target,302678,40.217819,5.103816,0.009277,40.199636,40.236002
constrained,start_proximal,3029,39.773754,4.705481,0.085498,39.606179,39.94133
unconstrained,distal_nmd,364721,35.473591,5.348079,0.008856,35.456234,35.490948
unconstrained,long_exon,62005,34.363693,4.701548,0.018881,34.326686,34.4007
unconstrained,nmd_target,630287,37.502808,5.604338,0.007059,37.488972,37.516644
unconstrained,start_proximal,28840,36.80854,5.511323,0.032453,36.744932,36.872149


In [57]:
# Test constrained vs unconstrained for stop-gained variants.
# NOTE(review): the helper is defined in src.statistics_for_plots (not shown here);
# the recorded output is a Ttest_indResult with one statistic and p-value per region.
statistics_for_plots.test_constrained_vs_unconstrained(stop)

Ttest_indResult(statistic=region
distal_nmd        145.010820
long_exon          84.676598
nmd_target        232.901300
start_proximal     32.424492
Name: n, dtype: float64, pvalue=region
distal_nmd         0.000000e+00
long_exon          0.000000e+00
nmd_target         0.000000e+00
start_proximal    9.939165e-205
Name: n, dtype: float64)

In [54]:
# Display CADD summary statistics per consequence across the whole transcript.
# The index level labelled "region" holds the consequence (csq) here.
whole_transcript

Unnamed: 0_level_0,Unnamed: 1_level_0,n,mean,std,sem,ci_l,ci_r
constraint,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
constrained,missense_variant,7889480,22.895041,6.499066,0.002314,22.890505,22.899576
constrained,stop_gained,441371,39.310417,4.979606,0.007495,39.295726,39.325108
constrained,synonymous_variant,2478708,7.005214,4.386016,0.002786,6.999754,7.010674
unconstrained,missense_variant,19118638,19.024269,8.346093,0.001909,19.020528,19.02801
unconstrained,stop_gained,1085853,36.623535,5.576067,0.005351,36.613047,36.634023
unconstrained,synonymous_variant,6050834,5.540618,4.287202,0.001743,5.537202,5.544034


In [58]:
# Test constrained vs unconstrained per consequence across the whole transcript.
# NOTE(review): the helper is defined in src.statistics_for_plots (not shown here);
# the recorded output is keyed by the "region" index level, which holds the
# consequence (csq) values for this table.
statistics_for_plots.test_constrained_vs_unconstrained(whole_transcript)

Ttest_indResult(statistic=region
missense_variant      1290.463904
stop_gained            291.751347
synonymous_variant     445.691884
Name: n, dtype: float64, pvalue=region
missense_variant      0.0
stop_gained           0.0
synonymous_variant    0.0
Name: n, dtype: float64)