In [1]:
import pandas as pd
import numpy as np
from scipy import stats

import src
from src import pandas_utils as pdu

from experiments.variant_density import parse_gnomad_variants

# Tab-separated input table loaded at the start of the pipeline cell.
FILE_IN = "data/interim/gnomad_snrna_variants_tidy_inbreeding_coeff.tsv"
# Destination the pipeline cell writes its final table to.
FILE_OUT = "data/final/gnomad_snrna_variants_hwe_stats.tsv"

In [2]:
def per_row_chi2(row):
    """Chi-squared goodness-of-fit test of observed vs expected heterozygotes.

    Two categories are compared: the heterozygote count and the remainder
    of ``an`` once that count is subtracted.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``"an"``, ``"het_obs"`` and ``"het_exp"``
        (for example a DataFrame row or a plain dict).

    Returns
    -------
    tuple
        ``(chi2, pval)`` as returned by ``scipy.stats.chisquare``. Returns
        ``(nan, nan)`` when any count is non-finite or when an expected
        count is not strictly positive, because the test statistic is
        undefined in those cases.

    Notes
    -----
    NOTE(review): ``an`` is used as the category total. If ``an`` counts
    alleles while ``het_obs`` / ``het_exp`` count individuals, the total
    should presumably be ``an / 2`` -- confirm before relying on the
    absolute P values.
    """
    an = row["an"]
    obs = row["het_obs"]
    exp = row["het_exp"]

    observed = np.array([obs, an - obs], dtype=float)
    expected = np.array([exp, an - exp], dtype=float)

    # Expected counts of zero (or NaN/inf from an upstream division) make
    # (f_obs - f_exp)**2 / f_exp undefined; report the row as missing
    # instead of letting scipy emit a RuntimeWarning or an infinite statistic.
    inputs_finite = np.isfinite(observed).all() and np.isfinite(expected).all()
    if not inputs_finite or not (expected > 0).all():
        return np.nan, np.nan

    chi2, pval = stats.chisquare(observed, expected)
    return chi2, pval

In [3]:
# Significance level that is divided by the number of tested variants to
# obtain the Bonferroni threshold used below.
ALPHA = 0.05

# Load the variants, derive observed/expected heterozygote counts, run the
# per-row chi-squared test, rank the results and write them to FILE_OUT.
# The `.check.*` calls come from a DataFrame accessor that is not defined in
# this file; judging by the rendered output they report on the frame and
# pass it through, appending ": <value>" to `check_name`.
df = (
    pd.read_csv(FILE_IN, sep="\t")
    # .check.disable_checks(enable_asserts=False)
    .check.nrows(check_name="Input variants")
    # Drop variants located on chrX / chrY.
    .loc[lambda x: ~x["chrom"].isin(["chrX", "chrY"])]
    .check.nrows(check_name="Variants after dropping sex chromosomes")
    .check.value_counts("filter", check_name="Variant counts by filter:")
    .pipe(parse_gnomad_variants.parse_gnomad_variants)
    # .check.enable_checks()
    .loc[
        :,
        [
            "chrom",
            "pos",
            "ref",
            "alt",
            "ac",
            "an",
            "af",
            "nhomalt",
            "allele_type",
            "coi",
            "symbol",
            "ensg",
            "gene_type",
        ],
    ]
    .assign(
        # oe = 1 - coi; since het_exp is defined as het_obs / oe, oe is the
        # observed/expected heterozygote ratio.
        oe=lambda x: 1 - x["coi"],
        # Alt allele copies left after removing two per homozygous-alt
        # carrier (presumably the heterozygote count -- confirm).
        het_obs=lambda x: x["ac"] - 2 * x["nhomalt"],
        # Non-finite where oe is 0 or missing; such rows end up with a NaN
        # P value and are dropped below.
        het_exp=lambda x: x["het_obs"] / x["oe"],
    )
    .check.function(
        lambda x: x["het_exp"].dropna().shape[0],
        # No trailing ": " here: the check output already appends one.
        check_name='Variants with valid "het_exp" values',
    )
    # Presumably applies per_row_chi2 to every row and stores its two return
    # values in the new columns -- helper is defined outside this file.
    .pipe(pdu.assign_with_per_row_fn, per_row_chi2, new_cols=["chi2", "chi2_pval"])
    .dropna(subset=["chi2_pval"])
    .check.nrows(check_name="Variants with valid chi squared P values")
    .check.function(
        lambda x: f"Bonferroni significance threshold = {ALPHA / len(x):.2e}"
    )
    # rank(method="first") breaks ties by row order, so the sort order
    # determines the ranks assigned to tied values.
    .sort_values("oe", ascending=False)  # Important for ranking
    .assign(
        # Bonferroni: compare each P value with ALPHA / number of tests.
        bfr_sig=lambda x: x["chi2_pval"] < ALPHA / len(x),
        rank_oe=lambda x: x["oe"].rank(ascending=False, method="first").astype(int),
        rank_p=lambda x: x["chi2_pval"]
        .rank(ascending=True, method="first")
        .astype(int),
    )
    .check.write(FILE_OUT, index=False)
)

<h5 style='text-align: left'><span style='color:None; background-color:None'>Input variants: 54237</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Variants after dropping sex chromosomes: 52683</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Variant counts by filter:</span></h5>

Unnamed: 0_level_0,count
filter,Unnamed: 1_level_1
PASS,52661
InbreedingCoeff,22


<h5 style='text-align: left'><span style='color:None; background-color:None'>Allele type value counts:</span></h5>

Unnamed: 0_level_0,count
allele_type,Unnamed: 1_level_1
SNV,47948
Indel,4735


<h5 style='text-align: left'><span style='color:None; background-color:None'>Gene type value counts:</span></h5>

Unnamed: 0_level_0,count
gene_type,Unnamed: 1_level_1
Pseudogene,1660
snRNA,112


<h5 style='text-align: left'><span style='color:None; background-color:None'>Duplicate variants: 0</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Number of variants: 52683</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Number of genes: 1772</span></h5>

<h5 style='text-align: left'><span style='color:black; background-color:green'>Pass assert: all AC > 0</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Duplicate gene symbols, different ENSG IDs:</span></h5>

Unnamed: 0_level_0,count
symbol,Unnamed: 1_level_1
U6,30
U2,14
U4,7
U7,6
U1,3


<h5 style='text-align: left'><span style='color:None; background-color:None'>ENSG IDs on multiple chromosomes:</span></h5>

Unnamed: 0,chrom,ensg


<h5 style='text-align: left'><span style='color:None; background-color:None'>Variants with valid "het_exp" values: : 52604</span></h5>

  terms = (f_obs - f_exp)**2 / f_exp


<h5 style='text-align: left'><span style='color:None; background-color:None'>Variants with valid chi squared P values:: 52593</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Bonferroni significance threshold = 9.51e-07</span></h5>

chrom	pos	ref	alt	ac	an	nhomalt	allele_type	coi	symbol	ensg	gene_type	oe	het_obs	het_exp

chr1	16896104	A	G	944	151294	0	SNV	-0.006	RNU1-2	ENSG00000207005.1	snRNA	1.006	944	938.076