In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats import contingency_tables

import src
from src import pandas_utils as pdu

from experiments.variant_density import parse_gnomad_variants

# Input: tab-separated variant table; read with pd.read_csv(..., sep="\t") by the
# analysis pipeline in this notebook.
FILE_IN="data/interim/gnomad_snrna_variants_tidy_inbreeding_coeff.tsv"
# Output: path handed to .check.write() as the final step of the analysis pipeline.
FILE_OUT="data/final/gnomad_snrna_variants_hwe_stats.tsv"

In [2]:
def per_row_chi2(row):
    """Compare observed vs expected heterozygote counts for one variant.

    NOTE: despite its name, this function does not run a chi-squared test.
    It runs Fisher's exact test (``scipy.stats.fisher_exact``) on the 2x2 table

        [[het_obs, n_genotypes - het_obs],
         [het_exp, n_genotypes - het_exp]]

    where ``n_genotypes = an / 2``. The name is kept so existing callers
    continue to work.

    Parameters
    ----------
    row : mapping
        Anything supporting ``row["an"]``, ``row["het_obs"]`` and
        ``row["het_exp"]`` (e.g. a DataFrame row or a dict).

    Returns
    -------
    tuple
        ``(odds_ratio, pval)`` as returned by ``scipy.stats.fisher_exact``, or
        ``(nan, nan)`` if any cell of the table is NaN/infinite or scipy
        rejects the table with a ``ValueError`` (e.g. a negative cell).
    """
    n_genotypes = row["an"] / 2
    obs = row["het_obs"]
    exp = row["het_exp"]
    table = [[obs, n_genotypes - obs], [exp, n_genotypes - exp]]

    # Reject NaN / +-inf explicitly instead of relying on scipy to raise:
    # e.g. an infinite Python float triggers OverflowError (not ValueError)
    # when scipy casts the table to integers, which would escape the handler
    # below and abort the whole per-row computation.
    if not np.isfinite(np.asarray(table, dtype=float)).all():
        return np.nan, np.nan

    # NOTE(review): scipy documents `table` as integer counts; het_exp may be
    # fractional and appears to be truncated by scipy's integer cast --
    # confirm that truncation (rather than rounding) is acceptable.
    try:
        odds_ratio, pval = stats.fisher_exact(table)
    except ValueError:
        return np.nan, np.nan

    return odds_ratio, pval

In [3]:
def per_row_fisher_exact(row):
    """Fisher's exact test of observed vs expected heterozygote counts.

    Builds the 2x2 table

        [[het_obs, n_genotypes - het_obs],
         [het_exp, n_genotypes - het_exp]]

    with ``n_genotypes = an / 2``. The odds ratio is taken from statsmodels'
    ``Table2x2.oddsratio`` and the p-value from ``scipy.stats.fisher_exact``.

    Parameters
    ----------
    row : mapping
        Anything supporting ``row["an"]``, ``row["het_obs"]`` and
        ``row["het_exp"]`` (e.g. a DataFrame row or a dict).

    Returns
    -------
    tuple
        ``(odds_ratio, pval)``, or ``(nan, nan)`` if any cell of the table is
        NaN/infinite or a ``ValueError`` is raised while computing the
        statistics (e.g. scipy rejecting a negative cell).
    """
    n_genotypes = row["an"] / 2
    obs = row["het_obs"]
    exp = row["het_exp"]
    table = [[obs, n_genotypes - obs], [exp, n_genotypes - exp]]

    # Reject NaN / +-inf explicitly instead of relying on scipy to raise:
    # e.g. an infinite Python float triggers OverflowError (not ValueError)
    # when scipy casts the table to integers, which would escape the handler
    # below and abort the whole per-row computation.
    if not np.isfinite(np.asarray(table, dtype=float)).all():
        return np.nan, np.nan

    try:
        # NOTE(review): Table2x2 is built with its default options; per the
        # statsmodels docs the default shift_zeros=True replaces zero cells
        # with 0.5, so this odds ratio can differ from the one scipy reports
        # -- confirm that is intended.
        odds_ratio = contingency_tables.Table2x2(table).oddsratio
        # Unpack rather than use `.pvalue` so this also works on scipy
        # versions where fisher_exact returns a plain tuple.
        _, pval = stats.fisher_exact(table)
    except ValueError:
        return np.nan, np.nan

    return odds_ratio, pval

In [6]:
# Family-wise significance level. It is divided by the number of tested
# variants below to obtain the Bonferroni-corrected per-variant threshold.
ALPHA = 0.05

# Load the variants, derive observed/expected heterozygote counts, test each
# variant with Fisher's exact test, flag Bonferroni-significant variants, rank
# them, and write the result to FILE_OUT.
df = (
    pd.read_csv(FILE_IN, sep="\t")
    .check.nrows(check_name="Input variants")
    # Drop variants on chrX / chrY.
    .loc[lambda x: ~x["chrom"].isin(["chrX", "chrY"])]
    .check.nrows(check_name="Variants after dropping sex chromosomes")
    .check.value_counts("filter", check_name="Variant counts by filter:")
    .pipe(parse_gnomad_variants.parse_gnomad_variants)
    .loc[
        :,
        [
            "chrom",
            "pos",
            "ref",
            "alt",
            "ac",
            "an",
            "af",
            "nhomalt",
            "allele_type",
            "coi",
            "symbol",
            "ensg",
            "gene_type",
        ],
    ]
    .assign(
        # NOTE(review): "coi" is presumably the inbreeding coefficient, making
        # "oe" the observed/expected heterozygosity ratio -- confirm.
        oe=lambda x: 1 - x["coi"],
        # Alt allele count minus the two alleles carried by each hom-alt genotype.
        het_obs=lambda x: x["ac"] - 2 * x["nhomalt"],
        # Division by oe == 0 yields inf/NaN here rather than raising.
        het_exp=lambda x: x["het_obs"] / x["oe"],
    )
    # Check names carry no trailing colon: the rendered label already gets
    # ": <value>" appended, so a trailing colon shows up doubled.
    .check.function(
        lambda x: x["het_exp"].dropna().shape[0],
        check_name='Variants with valid "het_exp" values',
    )
    .pipe(pdu.assign_with_per_row_fn, per_row_fisher_exact, new_cols=["odds_ratio", "p_val"])
    .dropna(subset="p_val")
    # The p-values come from Fisher's exact test (per_row_fisher_exact), not
    # from a chi-squared test, so label them accordingly.
    .check.nrows(check_name="Variants with valid Fisher's exact P values")
    .check.function(
        lambda x: f"Bonferroni significance threshold = {ALPHA / len(x):.2e}"
    )
    # Important for ranking: rank(method="first") breaks ties by row order.
    .sort_values("oe", ascending=False)
    .assign(
        # The comparison already yields a boolean column.
        bfr_sig=lambda x: x["p_val"] < ALPHA / len(x),
        rank_oe=lambda x: x["oe"].rank(ascending=False, method="first").astype(int),
        rank_p=lambda x: x["p_val"]
        .rank(ascending=True, method="first")
        .astype(int),
    )
    .check.function(lambda x: x["bfr_sig"].sum(), check_name="Number of Bonferroni-significant variants")
    .check.write(FILE_OUT, index=False)
)

<h5 style='text-align: left'><span style='color:None; background-color:None'>Input variants: 29487</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Variants after dropping sex chromosomes: 28716</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Variant counts by filter:</span></h5>

Unnamed: 0_level_0,count
filter,Unnamed: 1_level_1
PASS,28708
InbreedingCoeff,8


<h5 style='text-align: left'><span style='color:None; background-color:None'>Allele type value counts:</span></h5>

Unnamed: 0_level_0,count
allele_type,Unnamed: 1_level_1
SNV,28716


<h5 style='text-align: left'><span style='color:None; background-color:None'>Gene type value counts:</span></h5>

Unnamed: 0_level_0,count
gene_type,Unnamed: 1_level_1
Pseudogene,1652
snRNA,109


<h5 style='text-align: left'><span style='color:None; background-color:None'>Duplicate variants: 0</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Number of variants: 28716</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Number of genes: 1761</span></h5>

<h5 style='text-align: left'><span style='color:black; background-color:green'>Pass assert: all AC > 0</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Duplicate gene symbols, different ENSG IDs:</span></h5>

Unnamed: 0_level_0,count
symbol,Unnamed: 1_level_1
U6,30
U2,11
U4,7
U7,6
U1,3


<h5 style='text-align: left'><span style='color:None; background-color:None'>ENSG IDs on multiple chromosomes:</span></h5>

Unnamed: 0,chrom,ensg


<h5 style='text-align: left'><span style='color:None; background-color:None'>Variants with valid "het_exp" values: : 28683</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Variants with valid chi squared P values:: 28683</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Bonferroni significance threshold = 1.74e-06</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Number of Bonferroni-significant variants:: 210</span></h5>