In [1]:
import pandas as pd
import pandas_checks as pdc
import numpy as np
import src

# Path to the tab-separated variant table that the loading cell reads into `df`.
FILE_IN = "data/interim/gnomad_snrna_variants_tidy.tsv"

**NOTE:** Several genes share the same HGNC symbol but have different ENSG IDs (specifically U1, U2, U4, U6 and U7), so grouping should be done by ENSG ID rather than by symbol.

In [4]:
def _expand_bed_info(variants):
    """Swap the comma-joined `bed_info` column for `ensg`, `symbol` and `length`."""
    fields = pd.DataFrame(
        variants["bed_info"].str.split(",").to_list(),
        columns=["ensg", "symbol", "length"],
        index=variants.index,
    )
    return variants.assign(**fields).drop(columns=["bed_info"])


def _label_allele_type(variants):
    """Label rows whose `allele_type` is "snv" as "SNV"; everything else is "Indel"."""
    return np.where(variants["allele_type"] == "snv", "SNV", "Indel")


def _label_gene_type(variants):
    """Label genes whose symbol ends in "P" as "Pseudogene"; the rest as "snRNA"."""
    return np.where(variants["symbol"].str.endswith("P"), "Pseudogene", "snRNA")


def _gene_type_counts(variants):
    """Count gene types with each ENSG ID contributing a single row."""
    return variants.drop_duplicates("ensg")["gene_type"].value_counts()


def _symbols_with_several_ensg(variants):
    """Count, per symbol, the distinct ENSG IDs for symbols that map to more than one."""
    gene_ids = variants[["ensg", "symbol"]].drop_duplicates()
    shared = gene_ids.loc[gene_ids.duplicated("symbol", keep=False)]
    return shared.sort_values("symbol").value_counts("symbol")


def _ensg_on_several_chroms(variants):
    """Return the (chrom, ensg) pairs for ENSG IDs that occur on more than one chromosome."""
    placements = variants[["chrom", "ensg"]].drop_duplicates()
    return placements.loc[placements.duplicated("ensg", keep=False)]


# Load the variant table, unpack the gene annotation, add the two categorical
# labels, then run the sanity checks; the chain continues on the frame after
# each `.check.*` step and the final result is bound to `df`.
df = (
    pd.read_csv(FILE_IN, sep="\t", na_values=".")
    .pipe(_expand_bed_info)
    # The split fields are strings, so `length` is cast before any arithmetic.
    .astype({"length": "int"})
    .assign(allele_type=_label_allele_type, gene_type=_label_gene_type)
    .check.value_counts("allele_type", check_name="Allele type value counts:")
    .check.function(_gene_type_counts, check_name="Gene type value counts:")
    .check.ndups(
        subset=["chrom", "pos", "ref", "alt"],
        check_name="Duplicate variants",
    )
    .check.nrows(check_name="Number of variants")
    .check.nunique("ensg", check_name="Number of genes")
    .check.assert_greater_than(
        0,
        subset="ac",
        pass_message="All AC > 0",
        verbose=True,
    )
    .check.function(
        _symbols_with_several_ensg,
        check_name="Duplicate gene symbols, different ENSG IDs:",
    )
    .check.function(
        _ensg_on_several_chroms,
        check_name="ENSG IDs on multiple chromosomes:",
    )
    .check.head()
)

<h5 style='text-align: left'><span style='color:None; background-color:None'>Allele type value counts:</span></h5>

Unnamed: 0_level_0,count
allele_type,Unnamed: 1_level_1
SNV,49368
Indel,4847


<h5 style='text-align: left'><span style='color:None; background-color:None'>Gene type value counts:</span></h5>

Unnamed: 0_level_0,count
gene_type,Unnamed: 1_level_1
Pseudogene,1748
snRNA,114


<h5 style='text-align: left'><span style='color:None; background-color:None'>Duplicate variants: 0</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Number of variants: 54215</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Number of genes: 1862</span></h5>

<h5 style='text-align: left'><span style='color:black; background-color:green'>All AC > 0</span></h5>

<h5 style='text-align: left'><span style='color:None; background-color:None'>Duplicate gene symbols, different ENSG IDs:</span></h5>

Unnamed: 0_level_0,count
symbol,Unnamed: 1_level_1
U6,31
U2,14
U4,7
U7,7
U1,3


<h5 style='text-align: left'><span style='color:None; background-color:None'>ENSG IDs on multiple chromosomes:</span></h5>

Unnamed: 0,chrom,ensg


<h5 style='text-align: left'><span style='color:None; background-color:None'>First 5 rows</span></h5>

Unnamed: 0,chrom,pos,ref,alt,ac,an,af,nhomalt,ac_nfe,an_nfe,af_nfe,nhomalt_nfe,af_popmax,allele_type,cadd_phred,ensg,symbol,length,gene_type
0,chr1,157840,ATCTC,A,1,29526,0.0,0,1,13154,0.0,0,0.0,Indel,1.546,ENSG00000222623.1,RNU6-1100P,104,Pseudogene
1,chr1,758239,A,T,2,146046,0.0,0,1,68022,0.0,0,0.0,SNV,1.479,ENSG00000223181.1,RNU6-1199P,104,Pseudogene
2,chr1,758240,T,C,2,146026,0.0,0,1,68014,0.0,0,0.0,SNV,1.903,ENSG00000223181.1,RNU6-1199P,104,Pseudogene
3,chr1,758245,T,C,4207,145902,0.029,168,2678,67982,0.039,79,0.039,SNV,1.843,ENSG00000223181.1,RNU6-1199P,104,Pseudogene
4,chr1,758254,A,G,1,146004,0.0,0,0,68002,0.0,0,0.0,SNV,2.97,ENSG00000223181.1,RNU6-1199P,104,Pseudogene


In [3]:
# Aggregations applied within each (ENSG ID, allele type) group. The gene
# annotations are taken from the group's first row; `n_variants` counts the
# rows with a non-null `chrom`.
per_group_aggs = {
    "symbol": pd.NamedAgg(column="symbol", aggfunc="first"),
    "gene_type": pd.NamedAgg(column="gene_type", aggfunc="first"),
    "length": pd.NamedAgg(column="length", aggfunc="first"),
    "n_variants": pd.NamedAgg(column="chrom", aggfunc="count"),
}

# Summarise variants per gene and allele type, add the count divided by gene
# length, report info and the first rows, and write the table (index included).
(
    df.groupby(["ensg", "allele_type"])
    .agg(**per_group_aggs)
    .assign(
        variants_per_nt=lambda counts: counts["n_variants"].div(counts["length"])
    )
    .check.info()
    .check.head()
    .check.write("data/final/snrna_variant_counts.tsv", index=True)
)

<h5 style='text-align: left'><span style='color:None; background-color:None'>Info</span></h5>

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3292 entries, ('ENSG00000194297.2', 'SNV') to ('ENSG00000286172.1', 'SNV')
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   symbol           3292 non-null   object 
 1   gene_type        3292 non-null   object 
 2   length           3292 non-null   int64  
 3   n_variants       3292 non-null   int64  
 4   variants_per_nt  3292 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 152.9+ KB



<h5 style='text-align: left'><span style='color:None; background-color:None'>First 5 rows</span></h5>

Unnamed: 0_level_0,Unnamed: 1_level_0,symbol,gene_type,length,n_variants,variants_per_nt
ensg,allele_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000194297.2,SNV,RNU1-75P,Pseudogene,129,31,0.24
ENSG00000195024.2,SNV,RNU1-15P,Pseudogene,144,32,0.222
ENSG00000199217.1,Indel,RNU6-1123P,Pseudogene,106,1,0.009
ENSG00000199217.1,SNV,RNU6-1123P,Pseudogene,106,23,0.217
ENSG00000199219.1,Indel,RNU6-500P,Pseudogene,107,3,0.028


Unnamed: 0_level_0,Unnamed: 1_level_0,symbol,gene_type,length,n_variants,variants_per_nt
ensg,allele_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000194297.2,SNV,RNU1-75P,Pseudogene,129,31,0.240310
ENSG00000195024.2,SNV,RNU1-15P,Pseudogene,144,32,0.222222
ENSG00000199217.1,Indel,RNU6-1123P,Pseudogene,106,1,0.009434
ENSG00000199217.1,SNV,RNU6-1123P,Pseudogene,106,23,0.216981
ENSG00000199219.1,Indel,RNU6-500P,Pseudogene,107,3,0.028037
...,...,...,...,...,...,...
ENSG00000283575.1,SNV,U6,snRNA,96,35,0.364583
ENSG00000283666.1,Indel,U6,snRNA,103,2,0.019417
ENSG00000283666.1,SNV,U6,snRNA,103,20,0.194175
ENSG00000286172.1,Indel,RNVU1-8,snRNA,141,11,0.078014


We want:
- Gene symbols
- Variant types (i.e. indel vs SNV)
- Unique variants
- snRNA vs Pseudogene
- Hardy–Weinberg equilibrium (HWE) statistics (autosomes only)
  - Strongest hits
  - Genes with the greatest number of genome-wide significant HWE outliers