# Mutation Statistics

We want to look at which types of mutations occur in our data. Specifically, we want to count how many of the mutations are SNPs, and how many positions are shared by more than one SNP.

In [8]:
import pandas as pd
import anndata
from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as ss
from functools import partial
from multiprocessing import Pool

In [9]:
# Presumably the number of embryos pooled in the input files (TODO confirm);
# not referenced by any of the cells shown in this notebook.
NUM_EMBRYOS = 3

# Paths of the two .h5ad files loaded with anndata.read_h5ad in the next cell.
CALLS_SAVE_PATH = 'data/calls_2022_07_28_umi_filtered.h5ad'
READS_SAVE_PATH = 'data/reads_2022_07_28_umi_filtered.h5ad'

In [10]:
# Load the calls and the reads AnnData objects from disk.
# NOTE: anndata warns for both files that observation names are not unique,
# so rows are addressed by integer position in the cells below.
variants_joined = anndata.read_h5ad(CALLS_SAVE_PATH)
reads_joined = anndata.read_h5ad(READS_SAVE_PATH)

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [4]:
# Spot check: row 11 of the reads matrix, restricted to the three single-base
# alternatives (T>A, T>C, T>G) listed for chr17:39847607.
reads_joined[:, ['chr17:39847607:T>A', 'chr17:39847607:T>C', 'chr17:39847607:T>G']].X[11]

ArrayView([ 8, 27,  8], dtype=int16)

In [5]:
# Same three chr17:39847607 columns, taken from the calls matrix this time.
xx = variants_joined[:, ['chr17:39847607:T>A', 'chr17:39847607:T>C', 'chr17:39847607:T>G']].X

In [7]:
# Row 11 of the calls slice (the same row index used for the reads matrix above).
xx[11]

ArrayView([3, 3, 3], dtype=int8)

In [15]:
# Grab the list of variants; each name looks like 'chr17:39847607:T>A',
# i.e. '<chromosome>:<position>:<change>'
variant_dat = list(variants_joined.var.index)

# Split every name once into its colon-separated fields
variant_fields = [variant.split(':') for variant in variant_dat]

variant_chrom = [fields[0] for fields in variant_fields]
variant_pos = [fields[1] for fields in variant_fields]
variant_types = [fields[2] for fields in variant_fields]

# 'chromosome:position' key shared by every variant recorded at the same locus
abs_pos = [f'{chrom}:{pos}' for chrom, pos in zip(variant_chrom, variant_pos)]

# Make a dataframe. Building it from a dict of columns avoids the round trip
# through a transposed numpy array, which yields float64 columns when the
# variant list is empty. All columns hold strings -- 'position' is
# deliberately kept as text, exactly as it appears in the variant name.
variant_df = pd.DataFrame({
    'chromosome': variant_chrom,
    'position': variant_pos,
    'abs_pos': abs_pos,
    'type': variant_types,
})

In [16]:
# Find which bases we begin with and what we end with in each variant,
# e.g. 'T>A,C' gives start_bases 'T' and end_bases 'A,C'
variant_df['start_bases'] = variant_df['type'].apply(lambda x: x.split('>')[0])
variant_df['end_bases'] = variant_df['type'].apply(lambda x: x.split('>')[1])

# If both the start base and end base are just a single base, we have a SNP.
# Compare string lengths column-wise instead of a row-wise apply with row[0] / row[1]:
# integer keys on a label-indexed row rely on a positional fallback that pandas has deprecated.
single_start_base = variant_df['start_bases'].str.len() == 1
single_end_base = variant_df['end_bases'].str.len() == 1
variant_df['is_SNP'] = single_start_base & single_end_base

In [26]:
# Number of variants recorded at each chromosome:position, keeping only the
# positions that carry more than one variant
val_counts = variant_df['abs_pos'].value_counts().loc[lambda counts: counts > 1]

# Pick one of the shared positions as an example and show all of its variants
ex = val_counts.index[4]
variant_df[variant_df['abs_pos'].eq(ex)]

Unnamed: 0,chromosome,position,abs_pos,type,start_bases,end_bases,is_SNP
9762,chr17,39847607,chr17:39847607,T>A,T,A,True
9763,chr17,39847607,chr17:39847607,"T>A,C",T,"A,C",False
9764,chr17,39847607,chr17:39847607,T>C,T,C,True
9765,chr17,39847607,chr17:39847607,"T>C,G",T,"C,G",False
9766,chr17,39847607,chr17:39847607,T>G,T,G,True


In [25]:
# Show the per-position variant counts for positions that carry more than one variant.
val_counts

chr5:142903500    9
chr9:109078158    7
chr1:171358045    6
chr7:13035460     5
chr17:39847607    5
                 ..
chr5:149047419    2
chr12:96922293    2
chr7:28822170     2
chr7:84615215     2
chr8:70700320     2
Name: abs_pos, Length: 190, dtype: int64

In [57]:
# Tally the SNPs against the full list of variants
num_variants = len(variant_df)
num_SNPS = variant_df.is_SNP.sum()

print(f'There are {num_SNPS} SNPS out of {num_variants} variants')

There are 23950 SNPS out of 27462 variants


In [52]:
# Restrict to SNPs and see how many variants share the same position
snps_df = variant_df.loc[variant_df.is_SNP, :].copy()

# Count the SNPs at every (chromosome, position) pair in a single pass, rather
# than re-filtering the whole frame once per chromosome
snp_position_counts = snps_df.groupby(['chromosome', 'position']).size()

# Note: this is the number of positions that carry more than one SNP,
# not the number of individual SNPs found at those positions
total_overlaps = int((snp_position_counts > 1).sum())

print(f'There are {total_overlaps} overlapping SNPs in the dataset')

There are 34 overlapping SNPs in the dataset


In [13]:
# Count the variants (of any type) at every (chromosome, position) pair in a
# single pass, rather than re-filtering the whole frame once per chromosome
variant_position_counts = variant_df.groupby(['chromosome', 'position']).size()

# Note: this is the number of positions that carry more than one variant,
# not the number of individual variants found at those positions
total_overlaps = int((variant_position_counts > 1).sum())

print(f'There are {total_overlaps} overlapping variants in the dataset')

There are 190 overlapping variants in the dataset
