# Maximum Likelihood Formulation


$P(K|r) = \prod_{f} P(K_f|r)$    ***We want to find the region $r$ that maximizes $P(K|r)$.***

$P(K_f|r) = \sum_{g_m, g_p \in G} P(g_p|k_p)\,P(g_m|k_m) \prod_{c} P(k_{c}|g_{phase}(g_{p}, g_{m}))$  Expression for the likelihood of a family's k-mer distribution.

$P(g_i|k) = \frac{P(k|g_i)}{\sum_{g_j \in G} P(k|g_j)}$  (Bayes' rule, assuming a uniform prior over the genotypes in $G$.)

Convert to log probability.

$\log P(K|r) = \sum_{f} \left ( \log \sum_{g_{m_f} \in G}\sum_{g_{p_f} \in G} P(k_{p_f}|g_{p_f})\,P(k_{m_f}|g_{m_f}) \prod_{c_f} P(k_{c_f}|g_{phase}(g_{p_f}, g_{m_f})) - \log \sum_{g_{p_f} \in G} P(k_{p_f}|g_{p_f}) - \log \sum_{g_{m_f} \in G} P(k_{m_f}|g_{m_f}) \right )$

$G = \left \{ 0/0,0/1,1/0,1/1 \right \}  $  Set of possible phased genotypes for k-mer.

$K$ = Distribution of a k-mer 

$r$ = region 

$f$ = family 

$c$ = children 

$g$ = phased genotype of k-mer 

$G$ = set of possible phased genotypes 

$g_{phase}$ = Phasing dictionary from Kelley's algorithm 

In [5]:
from scipy.stats import poisson
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SeqIO
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import json

# Directory with the phasing outputs read by this notebook: the region-index
# lookup tables (*.npy), the family table (fam_list.df) and phased_fams/.
PHASINGS_DIR='/home/groups/dpwall/briannac/alt_haplotypes/data/phasings/'
# Tab-separated sample table; read with its second column as the index and
# filtered on its 'status' column.
BAM_MAPPINGS_FILE = '/home/groups/dpwall/briannac/general_data/bam_mappings.csv'
# NOTE(review): the two k-mer paths below are not referenced by the simulation
# cells in this part of the notebook -- presumably used elsewhere; confirm.
KMER_COUNTS_FILE = '/home/groups/dpwall/briannac/alt_haplotypes/intermediate_files/ground_truth/sample_kmer_matrix/kmers.filt.tsv'
KMERS_FILE = '/home/groups/dpwall/briannac/alt_haplotypes/intermediate_files/ground_truth/sample_kmer_matrix/known_alt_haplotype_kmers_filt.txt'

# Set up k-mer simulation

In [19]:
# --- Lookup tables between family-level regions and global regions ----------
# NOTE(review): allow_pickle=True and pd.read_pickle unpickle arbitrary objects;
# only point these at files you trust.
def _load_phasing_npy(filename):
    """Load a pickled .npy object stored directly under PHASINGS_DIR."""
    return np.load(PHASINGS_DIR + filename, allow_pickle=True)


# Stored as 0-d object arrays, so .item() unwraps the contained Python object.
fam_region_to_idx = _load_phasing_npy('fam_region_to_idx.npy').item()
idx_to_fam_region = _load_phasing_npy('idx_to_fam_region.npy').item()
global_region_to_idx = _load_phasing_npy('global_region_to_idx.npy').item()
idx_to_global_region = _load_phasing_npy('idx_to_global_region.npy').item()

# These two are kept as the arrays they were saved as (no .item()).
fam_region_to_global_region = _load_phasing_npy('fam_regions_to_global_regions.npy')
global_region_to_fam_region = _load_phasing_npy('global_region_to_fam_regions.npy')

# Per-family table, indexed by family id.
family_info = pd.read_pickle(PHASINGS_DIR + 'fam_list.df')


# --- Sample metadata from the BAM mappings ----------------------------------
# Keep only the samples whose status marks them as having passed QC.
bam_mappings = pd.read_csv(BAM_MAPPINGS_FILE, sep='\t', index_col=1).loc[
    lambda mappings: mappings['status'] == 'Passed_QC_analysis_ready'
]
# The frame's index holds the sample ids; map each one to its participant id.
sample_id_to_participant = dict(zip(bam_mappings.index, bam_mappings.participant_id))


# --- Seed the global NumPy RNG used by the simulation cells -----------------
np.random.seed(42)

In [20]:
def simulate_kmer_family(phased_genotypes_children, kmer_pop_freq, avg_reads_fam, min_count=2, rng=None):
    """Simulate the read counts of one k-mer across the members of one family.

    Each of the four parental haplotypes (two maternal, two paternal) carries
    the k-mer independently with probability ``kmer_pop_freq``.  Every child
    receives one maternal and one paternal haplotype, selected by the phasing
    indices in ``phased_genotypes_children``.  A member's observed count is
    Poisson-distributed with mean ``(number of k-mer copies) * avg_reads_fam``.

    Parameters
    ----------
    phased_genotypes_children : iterable of (g_m, g_p)
        One pair per child; ``g_m`` indexes the mother's two haplotypes and
        ``g_p`` the father's, so both must be 0 or 1.
    kmer_pop_freq : float
        Probability that any single parental haplotype carries the k-mer.
    avg_reads_fam : float
        Expected number of reads contributed by each copy of the k-mer.
    min_count : int, optional
        Counts strictly below this value are reported as 0.  The default of 2
        reproduces the original behaviour of zeroing out counts of exactly 1.
    rng : optional
        Object providing ``random()`` and ``poisson(lam)`` (for example a
        ``numpy.random.Generator``).  Defaults to the global ``numpy.random``
        state, so results remain controlled by ``np.random.seed``.

    Returns
    -------
    list
        Counts ordered as ``[mother, father, child_1, ..., child_n]``.
    """
    rng = np.random if rng is None else rng

    # Draw order (mother hap 0, mother hap 1, father hap 0, father hap 1, then
    # the counts for mother, father, children) is kept fixed so that a seeded
    # run yields the same stream as the original implementation.
    genotype_m = (rng.random() < kmer_pop_freq, rng.random() < kmer_pop_freq)
    genotype_p = (rng.random() < kmer_pop_freq, rng.random() < kmer_pop_freq)
    children_genotypes = [(genotype_m[g_m], genotype_p[g_p]) for g_m, g_p in phased_genotypes_children]

    # Number of haplotypes (0, 1 or 2) carrying the k-mer in each family member.
    copy_numbers = [sum(genotype_m), sum(genotype_p)] + [sum(g) for g in children_genotypes]
    kmer_counts = [rng.poisson(n_copies * avg_reads_fam) for n_copies in copy_numbers]

    # Suppress counts below the threshold (singletons by default).
    return [k if k >= min_count else 0 for k in kmer_counts]

In [30]:
N_KMERS_TO_SIMULATE = 1000
# Sample distinct global-region indices (without replacement); each one stands
# for one simulated k-mer.
selected_kmers = np.random.choice(list(range(len(global_region_to_idx))), N_KMERS_TO_SIMULATE, replace=False)
# Give every selected k-mer a population frequency drawn uniformly from [0, 1).
kmer_freq = {global_idx: np.random.random() for global_idx in selected_kmers}
# Frequencies in the dict's insertion order, i.e. the order of selected_kmers.
freqs = list(kmer_freq.values())
# For each k-mer, the family-level regions associated with its global region.
family_idxs = [
    [idx_to_fam_region[fam_region_idx] for fam_region_idx in global_region_to_fam_region[global_idx]]
    for global_idx in kmer_freq
]

In [31]:
# Poisson mean per copy of a k-mer; passed to simulate_kmer_family as avg_reads_fam.
avg_read_count = 7.05
phased_fams_dir = PHASINGS_DIR + 'phased_fams/'
phased_fam_ids = [i.replace('.txt', '').split('/')[-1] for i in glob(phased_fams_dir + '*.txt')]
# Sort the families: iterating a set of strings follows hash order, which changes
# between kernel sessions, so the RNG would be consumed in a different order
# (and columns written in a different order) on every run despite the seed.
fams_included = sorted(set(family_info.index).intersection(phased_fam_ids))
f = family_idxs[0]  # not used in this cell; kept in case other cells reference it
kmer_counts = []  # one row of simulated counts per sample, aligned with `columns`
columns = []      # sample ids in the order their rows were appended
for fam in tqdm(fams_included):
    # Extract mom, dad, and child sample_ids.
    fam_row = family_info.loc[fam]
    children = fam_row.sib_samples
    mom = fam_row.mother_sample
    dad = fam_row.father_sample

    phased_fam = pd.read_csv(phased_fams_dir + ('%s.txt' % fam), sep='\t')

    # Skip this family if any child has no '<participant>_mat' phasing column,
    # printing each missing column name.
    missing_columns = [sample_id_to_participant[ch] + '_mat' for ch in children
                       if sample_id_to_participant[ch] + '_mat' not in phased_fam.columns]
    if missing_columns:
        for missing_column in missing_columns:
            print(missing_column)
        continue

    # Index the phasing table by a sortable region key: two-character chromosome
    # ('0X' is rewritten to 'XX') plus zero-padded start and end positions.
    phased_fam.index = ['%s.%09d.%09d' % (('0' + chrom[3:])[-2:].replace('0X', 'XX'), start, end) for chrom, start, end in phased_fam[['chrom', 'start_pos', 'end_pos']].values]

    # Find family regions that global regions correspond to (if no family region ie phasing is unknown, then pick a random region.)
    # The set is built once per family instead of once per k-mer, and the first
    # matching candidate is taken so the choice does not depend on set ordering.
    fam_region_set = set(phased_fam.index)
    fam_regions = []
    for candidate_regions in family_idxs:
        known_region = next((region for region in candidate_regions if region in fam_region_set), None)
        fam_regions.append(known_region if known_region is not None else np.random.choice(phased_fam.index))

    # Set up phasing: per region, one (maternal, paternal) haplotype index pair per child.
    # The '_pat' value is shifted down by 2 -- presumably paternal haplotypes are
    # stored as 2/3 while simulate_kmer_family expects 0/1; verify against the phasing files.
    child_columns = [(sample_id_to_participant[ch] + '_mat', sample_id_to_participant[ch] + '_pat') for ch in children]
    phased_genotypes_children = [[(phased_fam[mat_col].loc[r], phased_fam[pat_col].loc[r] - 2) for mat_col, pat_col in child_columns]
                                 for r in fam_regions]

    # Simulate every k-mer for this family, then transpose so each row is one
    # sample in the order [mom, dad, children...].
    fam_counts = np.array([simulate_kmer_family(phased_genotype_children, kmer_pop_freq, avg_read_count) for
                           phased_genotype_children, kmer_pop_freq in zip(phased_genotypes_children, freqs)])
    # extend() appends in place; rebuilding the lists with `+` copied them on every iteration.
    kmer_counts.extend(fam_counts.T.tolist())
    columns.extend([mom, dad])
    columns.extend(children)

 71%|███████   | 517/728 [02:35<00:58,  3.60it/s]

AU008003_mat


100%|██████████| 728/728 [03:41<00:00,  3.28it/s]


In [32]:
# kmer_counts holds one row per sample; transpose so rows are simulated k-mers
# and columns are samples.
df = pd.DataFrame(kmer_counts).transpose()
df.columns = columns
# Label each row with its (global region index, simulated population frequency).
df.index = [(global_idx, kmer_freq[global_idx]) for global_idx in selected_kmers]
df.to_csv(
    '/home/groups/dpwall/briannac/alt_haplotypes/results/simulated_data/kmer_counts.tsv',
    sep='\t',
)