In [11]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm


N = 1000  # Number of locations to be simulated.
np.random.seed(42)  # Seed the global NumPy generator so every draw below is repeatable.

# Load the bam_mappings metadata file (tab-separated, indexed by its second
# column), keep only the samples whose status is 'Passed_QC_analysis_ready',
# and drop the row labelled '09C86428'.
BAM_MAPPINGS_FILE = '/home/groups/dpwall/briannac/general_data/bam_mappings.csv'
bam_mappings = (
    pd.read_csv(BAM_MAPPINGS_FILE, sep='\t', index_col=1)
    .loc[lambda mappings: mappings['status'] == 'Passed_QC_analysis_ready']
    .drop('09C86428')
)


# Directories holding the phasing inputs and the simulated-count outputs.
PHASINGS_DIR = '/home/groups/dpwall/briannac/alt_haplotypes/data/phasings/'
OUT_DIR = '/home/groups/dpwall/briannac/alt_haplotypes/data/counts/'


def _load_phasing_file(file_name):
    """Load one pickled NumPy file that lives directly under PHASINGS_DIR."""
    return np.load(PHASINGS_DIR + file_name, allow_pickle=True)


# Load phasings information.
# `.item()` unwraps the single Python object stored in each of these files
# (presumably a dict; global_region_to_idx is used as one further down).
fam_region_to_idx = _load_phasing_file('fam_region_to_idx.npy').item()
fam_idx_to_region = _load_phasing_file('idx_to_fam_region.npy').item()
global_region_to_idx = _load_phasing_file('global_region_to_idx.npy').item()
# These two are kept as the arrays np.load returns and are indexed by position.
fam_idx_to_global_idx = _load_phasing_file('fam_regions_to_global_regions.npy')
global_idx_to_fam_idx = _load_phasing_file('global_region_to_fam_regions.npy')
# Per-family table; later code reads mother_sample, father_sample and sib_samples from it.
family_info = pd.read_pickle(PHASINGS_DIR + 'fam_list.df')

# Get the simulated locations and a frequency for each of them.
# NOTE: the candidate list passed to np.random.choice holds exactly one entry
# (the last key of global_region_to_idx), so all N "random" locations are that
# same region. The output file names suggest this is the Y-chromosome
# simulation -- TODO confirm that the last key is the intended region.
# The np.random.choice call is left in place so the sequence of np.random
# calls made by this notebook is unchanged.
last_region = list(global_region_to_idx.keys())[-1]
locations = np.random.choice([last_region], N, replace=True)
freqs = np.random.random(N)  # One uniform [0, 1) frequency per location.
location_idxs = [global_region_to_idx[l] for l in locations]
# For every location, the family-level regions that map onto its global region.
fam_regions = [[fam_idx_to_region[g] for g in global_idx_to_fam_idx[i]] for i in location_idxs]


# Save locations to file: one "<location>\t<frequency>" line per simulated location.
with open(OUT_DIR + 'synthetic_data_locations_y.txt', 'w') as f:
    for l, fr in zip(locations, freqs):
        # write(), not writelines(): writelines() treats a str as an iterable
        # of one-character "lines".
        f.write(l + '\t' + str(fr) + '\n')
        
        
# Get k-mer depth dictionary: for every sample in bam_mappings, the average
# number of k-mers of length kmer_length covering a position.
kmer_length=100
ihart_flagstat_file = '/home/groups/dpwall/briannac/blood_microbiome/data/ihart_flagstat.csv'
flagstat = pd.read_csv(ihart_flagstat_file, index_col=0)
# Keep only the samples that are also present in bam_mappings. A boolean mask
# is used because indexing .loc with a set is rejected by pandas >= 2.0 (and a
# set has no defined order, so the row order was arbitrary).
flagstat = flagstat.loc[flagstat.index.isin(bam_mappings.index)]
sex = bam_mappings.loc[flagstat.index].sex_numeric
# Properly paired reads, scaled by the fraction of all reads that are neither
# supplementary nor duplicates.
total_mapped_reads = flagstat.ProperPair*((flagstat.Total_Reads-flagstat.Supplementary-flagstat.Duplicates)/flagstat.Total_Reads)
# Coverage = reads * 150 / genome size. 150 is presumably the read length in bp
# and 6.27e9 / 6.37e9 the genome sizes used for sex codes 1 and 2 -- TODO confirm.
# NOTE(review): a sex code other than 1 or 2 makes the denominator 0, which
# yields an infinite coverage for that sample.
avg_coverage = total_mapped_reads*150/(6.27e9*(sex.astype(float)==1.0) + 6.37e9*(sex.astype(float)==2.0))
avg_n_100mers = (150-kmer_length)/(150/avg_coverage)
# Samples without a flagstat row fall back to the mean over the samples that
# have one; compute that mean once instead of once per missing sample.
mean_n_100mers = np.mean(avg_n_100mers.values)
kmer_depth_dict = {k: avg_n_100mers[k] if k in avg_n_100mers else mean_n_100mers for k in bam_mappings.index}


# Load the per-family phasing tables (a mapping keyed by family, iterated by
# the simulation loop).
# NOTE: pickle.load can execute arbitrary code, so this file must come from a
# trusted location.
PHASED_FAMS_PICKLE = '/home/groups/dpwall/briannac/alt_haplotypes/data/phasings/phased_fams/phased_fams_all.pickle'
with open(PHASED_FAMS_PICKLE, 'rb') as phased_fams_handle:
    phased_fam_dict = pickle.load(phased_fams_handle)
        
# Simulate k-mer counts.
PARENT_COUNT_MEAN = 5  # Poisson mean of the per-copy k-mer count drawn for each parent.


def _simulate_haplotype(repeat_avg):
    """Draw the copy number of the simulated sequence on one parental haplotype.

    Location ``i`` carries the sequence when a uniform draw is ``<= freqs[i]``.
    Carriers get exactly one copy when ``repeat_avg == 1`` and a
    Poisson(``repeat_avg``) number of copies otherwise; non-carriers get 0.
    Reads the module-level ``N`` and ``freqs`` and advances the global NumPy
    random state (uniform draw first, then the Poisson draw if any).

    Returns:
        A new integer array of length ``N``.
    """
    carries_sequence = np.random.random(N) <= freqs
    if repeat_avg == 1:
        return carries_sequence * 1
    return carries_sequence * np.random.poisson(repeat_avg, N)


# The candidate family regions of each location depend neither on the family
# nor on REPEAT_AVG, so turn them into sets once instead of once per family.
fam_region_sets = [set(regions) for regions in fam_regions]
location_rows = np.arange(N)

for REPEAT_AVG in [1,10,100]:
    print(REPEAT_AVG)
    # Collect one column per sample and build the DataFrame once at the end;
    # inserting thousands of columns one by one fragments the frame and
    # triggers pandas' PerformanceWarning.
    sample_columns = {}

    for fam in tqdm(list(phased_fam_dict.keys())):
        if fam not in family_info.index:
            continue
        fam_row = family_info.loc[fam]
        mother_sample = fam_row.mother_sample
        father_sample = fam_row.father_sample
        children_samples = fam_row.sib_samples

        # For every location pick the family region to read the phasing from:
        # the unique candidate present in this family's phasing table, or a
        # random row of that table when there is no unique match.
        fam_phasings = phased_fam_dict[fam]
        phased_regions = set(fam_phasings.index)
        region_matches = [candidates & phased_regions for candidates in fam_region_sets]
        current_fam_regions = [
            next(iter(match)) if len(match) == 1 else np.random.choice(fam_phasings.index)
            for match in region_matches
        ]
        phased_fam = fam_phasings.loc[current_fam_regions][children_samples].values
        # Each cell is a pair; after this step phased_fam has shape
        # (locations, children, 2) holding (maternal, paternal) haplotype indices.
        # NOTE(review): values are reduced mod 4 but then index 2-column arrays
        # below, so they are presumably always 0 or 1 -- confirm.
        phased_fam = np.array([[(p[0]%4, p[1]%4) for p in pp] for pp in phased_fam])

        # Draw order (mom_g1, dad_g1, mom_g2, dad_g2) is kept so the random
        # stream matches the original notebook.
        mom_g1 = _simulate_haplotype(REPEAT_AVG)
        dad_g1 = _simulate_haplotype(REPEAT_AVG)
        mom_g2 = _simulate_haplotype(REPEAT_AVG)
        dad_g2 = _simulate_haplotype(REPEAT_AVG)

        # Correct for X and Y chromosomes: on regions whose name contains 'X'
        # the father's second haplotype is empty; on regions whose name
        # contains 'Y' only the father's second haplotype can carry copies.
        on_x = np.array(['X' in r for r in current_fam_regions], dtype=bool)
        on_y = np.array(['Y' in r for r in current_fam_regions], dtype=bool)
        dad_g2[on_x] = 0
        for haplotype in (mom_g1, mom_g2, dad_g1):
            haplotype[on_y] = 0

        # Parents' genotypes as (locations, 2) arrays: column 0 is g1, column 1 is g2.
        mom_gts = np.column_stack([mom_g1, mom_g2])
        dad_gts = np.column_stack([dad_g1, dad_g2])

        # Simulate counts based on genotype and phasing.
        # NOTE(review): parents get Poisson(PARENT_COUNT_MEAN) counts per copy,
        # whereas children get a deterministic sample-depth * copies value.
        mom_counts = mom_g1*np.random.poisson(PARENT_COUNT_MEAN, N)+mom_g2*np.random.poisson(PARENT_COUNT_MEAN, N)
        dad_counts = dad_g1*np.random.poisson(PARENT_COUNT_MEAN, N)+dad_g2*np.random.poisson(PARENT_COUNT_MEAN, N)

        # Record the columns for this family (mother, father, then children).
        sample_columns[mother_sample] = mom_counts
        sample_columns[father_sample] = dad_counts
        for ch_idx, child in enumerate(children_samples):
            # Copies inherited from the phased maternal and paternal haplotypes.
            inherited_copies = (mom_gts[location_rows, phased_fam[:, ch_idx, 0]] +
                                dad_gts[location_rows, phased_fam[:, ch_idx, 1]])
            sample_columns[child] = kmer_depth_dict[child] * inherited_copies

    # One row per simulated location, one column per sample.
    counts = pd.DataFrame(sample_columns)
    counts.to_csv(OUT_DIR + 'synthetic_data_y_chrom_%i_repeats.tsv' % REPEAT_AVG, sep='\t')

 81%|████████▏ | 592/727 [00:53<00:07, 18.00it/s]
  0%|          | 0/727 [00:00<?, ?it/s][A
  0%|          | 2/727 [00:00<00:42, 17.10it/s][A

1



  1%|          | 4/727 [00:00<00:43, 16.71it/s][A
  1%|          | 8/727 [00:00<00:37, 19.05it/s][A
  2%|▏         | 11/727 [00:00<00:34, 20.47it/s][A
  2%|▏         | 14/727 [00:00<00:33, 21.21it/s][A
  2%|▏         | 16/727 [00:00<00:37, 19.21it/s][A
  2%|▏         | 18/727 [00:00<00:37, 18.90it/s][A
  3%|▎         | 20/727 [00:00<00:38, 18.16it/s][A
  3%|▎         | 22/727 [00:01<00:41, 17.07it/s][A
  3%|▎         | 24/727 [00:01<00:42, 16.49it/s][A
  4%|▎         | 26/727 [00:01<00:43, 16.18it/s][A
  4%|▍         | 28/727 [00:01<00:41, 16.66it/s][A
  4%|▍         | 30/727 [00:01<00:40, 17.07it/s][A
  5%|▍         | 33/727 [00:01<00:39, 17.66it/s][A
  5%|▍         | 35/727 [00:01<00:43, 16.00it/s][A
  5%|▌         | 37/727 [00:02<00:41, 16.63it/s][A
  5%|▌         | 39/727 [00:02<00:42, 16.00it/s][A
  6%|▌         | 41/727 [00:02<00:44, 15.25it/s][A
  6%|▌         | 43/727 [00:02<00:54, 12.49it/s][A
  6%|▌         | 45/727 [00:02<00:51, 13.25it/s][A
  7%|▋       

In [13]:
# Display the counts table left by the last REPEAT_AVG iteration
# (one row per simulated location, one column per sample).
counts

Unnamed: 0,03C17241,03C17188,03C17189,03C17190,03C17191,08C74093,08C74092,08C74094,08C74095,03C16648,...,09C86312,05C46098,05C46097,05C46096,05C46099,04C30227,04C30228,04C30220,04C30222,04C30229
0,0,0,0.000000,0.0,0.000000,0,8,5.650537,5.159653,0,...,6.052551,0,0,0.000000,0.0,0,0,0.00000,0.000000,0.000000
1,0,9,7.797107,0.0,6.671554,0,8,5.650537,5.159653,0,...,6.052551,0,1,5.450209,0.0,0,5,6.21884,6.081589,5.917309
2,0,0,0.000000,0.0,0.000000,0,8,5.650537,5.159653,0,...,6.052551,0,0,0.000000,0.0,0,3,6.21884,6.081589,5.917309
3,0,2,7.797107,0.0,6.671554,0,3,5.650537,5.159653,0,...,0.000000,0,4,5.450209,0.0,0,0,0.00000,0.000000,0.000000
4,0,3,7.797107,0.0,6.671554,0,0,0.000000,0.000000,0,...,0.000000,0,0,0.000000,0.0,0,0,0.00000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0.000000,0.0,0.000000,0,0,0.000000,0.000000,0,...,0.000000,0,0,0.000000,0.0,0,0,0.00000,0.000000,0.000000
996,0,4,7.797107,0.0,6.671554,0,2,5.650537,5.159653,0,...,6.052551,0,6,5.450209,0.0,0,5,6.21884,6.081589,5.917309
997,0,0,0.000000,0.0,0.000000,0,0,0.000000,0.000000,0,...,0.000000,0,0,0.000000,0.0,0,0,0.00000,0.000000,0.000000
998,0,5,7.797107,0.0,6.671554,0,9,5.650537,5.159653,0,...,6.052551,0,2,5.450209,0.0,0,4,6.21884,6.081589,5.917309


In [16]:
# Look up the sex_numeric value of every sample that has a column in counts.
sex[counts.columns]

03C17241    2.0
03C17188    1.0
03C17189    1.0
03C17190    2.0
03C17191    1.0
           ... 
04C30227    2.0
04C30228    1.0
04C30220    1.0
04C30222    1.0
04C30229    1.0
Name: sex_numeric, Length: 3313, dtype: object