In [5]:
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import json

# Directory (with trailing slash) from which the region-index .npy files and fam_list.df are read.
PHASINGS_DIR='/home/groups/dpwall/briannac/alt_haplotypes/data/phasings/'
# Tab-separated sample table; the cells below use its 'status' and 'participant_id' columns.
BAM_MAPPINGS_FILE = '/home/groups/dpwall/briannac/general_data/bam_mappings.csv'
# Gzipped TSV of k-mer counts that is read with pd.read_table.
KMER_COUNTS_FILE = '/home/groups/dpwall/briannac/alt_haplotypes/intermediate_files/kmers/kmers.0.tsv.gz'

In [6]:
# Load in family region/global region conversion data.
def _load_phasing_object(filename):
    """Load one .npy file from PHASINGS_DIR and unwrap its single stored object with .item()."""
    return np.load(PHASINGS_DIR + filename, allow_pickle=True).item()


fam_region_to_idx = _load_phasing_object('fam_region_to_idx.npy')
idx_to_fam_region = _load_phasing_object('idx_to_fam_region.npy')
global_region_to_idx = _load_phasing_object('global_region_to_idx.npy')
idx_to_global_region = _load_phasing_object('idx_to_global_region.npy')

# NOTE(review): this file is kept as the raw np.load result (no .item()), unlike the
# four objects above -- confirm whether the array or the unwrapped object is wanted.
fam_region_to_global_region = np.load(
    PHASINGS_DIR + 'fam_regions_to_global_regions.npy', allow_pickle=True
)

# Test pipeline on a family

In [7]:
# Read the sample table (indexed by its second column) and keep only QC-passed rows.
bam_mappings = (
    pd.read_csv(BAM_MAPPINGS_FILE, sep='\t', index_col=1)
    .loc[lambda frame: frame['status'] == 'Passed_QC_analysis_ready']
)
# Lookup from the table's index value to its participant_id.
sample_id_to_participant = dict(zip(bam_mappings.index, bam_mappings.participant_id))

fam_number = 1 #int(sys.argv[1])
family_info = pd.read_pickle(PHASINGS_DIR + 'fam_list.df')
fam_row = family_info.iloc[fam_number]  # the single family this notebook works on
fam = fam_row.fam
mom = sample_id_to_participant[fam_row.mother_sample]
dad = sample_id_to_participant[fam_row.father_sample]
children = [sample_id_to_participant[sib] for sib in fam_row.sib_samples]

In [10]:
# Read only the first rows of the k-mer count table.
# pd.read_table returns an iterator of chunks only when `chunksize`/`iterator` is
# passed; without it the whole gzipped file is parsed before anything is returned and
# iterating the result walks column labels rather than row chunks.  `nrows` gives the
# intended "first chunk only" behaviour directly.
KMER_PREVIEW_ROWS = 100000  # number of k-mer rows to load; adjust as needed
kmer_counts = pd.read_table(KMER_COUNTS_FILE, nrows=KMER_PREVIEW_ROWS)
# Relabel the columns from sample ids to participant ids.
kmer_counts.columns = [sample_id_to_participant[c] for c in kmer_counts.columns]

KeyboardInterrupt: 

In [9]:
# Display the k-mer count table loaded in the previous cell.
kmer_counts

Unnamed: 0_level_0,AU1005202,AU1005302,AU1005301,AU1004202,AU1004301,AU0918302,AU0918202,AU0918301,AU0918201,AU0918303,...,AU4237201,AU4237202,AU4237303,AU4237304,AU4138202,AU3859201,AU3859202,AU3859303,AU3859301,AU3859302
02C10540,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
02C10540,02C10541,02C10542,02C10543,02C10702,02C10704,02C10855,02C10856,02C10857,02C10858,02C10859,...,MH0138051,MH0138052,MH0138054,MH0138055,MH0138989,MH0143008,MH0143009,MH0143013,MH0143018,MH0143019
22951,53783,18408,17742,4979,53161,83963,109072,101577,75124,87546,...,,,,,,,,,,250845
0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0


In [32]:
######### Come up with possible phasings that correspond to k-mers #######

def _group_kmers_by_sib_sets(df_parent, sibs, ordered):
    """Collapse k-mers that share the same sibling presence/absence pattern.

    Parameters
    ----------
    df_parent : DataFrame
        One boolean column per sibling (True = sibling has the k-mer) plus a
        'kmer' column.
    sibs : sequence of str
        Sibling column names.
    ordered : bool
        True  -> group key is the tuple (sibs_with_kmer, sibs_without_kmer).
        False -> group key is the unordered frozenset
                 {sibs_with_kmer, sibs_without_kmer}, so a pattern and its
                 complement end up in the same group.

    Returns
    -------
    DataFrame indexed by 'sets' with columns 'kmer' (set of k-mers) and
    'n_kmers', sorted by 'n_kmers' in descending order.
    """
    # `sibs` may be a plain list, which cannot be indexed with a boolean/integer
    # array; index an ndarray view of the names instead.
    sib_names = np.asarray(sibs)
    grouped = df_parent.groupby(list(sibs)).aggregate(list)

    sib_sets = []
    for pattern in grouped.index:
        # atleast_1d: with a single sibling the group key is a scalar, not a tuple.
        has_kmer = np.atleast_1d(np.asarray(pattern, dtype=bool))
        with_kmer = frozenset(sib_names[has_kmer])
        without_kmer = frozenset(sib_names[~has_kmer])
        if ordered:
            sib_sets.append((with_kmer, without_kmer))
        else:
            sib_sets.append(frozenset([with_kmer, without_kmer]))
    grouped['sets'] = sib_sets

    by_sets = grouped.groupby('sets').aggregate(lambda x: list(x))
    by_sets['kmer'] = [set(np.concatenate(k)) for k in by_sets['kmer']]
    by_sets['n_kmers'] = [len(k) for k in by_sets.kmer]
    return by_sets.sort_values('n_kmers', ascending=False)


# Presence/absence of every k-mer in each family member (missing counts treated as 0).
df_all = kmer_counts[list(children) + [mom, dad]].fillna(0) > 0

# Impossible inheritance patterns, kmer is something weird.
df_impossible = df_all[~df_all[mom] & ~df_all[dad]]
df_all = df_all[df_all[mom] | df_all[dad]]

# Unclassifiable: Kmer is in all siblings, so parent could be homozygous kmer.
n_sibs_with_kmer = df_all[list(children)].sum(axis=1)
df_unclassifiable = df_all[n_sibs_with_kmer == len(children)]
# .copy() so that adding the 'kmer' column below writes to an independent frame.
df_all = df_all[n_sibs_with_kmer != len(children)].copy()

df_all['kmer'] = df_all.index


### Perform aggregation to get sets of sibs with shared maternal/paternal k-mer ####
### Final index is {sibs_with_kmer, sibs_without_kmer}, value is kmers.
# Extract maternally inherited, paternally inherited, and either inherited
df_mat = df_all[df_all[mom] & ~df_all[dad]].drop([mom, dad], axis=1)
df_pat = df_all[~df_all[mom] & df_all[dad]].drop([mom, dad], axis=1)
df_both = df_all[df_all[mom] & df_all[dad]].drop([mom, dad], axis=1)

# Compute sib sets for maternally and paternally inherited k-mers (unordered key).
df_mat = _group_kmers_by_sib_sets(df_mat, children, ordered=False)
df_pat = _group_kmers_by_sib_sets(df_pat, children, ordered=False)

# Compute sib sets for both.  This has a slightly different structure: Index is [{sibs_with_kmer}, {sibs_without_kmer}]
df_both = _group_kmers_by_sib_sets(df_both, children, ordered=True)

KeyError: "None of [Index(['AU002203', 'AU002204', 'AU002205', 'AU002201', 'AU002202'], dtype='object')] are in the [columns]"

# Phasing

In [37]:
def _phased_sib_sets(phased, sibs, suffix, hap_a, hap_b):
    """Group regions by which siblings carry each of one parent's two haplotypes.

    Parameters
    ----------
    phased : DataFrame
        Must contain a '<sib><suffix>' column for every sibling and a 'region' column.
    sibs : sequence of str
        Sibling names; haplotype columns are taken in exactly this order so that
        positions in a group key line up with `sibs`.
    suffix : str
        '_mat' or '_pat'.
    hap_a, hap_b : int
        The two haplotype codes compared against the column values.

    Returns
    -------
    DataFrame indexed by 'sets' = (sibs with hap_a, sibs with hap_b), with a
    'region' column holding the flat list of regions showing that split.
    """
    # `sibs` may be a plain list; index an ndarray view of the names instead.
    sib_names = np.asarray(sibs)
    hap_cols = [sib + suffix for sib in sibs]
    grouped = phased[hap_cols + ['region']].groupby(hap_cols).aggregate(list)

    sib_sets = []
    for key in grouped.index:
        # atleast_1d: with a single sibling the group key is a scalar, not a tuple.
        haps = np.atleast_1d(np.asarray(key))
        sib_sets.append((frozenset(sib_names[haps == hap_a]), frozenset(sib_names[haps == hap_b])))
    grouped['sets'] = sib_sets

    grouped = grouped.groupby('sets').aggregate(list)
    grouped['region'] = [[r for regions in nested for r in regions] for nested in grouped['region']]
    return grouped


def _regions_matching_sib_split(kmer_sib_sets, phased_sets):
    """For each k-mer sibling split, list the regions whose haplotype split is compatible.

    A region is compatible when one haplotype's siblings are all on one side of
    the k-mer split and the other haplotype's siblings are all on the other side.
    The test is symmetric, so the (arbitrary) order of the two sides does not matter.

    Returns a list of region lists, aligned with `kmer_sib_sets`.
    """
    matches = []
    for sib_split in kmer_sib_sets:
        sides = list(sib_split)
        side_a = sides[0]
        side_b = sides[1]
        regions_for_split = []
        for hap_sets, regions in zip(phased_sets.index, phased_sets.region):
            hap_a_sibs = hap_sets[0]
            hap_b_sibs = hap_sets[1]
            if (hap_a_sibs <= side_a and hap_b_sibs <= side_b) or (hap_b_sibs <= side_a and hap_a_sibs <= side_b):
                regions_for_split.extend(regions)
        matches.append(regions_for_split)
    return matches


phased_fam = pd.read_csv('/oak/stanford/groups/dpwall/users/kpaskov/PhasingFamilies/phased_ihart/%s.phased.txt' % fam, sep='\t')
phased_fam['region'] = ['%s.%i.%i' % (chrom.replace('chr', ''), int(i),int(j)) for chrom,i,j in zip(phased_fam.chrom, phased_fam.start_pos, phased_fam.end_pos)]
phased_fam = phased_fam.drop([mom+'_mat', dad + '_mat', mom + '_pat', dad + '_pat'], axis=1)

# Get phased groupings: one (mat, pat) tuple per child and region, missing phase -> -1.
for child in children:
    phased_fam[child] = list(zip(phased_fam[child + '_mat'].replace(np.nan, -1),
                                 phased_fam[child + '_pat'].replace(np.nan, -1)))

# Phased from mat groupings (haplotype codes 0 / 1).
phased_fam_mat = _phased_sib_sets(phased_fam, children, '_mat', 0, 1)

# Phased fam pat (haplotype codes 2 / 3).
phased_fam_pat = _phased_sib_sets(phased_fam, children, '_pat', 2, 3)

# Phased fam both
phased_fam_both = phased_fam[list(children) + ['region']].groupby(list(children)).aggregate(list)


# Process regions/kmers inherited from both/either parent.
sample_name_to_num = {j:i for i,j in enumerate(phased_fam_both.index.names)}
possible_regions_both = []
for child_with_kmer_index in df_both.index:
    sibs_with_kmer = child_with_kmer_index[0]
    sibs_without_kmer = child_with_kmer_index[1]
    possible_regions_ = []
    for phased_fam_index, region in zip(phased_fam_both.index, phased_fam_both.region):
        if len(sibs_with_kmer) == 0:
            # Edge case where both parents have k-mer and no children do: keep a region
            # only if the children do not show both maternal haplotypes and do not
            # show both paternal haplotypes.
            # NOTE(review): the general rule below is vacuously true when no child has
            # the k-mer, so it used to add every region on top of this filter; the two
            # branches are now exclusive -- confirm this matches the intended model.
            mat_haps = {float(phase[0]) for phase in phased_fam_index}
            pat_haps = {float(phase[1]) for phase in phased_fam_index}
            compatible = (not (0.0 in mat_haps and 1.0 in mat_haps)) and (not (2.0 in pat_haps and 3.0 in pat_haps))
        else:
            # General rule: no child with the k-mer may have the same (mat, pat)
            # phase as a child without it.
            compatible = not any(
                phased_fam_index[sample_name_to_num[has_kmer]] == phased_fam_index[sample_name_to_num[no_kmer]]
                for has_kmer in sibs_with_kmer
                for no_kmer in sibs_without_kmer
            )
        if compatible:
            possible_regions_.extend(region)
    possible_regions_both.append(possible_regions_)

# Process regions/kmers inherited from mom.
possible_regions_mat = _regions_matching_sib_split(df_mat.index, phased_fam_mat)

# Process regions/kmers inherited from dad.
possible_regions_pat = _regions_matching_sib_split(df_pat.index, phased_fam_pat)


##### Create family vote matrix #####
# One row per k-mer, one column per phased region plus two flag columns.
# The 'ambigious' spelling is kept as-is because it is a column label.
family_vote = pd.DataFrame(
    np.zeros((len(kmer_counts), len(phased_fam.region)+2)),
    index=kmer_counts.index,
    columns=list(phased_fam.region) + ['impossible', 'ambigious'],
)
family_vote.loc[df_impossible.index,'impossible']=1
family_vote.loc[df_unclassifiable.index,'ambigious']=1
for kmer_groups, region_lists in ((df_pat, possible_regions_pat),
                                  (df_mat, possible_regions_mat),
                                  (df_both, possible_regions_both)):
    for kmers, regions in zip(kmer_groups.kmer, region_lists):
        # .loc does not accept a set as an indexer in current pandas; pass a list.
        family_vote.loc[list(kmers), regions] = 1

# Match phasings to kmers

In [41]:
# Allocate an all-zero matrix with one row per k-mer row and one column per
# entry of global_region_to_idx, then display it.
global_vote = np.zeros((len(kmer_counts), len(global_region_to_idx)))
global_vote

In [47]:
# Number of entries in the family-region -> index mapping.
len(fam_region_to_idx)

3238440

In [4]:
# Load in family region/global region conversion data.
# Reads the JSON file with the '_21' suffix from the intermediate phasings directory.
with open('/home/groups/dpwall/briannac/alt_haplotypes/intermediate_files/phasings/' + 'fam_regions_to_global_regions_21.json', 'r') as json_file:
    fam_regions_to_global_regions = json.loads(json_file.read())

In [59]:
# Same value as the PHASINGS_DIR assigned in the first cell; presumably re-declared
# so the cells below can be run without re-running the top of the notebook.
PHASINGS_DIR='/home/groups/dpwall/briannac/alt_haplotypes/data/phasings/'


In [None]:
# Load in family region/global region conversion data.
# Every file is read with np.load and unwrapped with .item(); the names on the left
# are matched positionally to the file names listed on the right.
(
    fam_region_to_idx,
    idx_to_fam_region,
    global_region_to_idx,
    idx_to_global_region,
    fam_region_to_global_region,
) = [
    np.load(PHASINGS_DIR + npy_name, allow_pickle=True).item()
    for npy_name in (
        'fam_region_to_idx.npy',
        'idx_to_fam_region.npy',
        'global_region_to_idx.npy',
        'idx_to_global_region.npy',
        'fam_regions_to_global_regions.npy',
    )
]

In [1]:
# Number of entries in the global-region -> index mapping.
# Needs global_region_to_idx to have been loaded in the current kernel first.
len(global_region_to_idx)

NameError: name 'global_region_to_idx' is not defined

# Create table/dictionary to convert family region to global region

In [4]:
# Load in phased regions.
import pandas as pd
import numpy as np
import sys
import json
import tqdm
from collections import Counter
# Phasings directory WITHOUT a trailing slash; paths built from it add the '/' themselves.
FINAL_PHASINGS_DIR='/home/groups/dpwall/briannac/alt_haplotypes/data/phasings'


# Unpickle the family regions table.
start_ends = pd.read_pickle('%s/family_regions.df' % FINAL_PHASINGS_DIR)
# Starts empty; this is the object later written out as JSON.
family_region_to_global_regions_dict = dict()

In [39]:
# Write the family-region -> global-regions dictionary to a single JSON file.
with open('%s/fam_regions_to_global_regions.json' % FINAL_PHASINGS_DIR, 'w') as json_out:
    json.dump(family_region_to_global_regions_dict, json_out)
        
# Combine all dictionaries.
#fam_regions_to_global_regions_full_dict = {}
#for chrom in range(22):
#    with open(PHASINGS_DIR + '/fam_regions_to_global_regions_%s.json' % chrom, 'r') as fp:
#        new_dict = json.load(fp)
#    fam_regions_to_global_regions_full_dict.update(new_dict)

#with open(FINAL_PHASINGS_DIR + '/fam_regions_to_global_regions.json', 'w') as f:
#    json.dump(family_region_to_global_regions_dict, f)

In [147]:
# Write the dictionary to a JSON file whose name carries the current value of `chrom`.
# `chrom` is not assigned in this cell; it must already exist in the kernel.
print('saving...')
with open('%s/fam_regions_to_global_regions_%s.json' % (PHASINGS_DIR, chrom), 'w') as json_out:
    json.dump(family_region_to_global_regions_dict, json_out)

saving...


In [54]:
# Raw np.load result for the family-region -> global-region file (not unwrapped with .item()).
mappings = np.load('/home/groups/dpwall/briannac/alt_haplotypes/data/phasings/fam_regions_to_global_regions.npy', allow_pickle=True)