In [1]:
import h5py
import numpy as np
import pandas as pd
import scipy.sparse as sp

In [2]:
def make_tree(genotypes):
    """Placeholder for tree construction; not implemented yet.

    The stated plan is to iterate through variants like the brick graph
    algorithm. For now the argument is ignored and nothing is built.

    Args:
        genotypes: Currently unused.

    Returns:
        None.
    """
    # TODO: implement the variant-by-variant construction.
    return None

def consolidate_haplotypes(genotypes, interval):
    """Collapse the rows of a column window down to its distinct rows.

    Args:
        genotypes: 2-D matrix; the window is cut along its second axis.
        interval: Indexable pair ``[start, end]`` of column indices, where
            ``end`` is inclusive.

    Returns:
        A tuple ``(unique_rows, row_counts)`` as produced by ``np.unique``
        with ``axis=0`` and ``return_counts=True``: the distinct rows of the
        window and how many times each one occurs.
    """
    # The interval end is inclusive, so the slice stops one past it.
    columns = slice(interval[0], interval[1] + 1)
    window = genotypes[:, columns]
    distinct_rows, row_counts = np.unique(window, axis=0, return_counts=True)
    return distinct_rows, row_counts


def sort_intervals(intervals):
    """Return a new list of intervals ordered from longest to shortest.

    Args:
        intervals: Iterable of ``[start, end]`` pairs with inclusive ends.

    Returns:
        A new list; the input is not modified. Intervals of equal length
        keep their original relative order.
    """
    def inclusive_length(interval):
        return interval[1] - interval[0] + 1

    ordered = list(intervals)
    ordered.sort(key=inclusive_length, reverse=True)
    return ordered
    
    
def get_no_recom_intervals(genotypes):
    """Find, for every column, the longest run of columns starting there
    in which all pairs pass ``four_gametes_test``.

    One interval is started at each column index. An interval ``[start, end]``
    is extended to include column ``i`` only when it currently ends at
    ``i - 1`` and column ``i`` passes ``four_gametes_test`` against every
    column already inside it. Once an interval fails to extend it is never
    revisited.

    Args:
        genotypes: Dense 2-D matrix (``np.ndarray`` or ``np.matrix``) whose
            columns are compared pairwise.

    Returns:
        A list with one ``[start, end]`` pair (inclusive column indices) per
        column, ordered by ``start``. Intervals may overlap or be nested
        inside one another.
    """
    # View the transposed matrix as a plain 2-D ndarray once, so each row is
    # a 1-D column vector without re-copying it for every comparison.
    variant_rows = np.asarray(genotypes.T)

    intervals = []  # intervals with no recombination
    # Intervals whose right end is the previous column: the only ones that can
    # still grow. Tracking them avoids re-scanning every interval per column.
    extendable = []
    for i in range(genotypes.shape[1]):
        v1 = variant_rows[i]
        still_extendable = []
        for interval in extendable:
            # all() stops at the first incompatible pair.
            if all(
                four_gametes_test(v1, variant_rows[j])
                for j in range(interval[0], interval[1] + 1)
            ):
                interval[1] += 1
                still_extendable.append(interval)
        fresh = [i, i]  # start a new interval
        intervals.append(fresh)
        still_extendable.append(fresh)
        extendable = still_extendable
    return intervals

    
def four_gametes_test(a, b, strict=True):
    """Check whether two variant vectors pass the gamete test.

    Each position is encoded as ``a * 2 + b``, so for 0/1 entries the codes
    are 0 -> 00, 1 -> 01, 2 -> 10, 3 -> 11 (assumes binary inputs; TODO
    confirm against callers).

    Args:
        a: Array of allele values for the first variant.
        b: Array of allele values for the second variant, same length as ``a``.
        strict: When True, the pair fails as soon as codes 1, 2 and 3 are all
            present, whether or not 0 is observed. When False, the pair fails
            only when exactly four distinct codes are observed.

    Returns:
        True when the pair passes, False when the failing condition holds.
    """
    observed = set(np.unique(a * 2 + b))
    if strict:
        incompatible = {1, 2, 3} <= observed  # 10, 01, 11
    else:
        incompatible = len(observed) == 4  # 00, 10, 01, 11
    return not incompatible

In [3]:
# Toy 4 x 6 matrix for a quick manual check. get_no_recom_intervals compares
# columns pairwise, so each column here plays the role of one variant.
test_genotypes = np.array([
    [1, 0, 0, 1, 0, 1], 
    [0, 1, 0, 0, 1, 0],  
    [1, 1, 0, 1, 1, 0], 
    [0, 0, 1, 0, 0, 1],  
])

# Bare last expression so the notebook displays the returned [start, end] list.
get_no_recom_intervals(test_genotypes)

[[0, 0], [1, 2], [2, 3], [3, 3], [4, 5], [5, 5]]

In [4]:
# Alternative text input (disabled):
# genotypes = np.genfromtxt('/Users/ambershen/Desktop/linARG/dx_analysis/figures/1a/data/hapmap3_CEU_chr2-234876004-234884481.txt', dtype=float, missing_values="NA", filling_values=np.nan)

# The HDF5 file stores the matrix as CSC components: data / indices / indptr / shape.
with h5py.File("/Users/ambershen/Desktop/linARG/dx_analysis/figures/1a/data/genotype_matrices/0_chr2-234000000-236000000.h5", 'r') as f:
    genotypes_sparse = sp.csc_matrix((f['data'][:], f['indices'][:], f['indptr'][:]), shape=f['shape'][:])

# Densify to a plain ndarray. `.todense()` would return np.matrix, which NumPy
# no longer recommends and which changes indexing and `*` semantics downstream.
# A separate name for the sparse object keeps this cell idempotent on re-run.
genotypes = genotypes_sparse.toarray()

In [4]:
# Variant metadata table, space-separated. The rows displayed below expose the
# columns CHROM, POS, ID, REF and ALT. Presumably row i describes column i of
# `genotypes` -- TODO confirm against how the two files were produced.
df = pd.read_csv('/Users/ambershen/Desktop/linARG/dx_analysis/figures/1a/data/variant_metadata/0_chr2-234000000-236000000.txt', sep=' ')

In [7]:
# Metadata row at position 2826, the left endpoint of the longest interval
# [2826, 2842] printed further down in this notebook.
df.iloc[2826]

CHROM               chr2
POS            234383207
ID       2:234383207:T:C
REF                    T
ALT                    C
Name: 2826, dtype: object

In [8]:
# Metadata row at position 2842, the right endpoint of the longest interval
# [2826, 2842] printed further down in this notebook.
df.iloc[2842]

CHROM               chr2
POS            234384556
ID       2:234384556:G:A
REF                    G
ALT                    A
Name: 2842, dtype: object

In [5]:
# Compute every no-recombination interval, then inspect the longest one.
intervals = get_no_recom_intervals(genotypes)
# `get_longest_interval` is not defined in any cell above, so calling it fails
# on a fresh kernel. The longest interval is the first entry of the
# length-sorted list; the sort is stable, so ties resolve to the left-most one.
longest_interval = sort_intervals(intervals)[0]
genotypes_cons, counts = consolidate_haplotypes(genotypes, longest_interval)

print(f'genotypes shape: {genotypes.shape}')
print(f'longest interval: {longest_interval}, {longest_interval[1] - longest_interval[0] + 1}')
print(f'genotypes_cons shape: {genotypes_cons.shape}')
print(f'genotypes_cons nnz: {np.count_nonzero(genotypes_cons)}')
# Bare last expression so the notebook renders the distinct haplotypes.
genotypes_cons

genotypes shape: (6404, 14515)
longest interval: [2826, 2842], 17
genotypes_cons shape: (8, 17)
genotypes_cons nnz: 50


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1],
       [0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]], dtype=int16)

In [8]:
intervals = get_no_recom_intervals(genotypes)
sorted_intervals = sort_intervals(intervals)

# Report the distinct haplotypes and their counts for the ten longest intervals.
for interval in sorted_intervals[:10]:
    genotypes_cons, counts = consolidate_haplotypes(genotypes, interval)
    print(f'interval: {interval}')
    print(f'unique haplotypes shape: {genotypes_cons.shape}')
    print(f'unique haplotypes nnz: {np.count_nonzero(genotypes_cons)}')
    print(f'haplotype counts: {counts}')
    print(genotypes_cons)
    print()

interval: [2826, 2842]
unique haplotypes shape: (8, 17)
unique haplotypes nnz: 50
haplotype counts: [1001 3248  556  622  802    5   91   79]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0]
 [0 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1]
 [0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1]
 [1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1]]

interval: [2876, 2892]
unique haplotypes shape: (12, 17)
unique haplotypes nnz: 32
haplotype counts: [5014   96   94  120   66  145  504   89  101   83    1   91]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0]
 [1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 0 1 0 0 0 0 0 0