In [84]:
# CS 124: Machine Learning in Genetics
# Project: Haplotype Phaser
# Contributors: Aditya Pimplaskar, Aditya Joglekar
import numpy as np
import pandas as pd
import itertools as it
# Packages needed

In [111]:
# some helpers
def fillna(col):
    """Fill the missing entries of one SNP column with a frequent observed value.

    The most frequent value is used, unless that value is the heterozygous
    code (1 / '1'); in that case the second most frequent value is preferred
    so that missing entries are not imputed as heterozygous.

    Parameters
    ----------
    col : pandas.Series
        One column of the genotype table, with missing entries as NaN.

    Returns
    -------
    pandas.Series
        A new Series with the NaNs filled. The input is not modified.
        A column with no observed values at all is returned unchanged.
    """
    counts = col.value_counts()  # sorted by frequency, NaN excluded
    if counts.empty:
        # Nothing observed, so there is nothing to impute from.
        return col
    fill_value = counts.index[0]
    # Prefer a homozygous value when one exists; if the heterozygous code is
    # the only value ever observed, fall back to it instead of raising.
    if fill_value in ('1', 1) and len(counts) > 1:
        fill_value = counts.index[1]
    return col.fillna(fill_value)

def imputeData(df):
    """Replace masked ('*') entries with NaN and impute them column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Genotype table in which missing entries are marked with '*'.

    Returns
    -------
    pandas.DataFrame
        Table with each column passed through `fillna`.
    """
    # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every version.
    df = df.replace('*', np.nan)
    return df.apply(fillna)

def readAndImpute(file):
    """Load a space-separated, header-less genotype file and impute it.

    The table is read with pandas, run through `imputeData`, cast to int64
    and returned as a list of row lists.
    """
    raw = pd.read_csv(file, sep=" ", header=None)
    imputed = imputeData(raw).astype('int64')
    return imputed.values.tolist()

def basic_phaser(genotype):
    """Phase a genotype that contains no heterozygous site.

    Walks the genotype and emits the (trivially known) pair of haplotypes
    for homozygous sites: 0 -> (0, 0) and 2 -> (1, 1).

    Phasing stops at the first heterozygous site (1), so the returned
    haplotypes are shorter than `genotype` whenever it is ambiguous. Callers
    detect an unambiguous genotype by comparing lengths. Values other than
    0, 1 and 2 are skipped, which also shortens the result.

    Returns
    -------
    tuple[list, list]
        (h1, h2), the possibly truncated haplotype pair.
    """
    h1, h2 = [], []
    for g in genotype:
        if g == 1:
            # Ambiguous site: give up and return what was phased so far.
            break
        if g == 0:
            h1.append(0)
            h2.append(0)
        elif g == 2:
            h1.append(1)
            h2.append(1)
    return h1, h2

def add_haplos(h1, h2):
    """Return the position-wise sum of two haplotypes as a list.

    Iterates over the indices of `h1`; `h2` is assumed to be at least as long.
    """
    summed = []
    for pos in range(len(h1)):
        summed.append(h1[pos] + h2[pos])
    return summed

def difference(g,h):
    """Return the position-wise difference g - h as a list (one entry per element of g)."""
    return [g_val - h[pos] for pos, g_val in enumerate(g)]

def valid_haplo(h):
    """Return True when every entry of `h` is 0 or 1 (vacuously True for empty input)."""
    return all(allele == 0 or allele == 1 for allele in h)

def h2_from_g_and_h1(g, known):
    """Explain genotype `g` with one known haplotype plus its complement.

    Scans `known` in order and returns the first pair (h1, g - h1) whose
    complement is a valid 0/1 haplotype. Returns a pair of empty lists when
    no known haplotype fits.
    """
    candidates = ((h1, difference(g, h1)) for h1 in known)
    return next(
        ((h1, h2) for h1, h2 in candidates if valid_haplo(h2)),
        ([], []),
    )

In [173]:
# Load the masked example genotypes; '*' entries are imputed by readAndImpute.
ex1 = readAndImpute("assignment/example_data_2_masked.txt")
# ex1 is currently a list of lists
# 1 list per SNP, each list has 50 entries (that individual's status at that SNP)

In [169]:
def clarks(genotypes, max_passes=7):
    """Phase a block of genotypes by iteratively growing a pool of known haplotypes.

    Parameters
    ----------
    genotypes : list[list[int]]
        One list per SNP; entry `i` of each list is individual `i`'s
        genotype (0, 1 or 2) at that SNP.
    max_passes : int, optional
        Upper bound on the number of sweeps over all individuals (default 7).
        Sweeping stops early once a full pass adds no new haplotype, because
        any further pass would repeat the same assignments.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (number of SNPs, 2 * number of individuals).
        Columns 2*i and 2*i+1 hold the two haplotypes of individual `i`.
        Entries never resolved from the pool are filled by `guessUnphased`.
    """
    numSNPS = len(genotypes)
    numIndivs = len(genotypes[0])

    def fill_known_haps(df):
        """Seed the haplotype matrix and the known pool from unambiguous genotypes."""
        nSNPs = len(df)
        nIndividuals = len(df[0])
        # -1 marks "not phased yet". The `np.int` alias was removed from NumPy,
        # so the builtin `int` is used for the dtype.
        haplotypes = np.zeros((nSNPs, 2*nIndividuals), dtype=int)
        haplotypes.fill(-1)
        known = []
        for indiv in range(nIndividuals):
            genotype = [row[indiv] for row in df]
            h1, h2 = basic_phaser(genotype)
            # basic_phaser truncates its output at the first heterozygous
            # site, so full length means the genotype was unambiguous.
            if len(h1) == len(h2) == len(genotype):
                if h1 not in known:
                    known.append(h1)
                if h2 not in known:
                    known.append(h2)
                for snp in range(nSNPs):
                    haplotypes[snp][2*indiv] = h1[snp]
                    haplotypes[snp][2*indiv + 1] = h2[snp]
        return haplotypes, known

    def hash_haplo_combos(known):
        """Map str(genotype) to a pair of distinct known haplotypes that sum to it."""
        combos = {}
        for first, second in it.combinations(known, 2):
            combos[str(add_haplos(first, second))] = [first, second]
        return combos

    haplotypes, known = fill_known_haps(genotypes)
    known_combos = hash_haplo_combos(known)

    for _ in range(max_passes):
        learned_new = False
        for indiv in range(numIndivs):
            genotype = [row[indiv] for row in genotypes]
            genotype_str = str(genotype)
            if genotype_str in known_combos:
                h1, h2 = known_combos[genotype_str]
            else:
                h1, h2 = h2_from_g_and_h1(genotype, known)
                if len(h1) == 0 or len(h2) == 0:
                    # No known haplotype explains this genotype yet.
                    continue
                # Only grow the pool (and rebuild the combo table) for a
                # genuinely new haplotype; re-adding duplicates on every pass
                # bloats the pool and the quadratic combo table.
                if h2 not in known:
                    known.append(h2)
                    known_combos = hash_haplo_combos(known)
                    learned_new = True
            for snp in range(numSNPS):
                haplotypes[snp][2*indiv] = h1[snp]
                haplotypes[snp][2*indiv + 1] = h2[snp]
        if not learned_new:
            break

    haplotypes = guessUnphased(genotypes, haplotypes)
    return haplotypes

In [160]:
# Debug dump: for each of the first 50 individuals, print the genotype at the
# first four SNPs. The "row" label in the output is the individual index `i`,
# not the SNP row.
for i in range(50):
    for row in ex1[0:4]:
        print("row", i, "is",row[i])

row 0 is 0
row 0 is 2
row 0 is 2
row 0 is 2
row 1 is 1
row 1 is 2
row 1 is 2
row 1 is 2
row 2 is 2
row 2 is 1
row 2 is 2
row 2 is 2
row 3 is 2
row 3 is 1
row 3 is 1
row 3 is 2
row 4 is 1
row 4 is 1
row 4 is 2
row 4 is 2
row 5 is 1
row 5 is 2
row 5 is 1
row 5 is 2
row 6 is 2
row 6 is 1
row 6 is 2
row 6 is 2
row 7 is 2
row 7 is 2
row 7 is 2
row 7 is 2
row 8 is 1
row 8 is 2
row 8 is 2
row 8 is 2
row 9 is 2
row 9 is 2
row 9 is 1
row 9 is 2
row 10 is 2
row 10 is 2
row 10 is 2
row 10 is 2
row 11 is 1
row 11 is 2
row 11 is 1
row 11 is 2
row 12 is 1
row 12 is 2
row 12 is 1
row 12 is 2
row 13 is 2
row 13 is 1
row 13 is 2
row 13 is 1
row 14 is 1
row 14 is 2
row 14 is 1
row 14 is 1
row 15 is 1
row 15 is 2
row 15 is 2
row 15 is 2
row 16 is 0
row 16 is 2
row 16 is 1
row 16 is 2
row 17 is 1
row 17 is 2
row 17 is 1
row 17 is 2
row 18 is 0
row 18 is 1
row 18 is 1
row 18 is 2
row 19 is 1
row 19 is 2
row 19 is 2
row 19 is 2
row 20 is 2
row 20 is 2
row 20 is 2
row 20 is 2
row 21 is 1
row 21 is 2
row 21 i

In [88]:
# Sanity check: phase only the first five SNPs.
small = pd.DataFrame(clarks(ex1[0:5]))
# Column 0 is the first haplotype of individual 0; row 1 is the second SNP.
small[0][1]



1

In [18]:
def guessUnphased(df, haplotypes):
    """Fill every still-unphased (-1) haplotype entry with a guess from the genotype.

    Parameters
    ----------
    df : sequence of sequences
        Genotypes indexed as df[snp][individual], with values 0, 1 or 2.
    haplotypes : indexable 2-D container (e.g. numpy.ndarray)
        Indexed as haplotypes[snp][column]; columns 2*i and 2*i+1 belong to
        individual `i`. -1 marks an unknown allele. Modified in place.

    Returns
    -------
    The same `haplotypes` object, after filling.

    Notes
    -----
    Entries whose genotype or known allele is outside the expected codes are
    left untouched.
    """
    # Both alleles unknown: genotype -> (first haplotype, second haplotype).
    both_missing = {0: (0, 0), 1: (1, 0), 2: (1, 1)}
    # One allele unknown: (known allele, genotype) -> missing allele.
    # (0, 2) and (1, 0) are inconsistent inputs; the missing allele is set to 1
    # for them, matching the original "error" branches.
    one_missing = {
        (0, 0): 0, (0, 1): 1, (0, 2): 1,
        (1, 0): 1, (1, 1): 0, (1, 2): 1,
    }

    nSNPs = len(df)
    nIndiv = len(df[0])
    for i in range(nIndiv):
        # Always address this individual's own two columns. Writing to column
        # i or i+1 would clobber another individual's allele and leave the
        # -1 in place.
        first, second = 2*i, 2*i + 1
        for j in range(nSNPs):
            g = df[j][i]
            a = haplotypes[j][first]
            b = haplotypes[j][second]
            if a == -1 and b == -1:
                pair = both_missing.get(g)
                if pair is not None:
                    haplotypes[j][first], haplotypes[j][second] = pair
            elif a == -1:
                allele = one_missing.get((b, g))
                if allele is not None:
                    haplotypes[j][first] = allele
            elif b == -1:
                allele = one_missing.get((a, g))
                if allele is not None:
                    haplotypes[j][second] = allele

    return haplotypes

In [17]:
def chunking(genotypes, split_size):
    """Return how many windows of `split_size` rows are needed to cover `genotypes`.

    A trailing partial window counts as one extra chunk.
    """
    total = len(genotypes)
    leftover = total % split_size
    whole = int(total / split_size)
    return whole if leftover == 0 else whole + 1

In [174]:
#main
genots = ex1
split_size = 5
numChunks = chunking(genots, split_size)
haplos = []
for i in range(numChunks):
    data = genots[i*split_size:(i+1)*split_size]
    if i == 0:
        haplos = clarks(data)
    elif i == numChunks - 1 and len(genots[i*split_size]) > 0:
        final = genots[i*split_size:]
        final_numSNPs = len(final)
        final_numIndivs = len(final[0])
        final_haplo_chunk = np.zeros((final_numSNPs, 2*final_numIndivs), dtype = np.int)
        final_haplo_chunk.fill(-1)
        final_haplo_chunk = guessUnphased(final, final_haplo_chunk)
        haplos = np.concatenate((haplos, final_haplo_chunk), axis = 0)
    else:
        haplos = np.concatenate((haplos, clarks(data)), axis = 0)
        

In [175]:
# Wrap the phased matrix in a DataFrame for inspection, then write it out as
# space-delimited integers.
check = pd.DataFrame(haplos)
# NOTE(review): the output name says test_data_1 while ex1 was loaded from
# example_data_2_masked.txt — confirm the intended file name.
np.savetxt('../test_data_1_my_sol.txt', haplos, fmt='%i', delimiter = ' ')

In [123]:
# Recombine individual 0's two phased haplotypes (columns 0 and 1) into a genotype list.
sol1 = add_haplos(check[0], check[1])

In [138]:
# Load the reference haplotype solution as integers.
# NOTE(review): this is the example_data_1 solution while ex1 was loaded from
# example_data_2_masked.txt — confirm the two files are meant to be compared.
soltrue = pd.read_csv("assignment/example_data_1_sol.txt", header = None, sep = " ")
soltrue = soltrue.astype('int64')

In [139]:
# Genotype list implied by the reference solution's columns 0 and 1.
soltrue0 = add_haplos(soltrue[0],soltrue[1])

In [147]:
# Both operands are plain lists (add_haplos returns a list), so this evaluates
# to a single True/False rather than an element-wise comparison.
# NOTE(review): the saved output below is a list, so it presumably came from
# a different version of this cell — re-run to confirm.
soltrue0 == sol1

[0,
 1,
 2,
 2,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 0,
 1,
 0,
 1,
 2,
 1,
 2,
 0,
 2,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 0,
 2,
 1,
 2,
 0,
 1,
 2,
 1,
 2,
 1,
 1,
 0,
 2,
 0,
 2,
 2]

In [149]:
# soltrue0 is a flat list of allele sums while ex1 is a list of per-SNP lists,
# so this compares differently shaped objects.
# NOTE(review): the saved AttributeError below does not match this expression
# and presumably came from an earlier version of the cell — re-run to confirm.
soltrue0 == ex1

AttributeError: 'list' object has no attribute 'iloc'