In [4]:
# CS 124: Machine Learning in Genetics
# Project: Haplotype Phaser
# Contributors: Aditya Pimplaskar, Aditya Joglekar
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

In [19]:
# Imputation functions
def fillna(col):
    """Fill the missing entries of one genotype column with its most common call.

    The most frequent non-missing value is used as the fill value, unless that
    value is the heterozygous call (``'1'`` or numeric ``1``); in that case the
    second most frequent value is used so a heterozygous call is not imputed.

    Parameters
    ----------
    col : pandas.Series
        Column whose missing entries are NaN. It is modified in place.

    Returns
    -------
    pandas.Series
        The same ``col`` object, with NaN entries filled where possible.

    Notes
    -----
    * A column with no observed values is returned unchanged, because there is
      nothing to impute from.
    * If the heterozygous call is the only value observed, it is used as the
      fill value, because no alternative exists.
    """
    counts = col.value_counts()  # NaN is excluded; sorted most frequent first
    if counts.empty:
        return col

    fill_value = counts.index[0]
    # Recognise the heterozygous call whether the column holds strings or numbers.
    if isinstance(fill_value, str):
        is_heterozygous = fill_value == '1'
    else:
        is_heterozygous = fill_value == 1

    # Only switch to the runner-up when one actually exists.
    if is_heterozygous and len(counts) > 1:
        fill_value = counts.index[1]

    col.fillna(fill_value, inplace=True)
    return col

def imputeData(df):
    """Replace masked entries (``'*'``) and impute them column by column.

    Every ``'*'`` in ``df`` is turned into NaN, then each column is passed to
    ``fillna``, which fills the NaN entries from that column's observed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame in which missing entries are marked with ``'*'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the masked entries imputed per column.
    """
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    masked = df.replace('*', np.nan)
    return masked.apply(fillna)


def deleteDups(g):
    """Remove duplicate rows from ``g`` in place and return it.

    Parameters
    ----------
    g : pandas.DataFrame
        Frame whose duplicate rows should be dropped. It is modified in place,
        so callers that ignore the return value still see the de-duplicated
        frame.

    Returns
    -------
    pandas.DataFrame
        The same ``g`` object after de-duplication.
    """
    # drop_duplicates(inplace=True) returns None, so its result must not be
    # assigned back to g; mutate first, then return the frame itself.
    g.drop_duplicates(inplace=True)
    return g

In [6]:
# Compatibility checker function
def checkPhase(g, h1, h2):
    """Tell whether haplotypes ``h1`` and ``h2`` add up, position by position, to ``g``.

    Parameters
    ----------
    g : sequence
        Genotype values, one per SNP.
    h1, h2 : sequence
        Haplotype values, one per SNP.

    Returns
    -------
    numpy.bool_
        True when ``h1 + h2`` equals ``g`` at every position.
    """
    target = np.array(g)
    combined = np.add(np.array(h1), np.array(h2))
    return np.equal(combined, target).all()

def findDifference(g, h1):
    """Return the element-wise difference ``g - h1`` as a NumPy array.

    Clark's algorithm uses this to obtain the haplotype that would have to be
    paired with ``h1`` to produce genotype ``g``.
    """
    genotype = np.array(g)
    haplotype = np.array(h1)
    return np.subtract(genotype, haplotype)


In [None]:
def clarks(genotypes):
    """Build a haplotype pool from genotypes with a single pass of Clark's method.

    Step 1 seeds the pool with every genotype row that contains only 0s and 2s:
    such a row has exactly one phasing, two copies of ``row // 2``.

    Step 2 visits the remaining rows once, in order. A row already explained by
    two haplotypes in the pool is left alone. Otherwise the first pool haplotype
    ``h`` for which ``g - h`` is itself a valid 0/1 haplotype is used, and that
    complement ``g - h`` is added to the pool.

    Parameters
    ----------
    genotypes : pandas.DataFrame
        One genotype per row; entries must be castable to int64 and are
        expected to be 0, 1 or 2.

    Returns
    -------
    list[list[int]]
        The haplotype pool, each haplotype a list of 0/1 values. Rows that no
        pool haplotype can resolve contribute nothing (the pass is not repeated).
    """
    # Work on a plain integer matrix so row access is positional and does not
    # depend on the frame's row or column labels.
    matrix = genotypes.astype('int64').to_numpy()

    haplotypes = []
    unresolved = []
    for row in matrix:
        if np.isin(row, (0, 2)).all():  # no heterozygous site: phase is forced
            haplotypes.append((row // 2).tolist())
        else:
            unresolved.append(row)

    # Tuple view of the pool for O(1) membership checks.
    known = {tuple(h) for h in haplotypes}

    for g in unresolved:
        explained = False
        candidate = None
        for h in haplotypes:
            complement = g - np.asarray(h, dtype='int64')
            key = tuple(complement.tolist())
            if key in known:
                # h + complement == g with both already in the pool.
                explained = True
                break
            if candidate is None and np.isin(complement, (0, 1)).all():
                # Remember the first valid partner, matching pool order.
                candidate = key
        if not explained and candidate is not None:
            # Add the NEW complementary haplotype, not the known one.
            haplotypes.append(list(candidate))
            known.add(candidate)

    return haplotypes

In [8]:
def wrapper(genotypes):
    """Run the full pipeline: transpose, impute, cast, de-duplicate, phase.

    The input frame is transposed first, then masked entries are imputed and
    every value is cast to int64. ``deleteDups`` is called for its in-place
    effect (its return value is ignored), and the result of ``clarks`` on the
    prepared frame is returned.
    """
    prepared = imputeData(genotypes.T).astype('int64')
    deleteDups(prepared)  # mutates `prepared`; return value intentionally unused
    return clarks(prepared)

In [53]:
# Load the masked example data: space-separated values, no header row.
ex1 = pd.read_csv("assignment/example_data_1_masked.txt", sep = " ", header=None)
# Allow up to 100 columns to be shown when a frame is displayed.
pd.set_option('display.max_columns', 100)

In [54]:
# Impute the masked entries, then transpose the frame.
# NOTE(review): wrapper() transposes BEFORE calling imputeData, whereas this
# cell imputes first, so the two paths impute along different axes. Confirm
# which one is intended.
ex1 = imputeData(ex1)
ex1 = ex1.T

In [55]:
# Cast every entry to int64, then display the frame's shape.
ex1 = ex1.astype('int64')
ex1.shape

(50, 39496)

In [56]:
# deleteDups drops duplicate rows of ex1 in place; its return value is not used
# here. The shape is displayed afterwards to see whether any rows were removed.
deleteDups(ex1)
ex1.shape


(50, 39496)

In [57]:
# Number of entries in the first row of ex1.
len(ex1.iloc[0])

39496

In [58]:
# Scratch run of the seeding step that also appears at the start of clarks():
# collect a haplotype for every row of ex1 that contains no heterozygous (1) call.
haplotypes = []
    
toDrop = [] # positions of rows that were phased deterministically
for ind in range(len(ex1)):
    h = []
    g = ex1.iloc[ind]
    for i in range(len(g)):
        if g[i] == 1: # heterozygous site: this row cannot be phased deterministically
            break
        if g[i] == 0:
            h.append(0)
        if g[i] == 2:
            h.append(1)
    if len(h) == len(g): # the loop reached the end without hitting a 1
        haplotypes.append(h)
        toDrop.append(ind) # mark the row for removal from ex1
# Reassigns ex1, so re-running this cell operates on the already-thinned frame.
ex1 = ex1.drop(ex1.index[toDrop])

In [59]:
# Number of haplotypes collected by the seeding cell above.
len(haplotypes)

0