In [1]:
# CS 124: Machine Learning in Genetics
# Project: Haplotype Phaser
# Contributors: Aditya Pimplaskar, Aditya Joglekar
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

In [2]:
# Imputation functions
def fillna(col):
    """Fill the missing entries of a column with its most frequent value.

    Parameters
    ----------
    col : pandas.Series
        Column that may contain NaN entries.

    Returns
    -------
    pandas.Series
        A new Series in which every NaN is replaced by the most common
        non-missing value of ``col``. The input Series is not modified.
        A column with no observed values has no mode to impute from and is
        returned as-is (still all NaN) instead of raising ``IndexError``.
    """
    # value_counts() skips NaN by default and sorts by descending frequency,
    # so the first index entry is the most common observed value.
    counts = col.value_counts()
    if counts.empty:
        return col
    return col.fillna(counts.index[0])

def imputeData(df):
    """Impute masked entries of a frame column by column.

    Every ``'*'`` entry is treated as missing and replaced by the most
    frequent observed value of its column (see ``fillna``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame in which missing entries are marked with the string ``'*'``.

    Returns
    -------
    pandas.DataFrame
        A frame of the same shape with the masked entries filled in.
    """
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    masked = df.replace('*', np.nan)
    return masked.apply(fillna)


def deleteDups(g):
    """Remove duplicate rows (duplicate genotypes/haplotypes) from a frame.

    The frame is de-duplicated in place, so callers that ignore the return
    value still see the duplicates removed, and the same frame is returned
    so callers that use the return value no longer receive ``None``.

    Parameters
    ----------
    g : pandas.DataFrame
        Frame whose duplicate rows should be dropped; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object ``g``, with only the first occurrence of each row kept.
    """
    # drop_duplicates(inplace=True) returns None, so its result must not be
    # assigned back to ``g``.
    g.drop_duplicates(inplace=True)
    return g

In [3]:
# Compatibility checker function
def checkPhase(g, h1, h2):
    """Tell whether the haplotype pair (h1, h2) explains genotype g.

    The pair is compatible when the element-wise sum of the two haplotypes
    equals the genotype at every position.

    Parameters
    ----------
    g, h1, h2 : array-like
        Genotype and the two candidate haplotypes.

    Returns
    -------
    numpy.bool_
        True when ``h1 + h2 == g`` holds at every position.
    """
    genotype = np.array(g)
    pair_sum = np.array(h1) + np.array(h2)
    return (pair_sum == genotype).all()

def findDifference(g, h1):
    """Return the element-wise remainder ``g - h1``.

    Used to derive the haplotype that would have to pair with ``h1`` to
    produce genotype ``g`` when no compatible pair is found in Clark's
    algorithm.

    Parameters
    ----------
    g, h1 : array-like
        Genotype and a candidate haplotype.

    Returns
    -------
    numpy.ndarray
        The element-wise difference ``g - h1``.
    """
    genotype, haplotype = np.array(g), np.array(h1)
    return np.subtract(genotype, haplotype)


In [4]:
def clarks(genotypes, outfile=None):
    """Build a pool of haplotypes that explains the genotypes (Clark's method).

    Genotype codes are interpreted the way the code below treats them: ``0``
    and ``2`` are homozygous sites (haplotype allele 0 and 1 respectively)
    and ``1`` is a heterozygous, i.e. ambiguous, site.

    Steps:
      1. Seed the pool with every genotype that has no heterozygous site;
         each such genotype contributes the single haplotype ``g / 2``.
      2. For each remaining genotype, do nothing if some pair of pool
         haplotypes (a haplotype may pair with itself) already sums to it.
         Otherwise take the first pool haplotype ``h`` whose complement
         ``g - h`` is a valid 0/1 vector and add that complement to the pool.

    Parameters
    ----------
    genotypes : pandas.DataFrame
        One genotype per row, one site per column.
    outfile : optional
        Not used by this function. It is optional so the function can be
        called with the genotypes alone.

    Returns
    -------
    list of list of int
        The haplotype pool. A genotype for which no pool haplotype has a
        valid complement adds nothing, so the pool is empty when no genotype
        is fully homozygous.
    """
    haplotypes = []
    unresolved = []  # genotypes with at least one non-homozygous site

    # Iterate over positional rows so the result does not depend on the
    # frame's row/column labels.
    for g in genotypes.to_numpy():
        if all(v in (0, 2) for v in g):
            # Fully homozygous: both haplotypes are identical and equal g / 2.
            haplotypes.append([int(v) // 2 for v in g])
        else:
            unresolved.append(g)
    # NOTE(review): genotypes with exactly one heterozygous site are also
    # unambiguous and could seed the pool as well -- confirm before adding.

    for g in unresolved:
        pool_size = len(haplotypes)
        # Stop at the first pair of pool haplotypes that already explains g.
        already_phased = any(
            checkPhase(g, haplotypes[a], haplotypes[b])
            for a in range(pool_size)
            for b in range(a, pool_size)
        )
        if already_phased:
            continue

        for h in haplotypes:
            diff = findDifference(g, h)
            # h is compatible with g only if its complement is a 0/1 vector.
            if all(0 <= x <= 1 for x in diff):
                # Add the complement (the new haplotype), not h itself,
                # which is already in the pool.
                haplotypes.append([int(x) for x in diff])
                break

    return haplotypes

In [34]:
def wrapper(genotypes, outfile=None):
    """Run the whole pipeline: transpose, impute, de-duplicate, phase.

    Parameters
    ----------
    genotypes : pandas.DataFrame
        Raw genotype frame with missing entries marked as ``'*'``. It is
        transposed first, so each column of the input becomes one genotype
        row for the later steps.
    outfile : optional
        Passed through to ``clarks`` unchanged; defaults to ``None``.

    Returns
    -------
    list
        The haplotype pool returned by ``clarks``.
    """
    genotypes = genotypes.T
    genotypes = imputeData(genotypes)  # fill masked entries per column
    genotypes = genotypes.astype('int64')
    # De-duplicates in place; the return value is deliberately not used.
    deleteDups(genotypes)
    # clarks takes (genotypes, outfile); pass both so the call cannot fail
    # with a missing-argument TypeError.
    return clarks(genotypes, outfile)

In [42]:
# Show up to 50 columns when a frame is displayed.
pd.options.display.max_columns = 50
# Load the masked test data: space-separated values, no header row.
ex1 = pd.read_csv(
    "assignment/test_data_masked.txt",
    sep=" ",
    header=None,
)
ex1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
0,0,0,0,*,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,*,0,0,0,*,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,*,0,2,0,0,0,1,0,1,2,1,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,1,0,0,1,0,1,1,*,1
2,2,*,*,2,2,*,2,2,2,2,2,2,2,2,*,1,2,2,2,2,2,1,2,1,*,2,*,2,2,2,2,*,2,2,1,2,2,1,2,2,2,2,2,2,1,2,2,1,2,2
3,2,2,*,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,1,2,2,2,2,2,2,1,2,2,2,2,1,1,2,2,2,*,2,2,2,2,2,*,2,2,*,*,2,2,2
4,2,*,2,2,2,2,2,1,2,2,2,2,2,1,2,*,2,2,2,2,1,2,2,*,2,1,1,1,1,2,2,2,1,2,1,1,2,1,1,2,1,2,2,2,1,2,2,2,2,2
5,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,*,*,0,0,0,*,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
6,1,2,2,2,2,1,0,2,2,2,1,2,2,2,0,2,1,1,2,2,1,*,2,2,1,1,*,2,2,*,1,2,2,2,2,1,2,2,2,0,2,1,1,2,2,1,0,1,1,2
7,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,*,2,2,2,2,2,2,2,2,2,2,2,*,2,2,2,2,2
8,0,0,0,0,0,0,*,1,0,0,*,0,*,1,0,0,0,0,0,0,1,0,0,0,*,1,1,1,1,*,*,0,1,0,1,1,0,1,1,0,1,0,0,0,1,0,0,0,0,0
9,2,*,2,2,2,2,2,2,2,2,*,2,2,2,2,2,2,2,1,2,*,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,1,2,*,2,2,2,2,2,2,2,2,2


In [43]:
# Impute the transposed test data and cast it to integer genotype codes.
ex1_imp = imputeData(ex1.T).astype('int64')
# Count the rows whose entries all lie in the range [0, 1].
num = sum(
    1
    for _, row in ex1_imp.iterrows()
    if all(0 <= x <= 1 for x in row)
)
num

0

In [16]:

# Seed a haplotype pool from the rows of ex1_imp that contain only 0s and 2s
# (no heterozygous 1), then remove those rows from ex1_imp.
haplotypes, toDrop = [], []
for ind in range(len(ex1_imp)):
    g = ex1_imp.iloc[ind]
    if all(allele in (0, 2) for allele in g):
        # 0 -> haplotype allele 0, 2 -> haplotype allele 1
        haplotypes.append([0 if allele == 0 else 1 for allele in g])
        toDrop.append(ind)  # positional index of a row that is now phased
ex1_imp = ex1_imp.drop(ex1_imp.index[toDrop])

In [18]:
# Positional indices of the rows that were phased deterministically by the
# seeding loop (rows containing only 0s and 2s).
toDrop

[]

In [23]:
# Sanity check of the validity test used in clarks: every entry of a 0/1
# vector lies in the range [0, 1].
t = [0, 0, 0, 0, 1, 1, 1, 0, 1]
all(0 <= x <= 1 for x in t)


True

In [35]:
# Run the full transpose -> impute -> de-duplicate -> phase pipeline on the
# masked test data loaded into ex1.
wrapper(ex1)

[]