In [7]:
# CS 124: Machine Learning in Genetics
# Project: Haplotype Phaser
# Contributors: Aditya Pimplaskar, Aditya Joglekar
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

In [40]:
# Imputation functions
def fillna(col):
    """Fill the missing entries of one column with its most common homozygous value.

    The heterozygous call is recognised by the string label '1'. It is skipped
    when choosing the fill value, so that masked positions are not imputed as
    heterozygous whenever another value has been observed.

    Parameters
    ----------
    col : pandas.Series
        Column whose NaN entries should be filled.

    Returns
    -------
    pandas.Series
        A new Series with the missing entries filled; `col` itself is not
        modified. If the column has no observed values at all it is returned
        unchanged, and if '1' is the only observed value it is used as the
        fill value because there is no alternative.
    """
    counts = col.value_counts()  # most frequent first; NaN is not counted
    if counts.empty:
        # Nothing was observed in this column, so there is nothing to impute from.
        return col
    # Most frequent value that is not the heterozygous call; fall back to the
    # overall most frequent value when '1' is all we have.
    fill_value = next((v for v in counts.index if v != '1'), counts.index[0])
    return col.fillna(fill_value)

def imputeData(df):
    """Impute masked genotype calls, one column at a time.

    Every '*' entry is first turned into NaN, then each column is passed
    through `fillna`, which chooses the replacement value for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame in which masked entries are marked with the string '*'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the masked entries filled in.
    """
    # Use np.nan: the capitalised alias np.NaN was removed in NumPy 2.0.
    df = df.replace('*', np.nan)
    return df.apply(fillna)


def deleteDups(g):
    """Remove duplicate rows (duplicate genotypes/haplotypes) from `g`.

    Parameters
    ----------
    g : pandas.DataFrame
        Frame to de-duplicate. It is modified in place, so callers that
        ignore the return value still see the duplicates removed.

    Returns
    -------
    pandas.DataFrame
        The same object `g`, now without duplicate rows.
    """
    # drop_duplicates(inplace=True) returns None, so its result must not be
    # assigned back to g; mutate first, then hand the frame back.
    g.drop_duplicates(inplace=True)
    return g

In [3]:
# Compatibility checker function
def checkPhase(g, h1, h2):
    """Check whether haplotypes `h1` and `h2` are a valid phase of genotype `g`.

    A pair is compatible when the element-wise sum of `h1` and `h2` equals
    `g` at every SNP.

    Parameters
    ----------
    g : sequence
        SNP values of the genotype.
    h1, h2 : sequence
        SNP values of the two candidate haplotypes.

    Returns
    -------
    bool
        True if `h1 + h2 == g` at every position, False otherwise (including
        when the three inputs do not all have the same shape).
    """
    g = np.asarray(g)
    h1 = np.asarray(h1)
    h2 = np.asarray(h2)
    # Without this guard NumPy would broadcast a shorter haplotype across the
    # genotype and could report a spurious match.
    if not (g.shape == h1.shape == h2.shape):
        return False
    return bool((h1 + h2 == g).all())

def findDifference(g, h1):
    """Return the element-wise difference `g - h1` as a NumPy array.

    Used to work out the haplotype that would complement `h1` for genotype
    `g` when Clark's algorithm finds no compatible pair in the current pool.
    """
    genotype, haplotype = np.array(g), np.array(h1)
    return genotype - haplotype


In [4]:
def clarks(genotypes):
    """Build a pool of haplotypes from genotypes with Clark's algorithm.

    The pool is seeded from the deterministic genotypes (those with no
    heterozygous site, i.e. made only of 0s and 2s), each of which yields one
    haplotype. Every remaining genotype is then checked against all pairs in
    the pool using `checkPhase`. If no pair explains it, the first pooled
    haplotype whose complement (`findDifference`) contains only 0s and 1s is
    used, and that complement is added to the pool as a new haplotype.

    Parameters
    ----------
    genotypes : pandas.DataFrame
        One genotype per row, with SNP values coded 0, 1 or 2
        (1 is treated as the non-deterministic call).

    Returns
    -------
    list of list
        The haplotype pool. It is empty when no genotype is deterministic,
        because there is then nothing to seed the pool with. Genotypes for
        which no pooled haplotype has a valid complement add nothing.
    """
    haplotypes = []

    # Seed the pool: a genotype made only of 0s and 2s has a single phasing,
    # two copies of the haplotype with 0 -> 0 and 2 -> 1.
    deterministic = np.zeros(len(genotypes), dtype=bool)
    for pos in range(len(genotypes)):
        row = genotypes.iloc[pos].to_numpy()  # positional values, independent of labels
        if np.isin(row, (0, 2)).all():
            haplotypes.append([int(v == 2) for v in row])
            deterministic[pos] = True
    # Thin out the genotype list by position so duplicate index labels are safe.
    genotypes = genotypes.loc[~deterministic]

    for _, genotype in genotypes.iterrows():
        g = genotype.to_numpy()
        pool_size = len(haplotypes)
        # Stop at the first compatible pair; no need to scan the remaining ones.
        phased = any(
            checkPhase(g, haplotypes[a], haplotypes[b])
            for a in range(pool_size)
            for b in range(a, pool_size)
        )
        if phased:
            continue  # the pool already accounts for this genotype

        # No pair fits: find a known haplotype whose complement is itself a
        # valid haplotype (only 0s and 1s) and add that complement.
        for h in haplotypes:
            diff = findDifference(g, h)
            if ((diff >= 0) & (diff <= 1)).all():
                # Append the inferred complement, not h, which is already pooled.
                haplotypes.append(diff.tolist())
                break

    return haplotypes

In [25]:
def wrapper(genotypes):
    """Run the whole pipeline: transpose, impute, de-duplicate, then phase.

    Takes the genotype frame as loaded, and returns whatever `clarks`
    produces for the prepared frame.
    """
    # Transpose, fill the masked calls, then make everything integer-typed.
    prepared = imputeData(genotypes.T).astype('int64')
    # Called for its in-place effect: duplicates are removed from `prepared`.
    deleteDups(prepared)
    return clarks(prepared)

In [42]:
# Allow up to 100 columns to be shown when a frame or row is rendered.
pd.set_option('display.max_columns', 100)

# The example file is space-separated and has no header row.
ex1 = pd.read_csv(
    "assignment/example_data_1_masked.txt",
    sep=" ",
    header=None,
)

# Peek at the first row of the raw file.
ex1.iloc[0]

0     0
1     1
2     2
3     2
4     1
5     1
6     *
7     2
8     1
9     2
10    *
11    1
12    1
13    2
14    1
15    1
16    0
17    1
18    *
19    1
20    *
21    1
22    2
23    0
24    2
25    1
26    1
27    2
28    *
29    2
30    2
31    2
32    1
33    2
34    0
35    2
36    1
37    2
38    0
39    1
40    2
41    1
42    *
43    1
44    1
45    0
46    2
47    0
48    2
49    2
Name: 0, dtype: object

In [44]:
# Swap the rows and columns of ex1 (presumably so that each row is one
# individual; confirm against the data description).
# NOTE: this cell is not idempotent. Running it a second time transposes the
# frame back to its original orientation.
ex1 = ex1.T

In [45]:
# Replace the masked '*' entries with per-column imputed values, then display
# the result. ex1 is overwritten, so the masked version is no longer available
# after this cell has run.
ex1 = imputeData(ex1)
ex1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,39446,39447,39448,39449,39450,39451,39452,39453,39454,39455,39456,39457,39458,39459,39460,39461,39462,39463,39464,39465,39466,39467,39468,39469,39470,39471,39472,39473,39474,39475,39476,39477,39478,39479,39480,39481,39482,39483,39484,39485,39486,39487,39488,39489,39490,39491,39492,39493,39494,39495
0,0,2,2,2,2,0,0,2,2,2,0,2,0,0,2,0,0,0,0,2,1,0,2,0,2,2,2,0,2,0,0,0,2,0,0,1,0,0,2,2,2,2,1,1,2,0,2,2,2,1,...,0,0,2,1,2,2,0,0,2,0,1,2,2,0,1,2,1,0,1,1,1,1,1,0,0,1,1,2,1,1,0,1,2,0,0,2,2,1,0,0,1,0,1,0,2,1,2,1,0,2
1,1,2,2,2,2,0,0,2,2,0,1,1,0,1,2,0,0,1,0,2,2,0,1,0,2,2,2,0,2,1,0,0,2,0,0,1,0,2,2,2,2,2,2,2,2,0,2,2,1,2,...,0,2,2,0,2,2,2,0,2,0,2,1,1,0,1,1,0,0,2,1,0,1,1,1,0,1,2,2,1,0,0,0,2,0,0,2,2,1,0,1,1,1,0,0,2,0,1,1,1,1
2,2,1,2,2,2,0,0,1,2,0,0,2,0,1,2,0,0,0,1,2,1,0,2,0,2,2,2,0,1,0,0,0,2,0,0,0,0,0,2,2,2,2,2,2,2,0,2,2,2,2,...,0,0,2,1,2,1,0,1,1,1,1,1,1,0,2,1,1,0,1,0,1,0,0,0,0,0,2,2,0,0,0,2,2,0,0,2,2,0,0,0,2,0,1,0,2,1,2,1,0,2
3,2,1,1,2,2,0,0,2,2,0,0,2,0,0,2,0,0,0,0,2,2,0,2,0,2,2,2,1,1,0,0,0,2,0,0,1,0,0,2,2,2,2,1,1,2,0,1,2,1,1,...,1,0,2,1,1,1,0,0,1,2,0,2,2,0,0,2,0,0,2,1,0,1,2,0,0,1,2,2,1,2,0,0,1,0,0,1,1,1,0,2,1,1,2,0,2,2,1,2,2,1
4,1,1,2,2,2,0,0,2,2,0,0,2,0,1,2,0,0,0,1,1,2,0,0,0,2,2,2,0,2,0,0,0,2,0,0,0,0,1,2,2,2,2,2,2,2,0,2,2,1,2,...,0,0,2,1,2,2,0,0,2,1,0,1,1,0,1,2,1,0,1,1,1,1,1,0,0,1,1,2,1,1,0,1,1,0,0,2,1,0,0,1,1,1,1,0,2,1,1,1,1,2
5,1,2,1,2,2,0,0,2,2,1,0,2,0,0,2,0,1,0,0,2,2,0,2,0,2,2,2,1,2,0,0,0,2,0,0,1,0,0,2,2,2,2,1,1,2,0,2,2,2,1,...,0,0,2,1,2,1,0,1,1,1,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,1,2,2,0,1,1,0,0,0,0,0,2,0,0,2,0,0,2,0,2,2,1,2,1,1
6,2,1,2,2,2,0,0,2,2,0,0,2,0,0,2,0,0,0,0,1,2,1,1,0,2,2,2,0,2,0,0,0,2,0,1,1,0,0,1,2,2,2,1,1,2,0,2,2,1,1,...,0,1,2,0,1,2,2,1,2,1,1,2,1,0,0,2,0,0,2,1,0,1,1,1,0,1,2,1,2,0,0,1,1,0,0,1,0,1,0,0,2,0,1,1,2,1,2,1,0,2
7,2,2,2,2,2,0,0,2,2,1,0,2,0,0,2,0,0,0,0,2,2,1,2,0,2,2,2,0,2,0,0,0,2,0,0,1,0,0,2,2,2,2,1,1,2,0,2,2,2,1,...,0,0,2,2,2,1,0,1,2,1,1,1,1,0,0,2,0,0,2,2,0,2,1,0,0,1,1,2,2,2,1,0,2,0,0,2,2,0,0,1,0,1,2,0,2,1,2,1,0,2
8,1,2,2,2,2,0,0,2,2,0,0,2,0,0,2,0,0,0,0,2,2,0,2,0,2,2,2,0,2,0,0,0,2,0,1,1,0,0,2,2,2,2,1,1,2,0,1,2,1,1,...,0,0,2,1,1,1,0,0,2,1,1,1,1,0,1,2,0,0,2,2,0,2,1,0,0,1,2,2,1,2,0,1,1,0,0,1,1,1,0,0,2,0,0,0,2,0,2,0,0,2
9,2,2,1,2,2,0,0,2,2,0,0,2,0,0,2,0,0,0,0,2,1,0,2,0,2,2,2,0,2,0,0,0,2,0,0,0,0,0,2,2,2,2,2,1,2,0,2,2,2,2,...,1,0,2,0,0,0,0,0,2,2,0,1,1,0,1,2,0,0,2,2,0,2,2,0,0,1,2,2,2,2,0,0,0,0,0,1,1,2,0,1,2,2,2,0,2,2,1,2,1,1


(50, 39496)

In [20]:
# NOTE(review): `haplotypes` is not assigned in any cell visible here. It is
# presumably the result of an earlier clarks()/wrapper() call; confirm, and
# assign it explicitly so the notebook survives Restart & Run All.
haplotypes

[]