In [7]:
# CS 124: Machine Learning in Genetics
# Project: Haplotype Phaser
# Contributors: Aditya Pimplaskar, Aditya Joglekar
import numpy as np
import pandas as pd

# Packages needed

In [66]:
# Imputation functions + helpers
def fillna(col):
    if col.value_counts().index[0] == '1':
        col.fillna(col.value_counts().index[1], inplace=True) # ensure we don't fill heterozygous
    else:
        col.fillna(col.value_counts().index[0], inplace=True)
    return col

def imputeData(df):
    df = df.replace('*', np.NaN)
    #df = df.astype('int64')
    #imputer = SimpleImputer(missing_values=np.nan, strategy= 'most_frequent')
    #return pd.DataFrame(imputer.fit_transform(df))
    return df.apply(lambda col:fillna(col))

def splitDF(df, numPieces): 
    return np.array_split(df, numPieces)
    
def imputeData2(df, chunkSize):
    # in: genotype data frame
    # out: list of sub-dfs (chunkSize x individuals) 
                # or (chunkSize-1 x individuals), with individuals on the columns, SNPs on the rows
    df = df.replace('*', np.NaN)
    splits = splitDF(df, len(df) // chunkSize + 1) # splits data frame into pieces of length 100
    for dfi in splits:
        dfi = dfi.T
        dfi.apply(lambda col:fillna(col)) # fill with most common SNP value for that chunk
    return splits #returns list of split up data frames

def deleteDups(g):
    # goal is to eliminate duplicate genotypes/duplicate haplotypes
    g.drop_duplicates(inplace=True)
    return g

def listToString(l):
    l = [str(i) for i in l]
    return ''.join(l)

def stringToListOfInts(s):
    l = list(s)
    l = [int(i) for i in l]
    return l
    
def getMostFrequentHaplo(d): 
    # input is the counts dictionary
    return stringToListOfInts(max(d, key=d.get))

In [45]:
# Compatibility checker function
def checkPhase(g, h1, h2):
    # want to see if element wise sum of h1 and h2 is g
    # takes list g of SNPs
    # takes lists h1, h2 of SNPs
    import numpy as np
    g = np.array(g)
    h1 = np.array(h1)
    h2 = np.array(h2)
    comparison =  (h1 + h2 == g)
    return comparison.all()

def findDifference(g, h1): 
    # so that we can fill in a new haplo if we don't find a compatibile pair in Clark's
    g = np.array(g)
    h1 = np.array(h1)
    return (g - h1).tolist()



In [89]:
ex1 = pd.read_csv("assignment/example_data_1_masked.txt", sep = " ", header = None)
imputeTest = imputeData2(ex1, 5)

In [87]:
def clarksSplit(genotypes):
    #input: list of split up dataframes
    #output: list of numPeople elements
        # for each person -- a list of tuples of potential phases OR a tuple for the phase we have assigned??
    #pseudocode
        # for each subdf sdf
            # transpose sdf so that the SNPs are on the columns
            # cast to int64
            # for each individual (row) in sdf
                # check for deterministic phasing
                # add those deterministic phasings to haplotype pool
            # for each individual (row) in sdf
                # clarks happens here
                # if two haplotypes in haplotype pool phase that row, add that phase as a tuple to output
                    #increment each of their count
                # else
                    # add a tuple of (haplotype, genotype - haplotype) to output so long as g-h is valid
                    # add the g-h to the haplotype pool
                    # g-h count += 1, h count += 1
                    
    
        # dictionary of list casted to string and count of that haplotype
    output = [[] for i in range(len(genotypes[0].columns))] 
        # just a way of getting the right number of elements in output
    last_index_seen = 0
        
    for df_index in range(len(genotypes)):
        print("starting sub-dataframe", df_index)
        sdf = genotypes[df_index] # get sdf
        sdf = sdf.T
        sdf = sdf.astype('int64')
        
        # keep track of these
        haplotypes = []
        haplotype_counts = {}
        phases = [[] for i in range(len(genotypes[0].columns))] # list of list of tuples (phases) for each individual
        
        # adding deterministic haplotypes to the haplotype pool
        for individual, genot in sdf.iterrows():
            print("deterministic: looking at individual", individual)
            h = []
            #genot = sdf.iloc[individual]
            genot = list(genot)
            for i in range(len(genot)): # for each SNP
                adjusted_i = i 
                if genot[adjusted_i] == 1:
                    break
                if genot[adjusted_i] == 0:
                    h.append(0)
                if genot[adjusted_i] == 2:
                    h.append(1)
            if len(h) == len(genot): # valid
                haplotypes.append(h)
            
    
        for individual_again, to_phase in sdf.iterrows():
            print("clarks: looking at individual", individual_again)
            individuals_phases = []
            #to_phase = sdf[individual_again]
            to_phase = list(to_phase)
            phased = False #flag
            for haplo_index1 in range(len(haplotypes)):
                # check if check if h1, h1 phases
                h1 = haplotypes[haplo_index1]
                if checkPhase(to_phase, h1, h1):
                    individuals_phases.append((h1, h1))
                    phased = True
                    
                    #updating counts table
                    h1_string = listToString(h1) #for hash table of counts
                    if h1_string not in haplotype_counts:
                        haplotype_counts[h1_string] = 2
                    else: 
                        haplotype_counts[h1_string] += 2
                
                for haplo_index2 in range(haplo_index1, len(haplotypes)):
                    h2 = haplotypes[haplo_index2]
                    if checkPhase(to_phase, h1, h2):
                        individuals_phases.append((h1, h2))
                        phased = True
                        
                        #updating counts table
                        h1_string = listToString(h1)
                        h2_string = listToString(h2)
                        if h1_string not in haplotype_counts:
                            haplotype_counts[h1_string] = 1
                        else: 
                            haplotype_counts[h1_string] += 1
                        if h2_string not in haplotype_counts:
                            haplotype_counts[h2_string] = 1
                        else: 
                            haplotype_counts[h2_string] += 1
                    
            if phased == False:
                for haplo_index in range(len(haplotypes)):
                    h = haplotypes[haplo_index]
                    diff = findDifference(to_phase, h)
                    if sum(0 <= x <= 1 for x in diff) == len(to_phase): #valid
                        diff_string = listToString(diff)
                        h_string = listToString(h)
                        
                        #updating counts table
                        if diff_string not in haplotype_counts:
                            haplotype_counts[diff_string] = 1
                        else: 
                            haplotype_counts[diff_string] += 1
                        if h_string not in haplotype_counts:
                            haplotype_counts[h_string] = 1
                        else: 
                            haplotype_counts[h_string] += 1
                        
                        break
            phases[individual_again] = individuals_phases
            
        # now to consolidate into one phase per individual for each of these subdf
        for individual_c in range(len(phases)):
            print("individual adding to output", individual_c)
            max_count = 0
            max_index = 0
            for phase_index in range(len(individuals_phases)):
                phasing = individuals_phases[phase_index]
                if haplotype_counts[listToString(phasing[0])] > max_count or haplotype_counts[listToString(phasing[1])] > max_count:
                    max_index = phase_index
                
            print("max index is", max_index)
            output[individual_c].append(individuals_phases[max_index])
                
                    
                    
    return output
    

In [40]:
a = [1,1,1]
listToString(a)

'111'

In [44]:
h = []
genot = [1,2,2,2,2]
for i in range(len(genot)): # for each SNP
    adjusted_i = i
    if genot[adjusted_i] == 1:
        break
    if genot[adjusted_i] == 0:
        h.append(0)
    if genot[adjusted_i] == 2:
        h.append(1)
if len(h) == len(genot): # valid
    print(h)

    


In [53]:
b = {1:2, 3:4, 5:6}
if 2 in b:
    b[3] += 1
else:
    b[2] = 0

b

{1: 2, 2: 0, 3: 4, 5: 6}

In [55]:
a = (2,3)
3 in a

True

In [64]:
c = "123"
list(c)
c = [int(s) for s in c]
c

[1, 2, 3]

In [65]:
max(b, key=b.get)

5

In [67]:
d = {"111": 2, "222": 1, "000": 3, "100": 7}
getMostFrequentHaplo(d)

[1, 0, 0]

In [76]:
check = [[],[],[]]
for i, g in pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]]).iterrows():
    check[i] = list(g)

check

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [90]:
trial = clarksSplit(imputeTest)

starting sub-dataframe 0
deterministic: looking at individual 0
deterministic: looking at individual 1
deterministic: looking at individual 2
deterministic: looking at individual 3
deterministic: looking at individual 4
deterministic: looking at individual 5
deterministic: looking at individual 6
deterministic: looking at individual 7
deterministic: looking at individual 8
deterministic: looking at individual 9
deterministic: looking at individual 10
deterministic: looking at individual 11
deterministic: looking at individual 12
deterministic: looking at individual 13
deterministic: looking at individual 14
deterministic: looking at individual 15
deterministic: looking at individual 16
deterministic: looking at individual 17
deterministic: looking at individual 18
deterministic: looking at individual 19
deterministic: looking at individual 20
deterministic: looking at individual 21
deterministic: looking at individual 22
deterministic: looking at individual 23
deterministic: looking at

IndexError: list index out of range