In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import random

In [2]:
df = pd.read_csv('new_mutations.csv')

In [3]:
myclass = df['class']

In [4]:
df = df.drop('class',axis=1).T

In [5]:
df.columns = myclass

In [6]:
df.head(1)

class,NC1,C1,NC2,NC3,NC4,NC5,NC6,C2,C3,NC7,...,NC135,NC136,NC137,NC138,NC139,NC140,C107,NC141,C108,NC142
GOT1_GRCh37_10:101163586-101163586_Missense-Mutation_SNP_C-C-T,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
c = [i for i in df.columns if i[0]=='C']
nc = [j for j in df.columns if j[0]=='N']
people = list(df.columns)

In [8]:
# Helper functions

def intersection(lst1, lst2):
    '''
    This function finds the intersection of two lists
    '''
    
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

def feature(data, grp):
    '''
    this function finds the best feature within a group of individuals based on TP-FP
    '''
    return(data[intersection(grp,c)].sum(axis=1)\
           -data[intersection(grp,nc)].sum(axis=1)).sort_values(ascending=False).head(1)

def classifier(data, individual, root_node, leaf_node_1, leaf_node_2):
    if data[individual].loc[root_node]==1:
        if data[individual].loc[leaf_node_1]==1:
            return "C"
        else:
            return "NC"
    else:
        if data[individual].loc[leaf_node_2]==1:
            return "C"
        else:
            return "NC" 

def evaluator(data, root_node, leaf_node_1, leaf_node_2, classifier, individuals):
    TN = FN = TP = FP = 0
    for i in range(len(individuals)):
        if individuals[i][0] == classifier(data,individuals[i],root_node, leaf_node_1, leaf_node_2)=="C":
            TP +=1
        elif individuals[i][0:2] == classifier(data,individuals[i],root_node, leaf_node_1, leaf_node_2)=="NC":
            TN +=1
        elif individuals[i][0]=="N" and classifier(data,individuals[i],root_node, leaf_node_1, leaf_node_2)=="C":
            FP +=1
        elif individuals[i][0]=="C" and classifier(data,individuals[i],root_node, leaf_node_1, leaf_node_2)=="NC":
            FN +=1
            
    return [TP, TN, FP, FN]

def metrics_calc(conf_mat):
    # Format of conf_mat should be: [TP, TN, FP, FN]
    TP = conf_mat[0]
    TN = conf_mat[1]
    FP = conf_mat[2]
    FN = conf_mat[3]
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    sensitivity = TP/(TP+FN)
    specificity = TN/(TN+FP)
    precision = TP/(TP+FP)
    miss_rate = FN/(FN+TP)
    false_discovery_rate = FP/(FP+TP)
    false_omission_rate = FN/(FN+TN)
    return f""" 
    Accuracy is {accuracy:.2}
    Sensitivity is {sensitivity:.2}
    Specificity is {specificity:.2}
    Precision is {precision:.2}
    Miss Rate is {miss_rate:.2}
    False discovery rate is {false_discovery_rate:.2} 
    False omission rate is {false_omission_rate:.2} 
    """

In [9]:
random.shuffle(list(people))
fold_1 = df[people[0:int(len(people)/3)]]
fold_2 = df[people[int(len(people)/3):int(len(people)*2/3)]]
fold_3 = df[people[int(len(people)*2/3):int(len(people))]]

In [10]:
train_1 = pd.concat([fold_1,fold_2], axis = 1)
test_1 = fold_3

train_2 = pd.concat([fold_2,fold_3], axis = 1)
test_2 = fold_1

train_3 = pd.concat([fold_3,fold_1], axis = 1)
test_3 = fold_2

In [11]:
# First
people_1 = list(train_1.columns)
feature(train_1, people_1)

RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--    16
dtype: int64

In [12]:
leaf_1 = []
leaf_2 = []
for i in range(len(people_1)):
    if df[people_1[i]]['RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--']==1:
        leaf_1.append(people_1[i])
    else:
        leaf_2.append(people_1[i])

In [13]:
feature(train_1.drop('RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--', axis=0), leaf_1)

ZFP91_GRCh37_11:58386428-58386428_3'UTR_DEL_T-T--    7
dtype: int64

In [14]:
feature(train_1, leaf_2)

KRAS_GRCh37_12:25398284-25398284_Missense-Mutation_SNP_C-C-A_C-C-T_C-C-G    10
dtype: int64

In [15]:
evaluator(data=test_1, 
          root_node="RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--",
          leaf_node_1="ZFP91_GRCh37_11:58386428-58386428_3'UTR_DEL_T-T--",
          leaf_node_2="KRAS_GRCh37_12:25398284-25398284_Missense-Mutation_SNP_C-C-A_C-C-T_C-C-G",
          classifier=classifier,
          individuals=list(test_1.columns)
         )

[2, 51, 2, 29]

In [16]:
print(metrics_calc(evaluator(data=test_1, 
          root_node="RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--",
          leaf_node_1="ZFP91_GRCh37_11:58386428-58386428_3'UTR_DEL_T-T--",
          leaf_node_2="KRAS_GRCh37_12:25398284-25398284_Missense-Mutation_SNP_C-C-A_C-C-T_C-C-G",
          classifier=classifier,
          individuals=list(test_1.columns)
                            )
                  )
     )

 
    Accuracy is 0.63
    Sensitivity is 0.065
    Specificity is 0.96
    Precision is 0.5
    Miss Rate is 0.94
    False discovery rate is 0.5 
    False omission rate is 0.36 
    


In [17]:
#Second

In [18]:
people_2 = list(train_2.columns)
root_node = feature(train_2, people_2).index[0]

In [19]:
leaf_1 = []
leaf_2 = []
for i in range(len(people_2)):
    if df[people_2[i]]["DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--"]==1:
        leaf_1.append(people_2[i])
    else:
        leaf_2.append(people_2[i])

In [20]:
leaf_node_1 = feature(train_2.drop(root_node, axis=0), leaf_1).index[0]
leaf_node_2 = feature(train_2, leaf_2).index[0]
print('root node is : ',root_node,'\n','leaf nodes are :', leaf_node_1, '\n',leaf_node_2)

root node is :  DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C-- 
 leaf nodes are : RALB_GRCh37_2:121052255-121052255_3'UTR_DEL_T-T-- 
 RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--


In [21]:
print(metrics_calc(evaluator(data=test_2, 
          root_node=root_node,
          leaf_node_1=leaf_node_1,
          leaf_node_2=leaf_node_2,
          classifier=classifier,
          individuals=list(test_2.columns)
                            )
                  )
     )

 
    Accuracy is 0.69
    Sensitivity is 0.26
    Specificity is 1.0
    Precision is 1.0
    Miss Rate is 0.74
    False discovery rate is 0.0 
    False omission rate is 0.35 
    


In [22]:
#Third

In [23]:
people_3 = list(train_3.columns)
root_node = feature(train_3, people_3).index[0]

In [24]:
leaf_1 = []
leaf_2 = []
pp = people_3
for i in range(len(pp)):
    if df[pp[i]][root_node]==1:
        leaf_1.append(pp[i])
    else:
        leaf_2.append(pp[i])

In [25]:
leaf_node_1 = feature(train_3.drop(root_node, axis=0), leaf_1).index[0]
leaf_node_2 = feature(train_3, leaf_2).index[0]
print('root node is : ',root_node,'\n','leaf nodes are :', leaf_node_1, '\n',leaf_node_2)

root node is :  RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T-- 
 leaf nodes are : DRD5_GRCh37_4:9785349-9785349_3'UTR_SNP_G-G-C 
 DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--


In [26]:
print(metrics_calc(evaluator(data=test_3, 
          root_node=root_node,
          leaf_node_1=leaf_node_1,
          leaf_node_2=leaf_node_2,
          classifier=classifier,
          individuals=list(test_3.columns)
                            )
                  )
     )

 
    Accuracy is 0.58
    Sensitivity is 0.17
    Specificity is 1.0
    Precision is 1.0
    Miss Rate is 0.83
    False discovery rate is 0.0 
    False omission rate is 0.46 
    


In [27]:
avg_acc = (.58+.69+.63)/3
avg_sen = (.17+.26+.065)/3
avg_spe = (1+1+.96)/3

In [28]:
print(avg_acc, avg_sen, avg_spe)

0.6333333333333333 0.165 0.9866666666666667


In [29]:
people[:5]

['NC1', 'C1', 'NC2', 'NC3', 'NC4']

In [30]:
all_mut = list(df.index)

In [31]:
len(all_mut)

8085

In [32]:
# PHI Function

In [33]:
feature_table = {
    'n(tL)' : 8085*[0],
    'n(tR)' : 8085*[0],
    'n(tL,C)' : 8085*[0],
    'n(tL,NC)' : 8085*[0],
    'n(tR,C)' : 8085*[0],
    'n(tR,NC)' : 8085*[0],
    'PL' : 8085*[0],
    'PR' : 8085*[0],
    'P(C|tL)' :8085*[0],
    'P(NC|tL)' :8085*[0],
    'P(C|tR)' : 8085*[0],
    'P(NC|tR)' : 8085*[0],
    '2PLPR' : 8085*[0],
    'Q(s|t)' : 8085*[0],
    'Phi(s,t)':8085*[0],
}

In [34]:
data = pd.DataFrame(feature_table, index = all_mut)

In [35]:
data.head(1)

Unnamed: 0,n(tL),n(tR),"n(tL,C)","n(tL,NC)","n(tR,C)","n(tR,NC)",PL,PR,P(C|tL),P(NC|tL),P(C|tR),P(NC|tR),2PLPR,Q(s|t),"Phi(s,t)"
GOT1_GRCh37_10:101163586-101163586_Missense-Mutation_SNP_C-C-T,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
len(people), len(c), len(nc)

(250, 108, 142)

In [37]:
# part 1
p_ct = len(c)/len(people)
p_nct = len(nc)/len(people)

print('P(C,t): ', round(p_ct,2), '\n', 'P(NC,t): ', round(p_nct,2))

P(C,t):  0.43 
 P(NC,t):  0.57


In [38]:
# Solution #1
#pp = people
#leftpp = rightpp = []
#for j in all_mut:
#    for i in range(len(pp)):
#        if df[pp[i]][j]==1:
#            data['n(tL)'].loc[j]+=1
#            leftpp.append(pp[i])
#            if pp[i][0]=='C':
#data['n(tL,C)'].loc[j]+=1
#            else:
#                data['n(tL,NC)'].loc[j]+=1
#        else:
#            data['n(tR)'].loc[j]+=1
#            rightpp.append(pp[i])
#            if pp[i][0]=='C':
#                data['n(tR,C)'].loc[j]+=1
#            else:
#                data['n(tR,NC)'].loc[j]+=1

In [39]:
#data.sort_values(by='n(tL)', ascending=False)

In [40]:
data['PL'] = data['n(tL)']/len(people)
data['PR'] = data['n(tR)']/len(people)
data['P(C|tL)'] = data['n(tL,C)']/data['n(tL)']
data['P(NC|tL)'] = data['n(tL,NC)']/data['n(tL)']
data['P(C|tR)'] = data['n(tR,C)']/data['n(tR)']
data['P(NC|tR)'] = data['n(tR,NC)']/data['n(tR)']
data['2PLPR'] = 2 * data['PL'] * data['PR']
data['Q(s|t)'] = abs(data['P(C|tL)']-data['P(C|tR)']) + abs(data['P(NC|tL)']-data['P(NC|tR)'])
data['Phi(s,t)'] = data['2PLPR'] * data['Q(s|t)']

In [41]:
data.sort_values(by='Phi(s,t)', ascending=False).head(10)

Unnamed: 0,n(tL),n(tR),"n(tL,C)","n(tL,NC)","n(tR,C)","n(tR,NC)",PL,PR,P(C|tL),P(NC|tL),P(C|tR),P(NC|tR),2PLPR,Q(s|t),"Phi(s,t)"
GOT1_GRCh37_10:101163586-101163586_Missense-Mutation_SNP_C-C-T,0,0,0,0,0,0,0.0,0.0,,,,,0.0,,
TEX36_GRCh37_10:127371546-127371546_Nonsense-Mutation_SNP_G-G-A,0,0,0,0,0,0,0.0,0.0,,,,,0.0,,
KIAA1217_GRCh37_10:24810824-24810824_Missense-Mutation_SNP_C-C-T,0,0,0,0,0,0,0.0,0.0,,,,,0.0,,
SIRT1_GRCh37_10:69676051-69676051_Missense-Mutation_SNP_C-C-T,0,0,0,0,0,0,0.0,0.0,,,,,0.0,,
MYPN_GRCh37_10:69881845-69881845_Missense-Mutation_SNP_C-C-T,0,0,0,0,0,0,0.0,0.0,,,,,0.0,,
LARP4B_GRCh37_10:890939-890939_Frame-Shift-Del_DEL_T-T--,0,0,0,0,0,0,0.0,0.0,,,,,0.0,,
NPAT_GRCh37_11:108032094-108032094_Missense-Mutation_SNP_G-G-T_G-G-A,0,0,0,0,0,0,0.0,0.0,,,,,0.0,,
ARNTL_GRCh37_11:13402723-13402723_Silent_SNP_G-G-A,0,0,0,0,0,0,0.0,0.0,,,,,0.0,,
COPB1_GRCh37_11:14480092-14480092_Missense-Mutation_SNP_G-G-A_G-G-T,0,0,0,0,0,0,0.0,0.0,,,,,0.0,,
PDE3B_GRCh37_11:14882800-14882800_Missense-Mutation_SNP_G-G-A,0,0,0,0,0,0,0.0,0.0,,,,,0.0,,


In [42]:
def phi_finder(df):
    
    # table of all cancer individual who has not a specific mutation
    ntrc = (df[[i for i in df.columns if i[0]=='C']]==0).sum(axis=1)
    # table of all cancer individual who has a specific mutation
    ntlc = (df[[i for i in df.columns if i[0]=='C']]==1).sum(axis=1)

    # table of all noncancer individual who has not a specific mutation
    ntrnc = (df[[i for i in df.columns if i[0]=='N']]==0).sum(axis=1)
    # table of all noncancer individual who has a specific mutation
    ntlnc = (df[[i for i in df.columns if i[0]=='N']]==1).sum(axis=1)

    # table of cancer and noncancer has specific mutation
    ntl = ntlc + ntlnc
    # table of cancer and noncancer has not a specific mutation
    ntr = ntrc + ntrnc
    # table of calculated phi value to find the feature with highest phi
    phi = (2*ntl*ntr*(1/len(ntr+ntl)**2)*(abs((ntlc/ntl)-(ntrc/ntr)) + abs((ntlnc/ntl)-(ntrnc/ntr))))
    
    return phi

In [43]:
def decison_tree(func,df):
    root_node = func(df).sort_values(ascending=False).head(1).index[0]
    
    df1 = df[[ j for j in df.columns if (df[j].loc[root_node]==1)]]
    df1 = df1.drop(root_node)
    df2 = df[[ j for j in df.columns if (df[j].loc[root_node]==0)]]
    
    leaf_node_1 = phi_finder(df1).sort_values(ascending=False).head(1).index[0]
    leaf_node_2 = phi_finder(df2).sort_values(ascending=False).head(1).index[0]
    #f' Root Node is: {root_node}, Leaf Node #1 is: {leaf_node_1}, Leaf Node #2 is: {leaf_node_2}'
    return root_node, leaf_node_1, leaf_node_2

In [44]:
decison_tree(phi_finder, df)

('RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--',
 'FAM13B_GRCh37_5:137278834-137278834_Frame-Shift-Del_DEL_A-A--',
 'DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--')

In [45]:
# find mutation with highest phi and store in root_node
root_node = phi_finder(df).sort_values(ascending=False).head(1).index[0]

In [46]:
root_node

'RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--'

In [47]:
df1 = df[[ j for j in df.columns if (df[j].loc[root_node]==1)]]
df1 = df1.drop(root_node)
df2 = df[[ j for j in df.columns if (df[j].loc[root_node]==0)]]

In [48]:
leaf_node_1 = phi_finder(df1).sort_values(ascending=False).head(1).index[0]
leaf_node_2 = phi_finder(df2).sort_values(ascending=False).head(1).index[0]

In [49]:
leaf_node_1 , leaf_node_2 

('FAM13B_GRCh37_5:137278834-137278834_Frame-Shift-Del_DEL_A-A--',
 'DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--')

In [50]:
df3 = df1[[ j for j in df1.columns if (df1[j].loc[leaf_node_1]==1)]]
df3 = df3.drop(leaf_node_1)
df4 = df1[[ j for j in df1.columns if (df1[j].loc[leaf_node_1]==0)]]

df5 = df2[[ j for j in df2.columns if (df2[j].loc[leaf_node_2]==1)]]
df5 = df5.drop(leaf_node_2)
df6 = df2[[ j for j in df2.columns if (df2[j].loc[leaf_node_2]==0)]]

In [57]:
df3.head(1)

class,NC49
GOT1_GRCh37_10:101163586-101163586_Missense-Mutation_SNP_C-C-T,0


In [None]:
def classifier(func, df, individual, root_node, leaf_node_1, leaf_node_2):
    
    root_node = func(df).sort_values(ascending=False).head(1).index[0]
    
    df1 = df[[ j for j in df.columns if (df[j].loc[root_node]==1)]]
    df1 = df1.drop(root_node)
    df2 = df[[ j for j in df.columns if (df[j].loc[root_node]==0)]]
    
    leaf_node_1 = func(df1).sort_values(ascending=False).head(1).index[0]
    leaf_node_2 = func(df2).sort_values(ascending=False).head(1).index[0]
    
    df3 = df1[[ j for j in df1.columns if (df1[j].loc[leaf_node_1]==1)]]
    df3 = df3.drop(leaf_node_1)
    df4 = df1[[ j for j in df1.columns if (df1[j].loc[leaf_node_1]==0)]]

    df5 = df2[[ j for j in df2.columns if (df2[j].loc[leaf_node_2]==1)]]
    df5 = df5.drop(leaf_node_2)
    df6 = df2[[ j for j in df2.columns if (df2[j].loc[leaf_node_2]==0)]]
    
    if df[individual].loc[root_node]==1:
        if df[individual].loc[leaf_node_1]==1:
            c1 = [i for i in df3.columns if i[0]=='C']
            nc1 = [j for j in df3.columns if j[0]=='N']
            if len(c1)>len(nc1):
                return "C"
            else:
                return "NC"
        else:
            c2 = [i for i in df4.columns if i[0]=='C']
            nc2 = [j for j in df4.columns if j[0]=='N']
            if len(c2)>len(nc2):
                return "C"
            else:
                return "NC"
    else:
        if df[individual].loc[leaf_node_2]==1:
            c3 = [i for i in df5.columns if i[0]=='C']
            nc3 = [j for j in df5.columns if j[0]=='N']
            if len(c3)>len(nc3):
                return "C"
            else:
                return "NC"
        else:
            c4 = [i for i in df6.columns if i[0]=='C']
            nc4 = [j for j in df6.columns if j[0]=='N']
            if len(c4)>len(nc4):
                return "C"
            else:
                return "NC"

In [59]:
df4.head(1)

class,C3,C5,C11,C13,C15,C18,C21,C24,C26,C27,...,C78,C83,C86,C90,C91,C95,C97,C100,C103,C105
GOT1_GRCh37_10:101163586-101163586_Missense-Mutation_SNP_C-C-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
leaf_node_3 = phi_finder(df3).sort_values(ascending=False).head(1).index[0]
leaf_node_4 = phi_finder(df4).sort_values(ascending=False).head(1).index[0]
leaf_node_5 = phi_finder(df5).sort_values(ascending=False).head(1).index[0]
leaf_node_6 = phi_finder(df6).sort_values(ascending=False).head(1).index[0]

In [53]:
leaf_node_3 ,leaf_node_4 ,leaf_node_5 ,leaf_node_6 

('GOT1_GRCh37_10:101163586-101163586_Missense-Mutation_SNP_C-C-T',
 'LPXN_GRCh37_11:58331628-58331628_Missense-Mutation_SNP_C-C-A',
 'GOT1_GRCh37_10:101163586-101163586_Missense-Mutation_SNP_C-C-T',
 'PPP2R1A_GRCh37_19:52715971-52715971_Missense-Mutation_SNP_C-C-G_C-C-T')

In [54]:
avg_acc = avg_sen = avg_spe = avg_pre = avg_mis = avg_fdr = avg_for = []

def classifier(data, individual, root_node, leaf_node_1, leaf_node_2):
    if data[individual].loc[root_node]==1:
        if data[individual].loc[leaf_node_1]==1:
            return "C"
        else:
            return "NC"
    else:
        if data[individual].loc[leaf_node_2]==1:
            return "C"
        else:
            return "NC" 

def evaluator(data, root_node, leaf_node_1, leaf_node_2, classifier, individuals):
    TN = FN = TP = FP = 0
    for i in range(len(individuals)):
        if individuals[i][0] == classifier(data,individuals[i],root_node, leaf_node_1, leaf_node_2)=="C":
            TP +=1
        elif individuals[i][0:2] == classifier(data,individuals[i],root_node, leaf_node_1, leaf_node_2)=="NC":
            TN +=1
        elif individuals[i][0]=="N" and classifier(data,individuals[i],root_node, leaf_node_1, leaf_node_2)=="C":
            FP +=1
        elif individuals[i][0]=="C" and classifier(data,individuals[i],root_node, leaf_node_1, leaf_node_2)=="NC":
            FN +=1
            
    return [TP, TN, FP, FN]

def metrics_calc(conf_mat):
    # Format of conf_mat should be: [TP, TN, FP, FN]
    TP = conf_mat[0]
    TN = conf_mat[1]
    FP = conf_mat[2]
    FN = conf_mat[3]
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    sensitivity = TP/(TP+FN)
    specificity = TN/(TN+FP)
    precision = TP/(TP+FP)
    miss_rate = FN/(FN+TP)
    false_discovery_rate = FP/(FP+TP)
    false_omission_rate = FN/(FN+TN)
    return f""" 
    Accuracy is {accuracy:.2}
    Sensitivity is {sensitivity:.2}
    Specificity is {specificity:.2}
    Precision is {precision:.2}
    Miss Rate is {miss_rate:.2}
    False discovery rate is {false_discovery_rate:.2} 
    False omission rate is {false_omission_rate:.2} 
    """

def phi_finder(df):
    
    # table of all cancer individual who has not a specific mutation
    ntrc = (df[[i for i in df.columns if i[0]=='C']]==0).sum(axis=1)
    # table of all cancer individual who has a specific mutation
    ntlc = (df[[i for i in df.columns if i[0]=='C']]==1).sum(axis=1)

    # table of all noncancer individual who has not a specific mutation
    ntrnc = (df[[i for i in df.columns if i[0]=='N']]==0).sum(axis=1)
    # table of all noncancer individual who has a specific mutation
    ntlnc = (df[[i for i in df.columns if i[0]=='N']]==1).sum(axis=1)

    # table of cancer and noncancer has specific mutation
    ntl = ntlc + ntlnc
    # table of cancer and noncancer has not a specific mutation
    ntr = ntrc + ntrnc
    # table of calculated phi value to find the feature with highest phi
    phi = (2*ntl*ntr*(1/len(ntr+ntl)**2)*(abs((ntlc/ntl)-(ntrc/ntr)) + abs((ntlnc/ntl)-(ntrnc/ntr))))
    
    return phi

def decison_tree(func,df):
    root_node = func(df).sort_values(ascending=False).head(1).index[0]
    
    df1 = df[[ j for j in df.columns if (df[j].loc[root_node]==1)]]
    df1 = df1.drop(root_node)
    df2 = df[[ j for j in df.columns if (df[j].loc[root_node]==0)]]
    
    leaf_node_1 = phi_finder(df1).sort_values(ascending=False).head(1).index[0]
    leaf_node_2 = phi_finder(df2).sort_values(ascending=False).head(1).index[0]
    #f' Root Node is: {root_node}, Leaf Node #1 is: {leaf_node_1}, Leaf Node #2 is: {leaf_node_2}'
    return root_node, leaf_node_1, leaf_node_2

In [55]:
random.shuffle(list(people))
fold_1 = df[people[0:int(len(people)/3)]]
fold_2 = df[people[int(len(people)/3):int(len(people)*2/3)]]
fold_3 = df[people[int(len(people)*2/3):int(len(people))]]

train_1 = pd.concat([fold_1,fold_2], axis = 1)
test_1 = fold_3

train_2 = pd.concat([fold_2,fold_3], axis = 1)
test_2 = fold_1

train_3 = pd.concat([fold_3,fold_1], axis = 1)
test_3 = fold_2