In [69]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import random
import itertools

In [70]:
df = pd.read_csv('new_mutations.csv')
myclass = df['class']
df = df.drop('class',axis=1).T
df.columns = myclass
people = list(df.columns)
c = [i for i in df.columns if i[0]=='C']
nc = [j for j in df.columns if j[0]=='N']
df.shape, len(c), len(nc)

((8085, 250), 108, 142)

In [71]:
# Main Functions
def shuffle_copy(x):
        b = x[:] # make a copy of the keys
        random.shuffle(b) # shuffle the copy
        return b # return the copy

avg_acc = [[],[],[]]
avg_sen = [[],[],[]]
avg_spe = [[],[],[]]
avg_pre = [[],[],[]]
avg_mis = [[],[],[]]
avg_fdr = [[],[],[]]
avg_for = [[],[],[]]

def metrics_calc(conf_mat,experiment):
    # Format of conf_mat should be: [TP, TN, FP, FN]
    TP = conf_mat[0]
    TN = conf_mat[1]
    FP = conf_mat[2]
    FN = conf_mat[3]
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    sensitivity = TP/(TP+FN)
    specificity = TN/(TN+FP)
    precision = TP/(TP+FP)
    miss_rate = FN/(FN+TP)
    false_discovery_rate = FP/(FP+TP)
    false_omission_rate = FN/(FN+TN)
    global avg_acc
    global avg_sen
    global avg_spe, avg_pre, avg_mis, avg_fdr, avg_for
    avg_acc[experiment].append(accuracy)
    avg_sen[experiment].append(sensitivity)
    avg_spe[experiment].append(specificity)
    avg_pre[experiment].append(precision)
    avg_mis[experiment].append(miss_rate)
    avg_fdr[experiment].append(false_discovery_rate)
    avg_for[experiment].append(false_omission_rate)
    return f""" 
    Accuracy is {accuracy:.2}
    Sensitivity is {sensitivity:.2}
    Specificity is {specificity:.2}
    Precision is {precision:.2}
    Miss Rate is {miss_rate:.2}
    False discovery rate is {false_discovery_rate:.2} 
    False omission rate is {false_omission_rate:.2} 
    """

def tpfp_finder(df):
    tp = (df[[i for i in df.columns if i[0]=='C']]==1).sum(axis=1)
    fp = (df[[i for i in df.columns if i[0]=='N']]==1).sum(axis=1)
    tn = (df[[i for i in df.columns if i[0]=='N']]==0).sum(axis=1)
    fn = (df[[i for i in df.columns if i[0]=='C']]==0).sum(axis=1)
    
    tpfp = tp - fp
    return tpfp


def phi_finder(df):
    
    # table of all cancer individual who has not a specific mutation
    ntrc = (df[[i for i in df.columns if i[0]=='C']]==0).sum(axis=1)
    # table of all cancer individual who has a specific mutation
    ntlc = (df[[i for i in df.columns if i[0]=='C']]==1).sum(axis=1)

    # table of all noncancer individual who has not a specific mutation
    ntrnc = (df[[i for i in df.columns if i[0]=='N']]==0).sum(axis=1)
    # table of all noncancer individual who has a specific mutation
    ntlnc = (df[[i for i in df.columns if i[0]=='N']]==1).sum(axis=1)

    # table of cancer and noncancer has specific mutation
    ntl = ntlc + ntlnc
    # table of cancer and noncancer has not a specific mutation
    ntr = ntrc + ntrnc
    # table of calculated phi value to find the feature with highest phi
    phi = 2*ntl*ntr*(1/(ntr+ntl)**2)*(abs((ntlc/ntl)-(ntrc/ntr)) + abs((ntlnc/ntl)-(ntrnc/ntr)))
    
    return phi

def gain_finder(df):
    
    # table of all cancer individual who has not a specific mutation
    ntrc = (df[[i for i in df.columns if i[0]=='C']]==0).sum(axis=1)
    # table of all cancer individual who has a specific mutation
    ntlc = (df[[i for i in df.columns if i[0]=='C']]==1).sum(axis=1)

    # table of all noncancer individual who has not a specific mutation
    ntrnc = (df[[i for i in df.columns if i[0]=='N']]==0).sum(axis=1)
    # table of all noncancer individual who has a specific mutation
    ntlnc = (df[[i for i in df.columns if i[0]=='N']]==1).sum(axis=1)

    # table of cancer and noncancer has specific mutation
    ntl = ntlc + ntlnc
    # table of cancer and noncancer has not a specific mutation
    ntr = ntrc + ntrnc
    # total of samples
    nt = ntr + ntl
    # table of calculated gain value to find the feature with highest gain
    ntc = ntrc + ntlc
    ntnc = ntrnc + ntlnc
    
    pc = ntc/nt
    pnc = ntnc/nt
    ht = -(pc*np.log2(1+pc)+pnc*np.log2(1+pnc))
    
    pl = ntl/nt
    pr = ntr/nt
    plc = ntlc/(ntl+1)
    plnc = ntlnc/(ntl+1)
    prc = ntrc/ntr
    prnc = ntrnc/ntr
    htr = -(prc*np.log2(1+prc)+prnc*np.log2(1+prnc))
    htl = -(plc*np.log2(1+plc)+plnc*np.log2(1+plnc))
    hst = pl*htl + pr*htr
    gain = ht - hst
    
    
    return gain

def decision_tree(func,df): # df here is training dataset
    root_node = func(df).sort_values(ascending=False).head(1).index[0]
    
    df1 = df[[ j for j in df.columns if (df[j].loc[root_node]==1)]]
    df1 = df1.drop(root_node)
    df2 = df[[ j for j in df.columns if (df[j].loc[root_node]==0)]]
    
    leaf_node_1 = func(df1).sort_values(ascending=False).head(1).index[0]
    leaf_node_2 = func(df2).sort_values(ascending=False).head(1).index[0]
    
    df3 = df1[[ j for j in df1.columns if (df1[j].loc[leaf_node_1]==1)]]
    df3 = df3.drop(leaf_node_1)
    c1 = [i for i in df3.columns if i[0]=='C']
    nc1 = [j for j in df3.columns if j[0]=='N']
    
    df4 = df1[[ j for j in df1.columns if (df1[j].loc[leaf_node_1]==0)]]
    c2 = [i for i in df4.columns if i[0]=='C']
    nc2 = [j for j in df4.columns if j[0]=='N']
    
    df5 = df2[[ j for j in df2.columns if (df2[j].loc[leaf_node_2]==1)]]
    df5 = df5.drop(leaf_node_2)
    c3 = [i for i in df5.columns if i[0]=='C']
    nc3 = [j for j in df5.columns if j[0]=='N']
    
    df6 = df2[[ j for j in df2.columns if (df2[j].loc[leaf_node_2]==0)]]
    c4 = [i for i in df6.columns if i[0]=='C']
    nc4 = [j for j in df6.columns if j[0]=='N']
    
    con1 = len(c1)>len(nc1)
    con2 = len(c2)>len(nc2)
    con3 = len(c3)>len(nc3)
    con4 = len(c4)>len(nc4)
    return root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4

def decision_tree_2(func,df): # df here is training dataset
    root_node = func(df).sort_values(ascending=False).head(1).index[0]
    
    df1 = df[[ j for j in df.columns if (df[j].loc[root_node]==1)]]
    df1 = df1.drop(root_node)
    df2 = df[[ j for j in df.columns if (df[j].loc[root_node]==0)]]
    
    leaf_node_1 = func(df1).sort_values(ascending=False).head(1).index[0]
    leaf_node_2 = func(df2).sort_values(ascending=False).head(1).index[0]
    
    con1=True
    con2=False
    con3=True
    con4=False
    
    return root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4

def evaluator(df, classifier, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4):
    # df here is test dataset
    TN = FN = TP = FP = 0
    individuals = list(df.columns)
    for i in range(df.shape[1]):
        if individuals[i][0] == classifier(df, individuals[i], root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4)=="C":
            TP +=1
        elif individuals[i][0:2] == classifier(df, individuals[i], root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4)=="NC":
            TN +=1
        elif individuals[i][0]=="N" and classifier(df, individuals[i], root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4)=="C":
            FP +=1
        elif individuals[i][0]=="C" and classifier(df, individuals[i], root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4)=="NC":
            FN +=1
            
    return [TP, TN, FP, FN]


def classifier(df, individual, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4):    
    # df here is test dataset
    if df[individual].loc[root_node]==1:
        if df[individual].loc[leaf_node_1]==1:
            return "C" if con1 else "NC"
        else:
            return "C" if con2 else "NC"
    else:
        if df[individual].loc[leaf_node_2]==1:
            return "C" if con3 else "NC"
        else:
            return "C" if con4 else "NC"

def average_exp(experiment):
    
    acct = sum(avg_acc[experiment])/len(avg_acc[experiment])
    sent = sum(avg_sen[experiment])/len(avg_sen[experiment])
    spet = sum(avg_spe[experiment])/len(avg_spe[experiment])
    pret = sum(avg_pre[experiment])/len(avg_pre[experiment])
    mist = sum(avg_mis[experiment])/len(avg_mis[experiment])
    fdrt = sum(avg_fdr[experiment])/len(avg_fdr[experiment])
    fort = sum(avg_for[experiment])/len(avg_for[experiment])
    mylist = [acct, sent, spet, pret, mist, fdrt, fort]
    
    print(f"""
    Avg of Accuracy is {sum(avg_acc[experiment])/len(avg_acc[experiment])}
    Avg of Sensitivity is {sum(avg_sen[experiment])/len(avg_sen[experiment])}
    Avg of Specificity is {sum(avg_spe[experiment])/len(avg_spe[experiment])}
    Avg of Precision is {sum(avg_pre[experiment])/len(avg_pre[experiment])}
    Avg of Miss Rate is {sum(avg_mis[experiment])/len(avg_mis[experiment])}
    Avg of False discovery rate is {sum(avg_fdr[experiment])/len(avg_fdr[experiment])}
    Avg of False omission rate is {sum(avg_for[experiment])/len(avg_for[experiment])}
    """)

In [72]:
people = shuffle_copy(people) 

In [73]:
#random.shuffle(list(people))

fold_1 = df[people[0:int(len(people)/3)]]
fold_2 = df[people[int(len(people)/3):int(len(people)*2/3)]]
fold_3 = df[people[int(len(people)*2/3):int(len(people))]]

train_1 = pd.concat([fold_1,fold_2], axis = 1)
test_1 = fold_3

train_2 = pd.concat([fold_2,fold_3], axis = 1)
test_2 = fold_1

train_3 = pd.concat([fold_3,fold_1], axis = 1)
test_3 = fold_2

In [74]:
fold_1.shape, fold_2.shape, fold_3.shape

((8085, 83), (8085, 83), (8085, 84))

In [75]:
## TP - FP

In [76]:
decision_tree_2(tpfp_finder,train_1)

('RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--',
 "DRD5_GRCh37_4:9785349-9785349_3'UTR_SNP_G-G-C",
 'RNF43_GRCh37_17:56435161-56435161_Frame-Shift-Del_DEL_C-C--',
 True,
 False,
 True,
 False)

In [77]:
decision_tree_2(tpfp_finder,train_2)

('RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--',
 "BTBD7_GRCh37_14:93708031-93708031_3'UTR_DEL_A-A--",
 'DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--',
 True,
 False,
 True,
 False)

In [78]:
decision_tree_2(tpfp_finder,train_3)

('RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--',
 "CSNK1G1_GRCh37_15:64461260-64461260_3'UTR_DEL_A-A--",
 'DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--',
 True,
 False,
 True,
 False)

In [79]:
root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4 = decision_tree_2(tpfp_finder, train_1)
print(metrics_calc(evaluator(
    test_1, classifier, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4),0))

 
    Accuracy is 0.68
    Sensitivity is 0.18
    Specificity is 1.0
    Precision is 1.0
    Miss Rate is 0.82
    False discovery rate is 0.0 
    False omission rate is 0.35 
    


In [80]:
root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4 = decision_tree_2(tpfp_finder, train_2)
print(metrics_calc(evaluator(
    test_2, classifier, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4),0))

 
    Accuracy is 0.65
    Sensitivity is 0.19
    Specificity is 1.0
    Precision is 1.0
    Miss Rate is 0.81
    False discovery rate is 0.0 
    False omission rate is 0.38 
    


In [81]:
root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4 = decision_tree_2(tpfp_finder, train_3)
print(metrics_calc(evaluator(
    test_3, classifier, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4),0))

 
    Accuracy is 0.57
    Sensitivity is 0.077
    Specificity is 1.0
    Precision is 1.0
    Miss Rate is 0.92
    False discovery rate is 0.0 
    False omission rate is 0.45 
    


In [82]:
average_exp(experiment=0)


    Avg of Accuracy is 0.6318129661503155
    Avg of Sensitivity is 0.15106190106190107
    Avg of Specificity is 1.0
    Avg of Precision is 1.0
    Avg of Miss Rate is 0.848938098938099
    Avg of False discovery rate is 0.0
    Avg of False omission rate is 0.39257759784075574
    


In [83]:
## Phi 

In [84]:
decision_tree(phi_finder, train_1)

('RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--',
 'USP25_GRCh37_21:17236657-17236657_Missense-Mutation_SNP_G-G-A',
 'RNF43_GRCh37_17:56435161-56435161_Frame-Shift-Del_DEL_C-C--',
 False,
 True,
 True,
 False)

In [85]:
decision_tree(phi_finder, train_2)

('RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--',
 "CAPS2_GRCh37_12:75669988-75669988_3'UTR_DEL_T-T--",
 'PPP2R1A_GRCh37_19:52715971-52715971_Missense-Mutation_SNP_C-C-G_C-C-T',
 False,
 True,
 False,
 False)

In [86]:
decision_tree(phi_finder, train_3)

('RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--',
 'LPXN_GRCh37_11:58331628-58331628_Missense-Mutation_SNP_C-C-A',
 'RNF43_GRCh37_17:56435161-56435161_Frame-Shift-Del_DEL_C-C--',
 True,
 True,
 True,
 False)

In [87]:
root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4 = decision_tree(phi_finder, train_1)
print(metrics_calc(evaluator(
    test_1, classifier, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4),1))

 
    Accuracy is 0.76
    Sensitivity is 0.39
    Specificity is 1.0
    Precision is 1.0
    Miss Rate is 0.61
    False discovery rate is 0.0 
    False omission rate is 0.28 
    


In [88]:
root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4 = decision_tree(phi_finder, train_2)
print(metrics_calc(evaluator(
    test_2, classifier, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4),1))

 
    Accuracy is 0.65
    Sensitivity is 0.19
    Specificity is 1.0
    Precision is 1.0
    Miss Rate is 0.81
    False discovery rate is 0.0 
    False omission rate is 0.38 
    


In [89]:
root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4 = decision_tree(phi_finder, train_3)
print(metrics_calc(evaluator(
    test_3, classifier, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4),1))

 
    Accuracy is 0.66
    Sensitivity is 0.31
    Specificity is 0.98
    Precision is 0.92
    Miss Rate is 0.69
    False discovery rate is 0.077 
    False omission rate is 0.39 
    


In [90]:
average_exp(experiment=1)


    Avg of Accuracy is 0.6917192579843183
    Avg of Sensitivity is 0.2986920486920487
    Avg of Specificity is 0.9924242424242425
    Avg of Precision is 0.9743589743589745
    Avg of Miss Rate is 0.7013079513079514
    Avg of False discovery rate is 0.025641025641025644
    Avg of False omission rate is 0.3496611246425924
    


In [91]:
## Gain

In [92]:
decision_tree(gain_finder,train_1)

('RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--',
 'UBE3C_GRCh37_7:156975019-156975019_Silent_SNP_T-T-C',
 'RNF43_GRCh37_17:56435161-56435161_Frame-Shift-Del_DEL_C-C--',
 False,
 True,
 True,
 False)

In [93]:
decision_tree(gain_finder,train_2)

('RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--',
 "RUNX1T1_GRCh37_8:92972382-92972383_3'UTR_INS_----A",
 'DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--',
 False,
 True,
 True,
 False)

In [94]:
decision_tree(gain_finder,train_2)

('RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--',
 "RUNX1T1_GRCh37_8:92972382-92972383_3'UTR_INS_----A",
 'DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--',
 False,
 True,
 True,
 False)

In [95]:
root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4 = decision_tree(gain_finder, train_1)
print(metrics_calc(evaluator(
    test_1, classifier, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4),2))

 
    Accuracy is 0.76
    Sensitivity is 0.39
    Specificity is 1.0
    Precision is 1.0
    Miss Rate is 0.61
    False discovery rate is 0.0 
    False omission rate is 0.28 
    


In [96]:
root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4 = decision_tree(gain_finder, train_2)
print(metrics_calc(evaluator(
    test_2, classifier, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4),2))

 
    Accuracy is 0.72
    Sensitivity is 0.36
    Specificity is 1.0
    Precision is 1.0
    Miss Rate is 0.64
    False discovery rate is 0.0 
    False omission rate is 0.33 
    


In [97]:
root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4 = decision_tree(gain_finder, train_3)
print(metrics_calc(evaluator(
    test_3, classifier, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4),2))

 
    Accuracy is 0.67
    Sensitivity is 0.33
    Specificity is 0.98
    Precision is 0.93
    Miss Rate is 0.67
    False discovery rate is 0.071 
    False omission rate is 0.38 
    


In [98]:
average_exp(experiment=2)


    Avg of Accuracy is 0.719831707783515
    Avg of Sensitivity is 0.3627946127946127
    Avg of Specificity is 0.9924242424242425
    Avg of Precision is 0.9761904761904763
    Avg of Miss Rate is 0.6372053872053871
    Avg of False discovery rate is 0.023809523809523808
    Avg of False omission rate is 0.32902438787313254
    


In [None]:
# Accuracy: imbalance data, coin is 0.5
# Sensitivity : among ind with cancer, model correctly identified only 4 % of ind with cancer, 
# 96 % has cancer but identified as no cancer FN 
# Specificity : anong ind without cancer, model correctly identified 92 % of ind without cancer,
# only 8 % no cancer but not identified
# Precision --> when model predicts that an individual has cancer, it is correct 47% of the time,
# 53 % of the time ind has no cancer but model mistakenly identified as cancer FP
# Miss rate -> a miss rate of 0.95 indicates that my model misses (false negatives) 95% of the
# individuals with cancer.
# False discovery rate -> meaning that 52% of the individuals it predicted as having cancer were
# actually negative cases.
# False omission rate -> indicating that it incorrectly classified 44% of individuals with cancer
# as not having cancer. positive cases

## Demo 5

In [None]:
def gain_finder_table(df):
    
    # table of all cancer individual who has not a specific mutation
    ntrc = (df[[i for i in df.columns if i[0]=='C']]==0).sum(axis=1)
    # table of all cancer individual who has a specific mutation
    ntlc = (df[[i for i in df.columns if i[0]=='C']]==1).sum(axis=1)

    # table of all noncancer individual who has not a specific mutation
    ntrnc = (df[[i for i in df.columns if i[0]=='N']]==0).sum(axis=1)
    # table of all noncancer individual who has a specific mutation
    ntlnc = (df[[i for i in df.columns if i[0]=='N']]==1).sum(axis=1)

    # table of cancer and noncancer has specific mutation
    ntl = ntlc + ntlnc
    # table of cancer and noncancer has not a specific mutation
    ntr = ntrc + ntrnc
    # total of samples
    nt = ntr + ntl
    # table of calculated gain value to find the feature with highest gain
    ntc = ntrc + ntlc
    ntnc = ntrnc + ntlnc
    
    pc = ntc/nt
    pnc = ntnc/nt
    ht = -(pc*np.log2(1+pc)+pnc*np.log2(1+pnc))
    
    pl = ntl/nt
    pr = ntr/nt
    plc = ntlc/(ntl+1)
    plnc = ntlnc/(ntl+1)
    prc = ntrc/ntr
    prnc = ntrnc/ntr
    htr = -(prc*np.log2(1+prc)+prnc*np.log2(1+prnc))
    htl = -(plc*np.log2(1+plc)+plnc*np.log2(1+plnc))
    hst = pl*htl + pr*htr
    gain = ht - hst
    
    list_1 = [ntl,ntr,ntlc,ntlnc,ntrc,ntrnc,pl,pr,plc,plnc,prc,prnc,htl,htr,hst,gain]
    rr = pd.concat(list_1,axis=1)
    rr.columns = [ "n(tL)","n(tR)", "n(tL,C)", "n(tL,NC)", "n(tR,C)", "n(tR,NC)",
                  "PL", "PR","PL,Ct", "PL,NCt", "PR,Ct", "PR,NCt","HtL", "HtR","H(s,t)", "gain(s)"]
    return rr

In [99]:
import math
import numpy

In [109]:
def random_forest(func,df): # df here is training dataset
    data = df.copy()
    df = df.sample(int(df.shape[0]), replace=True)
    df_out = data.loc[[i for i in data.index if i not in df.index]]
    
    root_node = func(df).sort_values(ascending=False).head(1).index[0]
    
    df1 = df[[ j for j in df.columns if (df[j].loc[root_node]==1).any()]]
    df1 = df1.drop(root_node)
    df2 = df[[ j for j in df.columns if (df[j].loc[root_node]==0).any()]]
    df1 = df1.sample(math.ceil(df1.shape[0]**0.5))
    df2 = df2.sample(math.ceil(df2.shape[0]**0.5))
    leaf_node_1 = func(df1).sort_values(ascending=False).head(1).index[0]
    leaf_node_2 = func(df2).sort_values(ascending=False).head(1).index[0]
    
    df3 = df1[[ j for j in df1.columns if (df1[j].loc[leaf_node_1]==1).any()]]
    df3 = df3.drop(leaf_node_1)
    df3 = df3.sample(math.ceil(df3.shape[0]**0.5))
    c1 = [i for i in df3.columns if i[0]=='C']
    nc1 = [j for j in df3.columns if j[0]=='N']
    
    df4 = df1[[ j for j in df1.columns if (df1[j].loc[leaf_node_1]==0).any()]]
    df4 = df4.sample(math.ceil(df4.shape[0]**0.5))
    c2 = [i for i in df4.columns if i[0]=='C']
    nc2 = [j for j in df4.columns if j[0]=='N']
    
    df5 = df2[[ j for j in df2.columns if (df2[j].loc[leaf_node_2]==1).any()]]
    df5 = df5.drop(leaf_node_2)
    df5 = df5.sample(math.ceil(df5.shape[0]**0.5))
    c3 = [i for i in df5.columns if i[0]=='C']
    nc3 = [j for j in df5.columns if j[0]=='N']
    
    df6 = df2[[ j for j in df2.columns if (df2[j].loc[leaf_node_2]==0).any()]]
    df6 = df6.sample(math.ceil(df6.shape[0]**0.5))
    c4 = [i for i in df6.columns if i[0]=='C']
    nc4 = [j for j in df6.columns if j[0]=='N']
    
    con1 = len(c1)>len(nc1)
    con2 = len(c2)>len(nc2)
    con3 = len(c3)>len(nc3)
    con4 = len(c4)>len(nc4)
    return root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4

In [110]:
random_forest(phi_finder, df)

('RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--',
 "SHC4_GRCh37_15:49117722-49117722_3'UTR_DEL_T-T--",
 'PTEN_GRCh37_10:89692904-89692904_Missense-Mutation_SNP_C-C-G',
 True,
 True,
 True,
 False)

In [111]:
random_forest(gain_finder, df)

('DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--',
 'HERC2P2_GRCh37_15:23308615-23308615_RNA_SNP_G-G-A',
 'HSD3B1_GRCh37_1:120049947-120049947_Intron_DEL_A-A--',
 False,
 True,
 True,
 False)

In [113]:
root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4 = random_forest(gain_finder, train_3)
print(metrics_calc(evaluator(
    test_3, classifier, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4),2))

 
    Accuracy is 0.6
    Sensitivity is 0.18
    Specificity is 0.98
    Precision is 0.88
    Miss Rate is 0.82
    False discovery rate is 0.12 
    False omission rate is 0.43 
    


In [115]:
print(evaluator(test_3, classifier, root_node, leaf_node_1, leaf_node_2, con1, con2, con3, con4))

[7, 43, 1, 32]


In [65]:
#1 bootstrap -> sample with with return
#2 out of the bag -> for loop over all samples
#3 apply gain method on each tree considering sqn of features and store root and leaf nodes in a list
#4 apply on out of baggage data to find the most common one and store as tp,fp,tn,or fn
#5 compute performance metrics

In [42]:
my_list = [1, 2, 3, 4, 2, 2, 3, 3, 3, 4, 4, 4, 4]

# Find the element with the maximum number of repetitions
most_common_element = max(set(my_list), key=my_list.count)

print("Most common element:", most_common_element)
print("Number of repetitions:", my_list.count(most_common_element))


Most common element: 4
Number of repetitions: 5


In [43]:
ml = ["true", "false", "false", "true", "false"]

In [44]:
most_common_element = max(set(ml), key=ml.count)
print("Most common element:", most_common_element)
print("Number of repetitions:", ml.count(most_common_element))

Most common element: false
Number of repetitions: 3


In [195]:
#nodes = [''  for j in range(2**max_depth-1)]
#conditions = [ 0  for j in range(2**max_depth)]
max_depth = 2
nodes = [[] for index in range(max_depth+1)]
conditions = []

In [196]:
nodes, conditions

([[], [], []], [])

In [197]:
def decision_tree_3(func,df,max_depth,depth=0): # df here is training dataset
    if depth > max_depth:
        return nodes, conditions
        
    df = df.sample(math.ceil(df.shape[0]**0.5))

    root_node = func(df).sort_values(ascending=False).head(1).index[0]
    nodes[depth].append(root_node)
    df1 = df[[ j for j in df.columns if (df[j].loc[root_node]==1).any()]]
    df1 = df1.drop(root_node)
    df2 = df[[ j for j in df.columns if (df[j].loc[root_node]==0).any()]]
    df1 = df1.sample(math.ceil(df1.shape[0]**0.5))
    df2 = df2.sample(math.ceil(df2.shape[0]**0.5))
    
    if depth==max_depth:
        c1 = [i for i in df1.columns if i[0]=='C']
        nc1 = [j for j in df1.columns if j[0]=='N']
        c2 = [i for i in df2.columns if i[0]=='C']
        nc2 = [j for j in df2.columns if j[0]=='N']
        con1 = len(c1)>len(nc1)
        con2 = len(c2)>len(nc2)
        conditions.append(con1)
        conditions.append(con2)
    leaf_node_1 = decision_tree_3(func,df1,max_depth,depth+1)
    leaf_node_2 = decision_tree_3(func,df2,max_depth,depth+1)

In [198]:
decision_tree_3(gain_finder,df,2)

In [199]:
nodes

[['PTEN_GRCh37_10:89692904-89692904_Missense-Mutation_SNP_C-C-G'],
 ['KCND3_GRCh37_1:112323370-112323370_Missense-Mutation_SNP_G-G-A',
  "KIAA0355_GRCh37_19:34844830-34844830_3'UTR_DEL_A-A--"],
 ['SACS_GRCh37_13:23912385-23912385_Missense-Mutation_SNP_C-C-T',
  'LRIT3_GRCh37_4:110790809-110790809_Nonsense-Mutation_SNP_G-G-T',
  'EXD2_GRCh37_14:69701577-69701577_Missense-Mutation_SNP_G-G-A',
  "C17orf66_GRCh37_17:34182059-34182059_3'UTR_SNP_G-G-T"]]

In [200]:
conditions

[False, False, False, True, False, True, True, False]

In [201]:
max_depth = 1
nodes = [[] for index in range(max_depth+1)]
conditions = []
decision_tree_3(gain_finder,df,1)

In [202]:
nodes

[["DDHD1_GRCh37_14:53513450-53513480_3'UTR_DEL_AATCAGTTTTAGGCCATTCATGTCCTTCAAG-AATCAGTTTTAGGCCATTCATGTCCTTCAAG-TCAGTTTTAGGCCATTCATGTCCTTCA"],
 ["PTEN_GRCh37_10:89725231-89725231_3'UTR_DEL_T-T--",
  'DNMT3B_GRCh37_20:31394047-31394047_Silent_SNP_G-G-A']]

In [203]:
conditions

[False, True, False, False]

In [231]:
bootstrap_list = []
out_of_bag_list = []
def bootstrap(df, num_samples):
    for i in range(num_samples):
        bootstrap_sample = df.sample(int(df.shape[1]), replace=True, axis=1)
        out_of_bag_sample = df[[j for j in df.columns if j not in bootstrap_sample.columns]]
        bootstrap_list.append(bootstrap_sample)
        out_of_bag_list.append(out_of_bag_sample)

## Part 1

In [240]:
bootstrap_list = []
out_of_bag_list = []
bootstrap(df,1)
# size of bootstrap sample and out of the bag sample
print(bootstrap_list[0].shape)
print(out_of_bag_list[0].shape)
# list of out of the bag samples
print(out_of_bag_list[0].columns)

(8085, 250)
(8085, 96)
Index(['NC1', 'C1', 'NC3', 'NC4', 'NC7', 'NC10', 'NC11', 'C7', 'NC14', 'NC18',
       'NC19', 'C10', 'NC21', 'NC22', 'C12', 'C15', 'C16', 'C17', 'C18',
       'NC28', 'C19', 'NC30', 'NC31', 'NC33', 'C23', 'NC35', 'NC37', 'C24',
       'NC39', 'NC41', 'NC44', 'C31', 'C33', 'NC47', 'C35', 'NC48', 'C36',
       'C37', 'C38', 'C39', 'NC50', 'NC51', 'C41', 'NC54', 'NC55', 'NC56',
       'C43', 'NC60', 'NC62', 'C51', 'NC65', 'C52', 'C53', 'NC67', 'NC70',
       'C56', 'C58', 'NC72', 'NC75', 'C61', 'C62', 'C63', 'NC76', 'NC77',
       'C67', 'C74', 'NC87', 'NC88', 'NC89', 'C75', 'C79', 'NC91', 'NC96',
       'NC99', 'C84', 'NC103', 'NC105', 'NC107', 'NC108', 'NC113', 'C98',
       'C99', 'NC115', 'C101', 'NC118', 'NC119', 'NC120', 'C103', 'C105',
       'NC126', 'NC128', 'NC135', 'NC139', 'C107', 'NC141', 'C108'],
      dtype='object', name='class')


In [242]:
max_depth = 2
nodes = [[] for index in range(max_depth+1)]
conditions = []
# apply decision tree depth 2 on bootstrap sample
decision_tree_3(gain_finder,bootstrap_list[0],2)
# print root nodes and child nodes
print(nodes)

[['KCNH7_GRCh37_2:163374308-163374308_Missense-Mutation_SNP_G-G-A'], ['ADCY8_GRCh37_8:131916183-131916183_Silent_SNP_G-G-A', "SMEK1_GRCh37_14:91924426-91924426_3'UTR_DEL_A-A--"], ['GTPBP4_GRCh37_10:1058589-1058589_Missense-Mutation_SNP_G-G-A', 'GTPBP4_GRCh37_10:1058589-1058589_Missense-Mutation_SNP_G-G-A', 'ANKRD27_GRCh37_19:33096837-33096837_Silent_SNP_C-C-T', 'ANKRD27_GRCh37_19:33096837-33096837_Silent_SNP_C-C-T']]


## Part 2,3 and 4

In [389]:
def random_forest(df, func, max_depth, num_trees, individual):
    bootstrap_list = []
    out_of_bag_list = []
    def bootstrap(df, num_samples):
        for i in range(num_samples):
            bootstrap_sample = df.sample(int(df.shape[1]), replace=True, axis=1)
            out_of_bag_sample = df[[j for j in df.columns if j not in bootstrap_sample.columns]]
            bootstrap_list.append(bootstrap_sample)
            out_of_bag_list.append(out_of_bag_sample)
    bootstrap(df,num_trees)
    oob_size = [out_of_bag_list[j].shape[1] for j in range(num_trees)]
    oob_size_avg = sum(oob_size)/len(oob_size)
    forest = {}
    conds = {}
    #nodes = [[] for index in range(max_depth+1)]
    #conditions = []
    def decision_tree_3(func,df,max_depth,depth=0): # df here is training dataset
        if depth > max_depth:
            return nodes, conditions
        
        df = df.sample(math.ceil(df.shape[0]**0.5))

        root_node = func(df).sort_values(ascending=False).head(1).index[0]
        nodes[depth].append(root_node)
        df1 = df[[ j for j in df.columns if (df[j].loc[root_node]==1).any()]]
        df1 = df1.drop(root_node)
        df2 = df[[ j for j in df.columns if (df[j].loc[root_node]==0).any()]]
        df1 = df1.sample(math.ceil(df1.shape[0]**0.5))
        df2 = df2.sample(math.ceil(df2.shape[0]**0.5))
    
        if depth==max_depth:
            c1 = [i for i in df1.columns if i[0]=='C']
            nc1 = [j for j in df1.columns if j[0]=='N']
            c2 = [i for i in df2.columns if i[0]=='C']
            nc2 = [j for j in df2.columns if j[0]=='N']
            con1 = len(c1)>len(nc1)
            con2 = len(c2)>len(nc2)
            conditions.append(con1)
            conditions.append(con2)
        leaf_node_1 = decision_tree_3(func,df1,max_depth,depth+1)
        leaf_node_2 = decision_tree_3(func,df2,max_depth,depth+1)
        
    for i in range(num_trees):
        nodes = [[] for index in range(max_depth+1)]
        conditions = []
        decision_tree_3(func, bootstrap_list[i], max_depth)
        forest[i]=nodes
        conds[i]=conditions
    
    def classifier_depth2(df, individual, r0,r1,r2,r3,r4,r5,r6, cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8):    
        # df here is test dataset
        #individual = list(df.columns)
        if (df[individual].loc[r0]==1).any():
            if (df[individual].loc[r1]==1).any():
                if (df[individual].loc[r3]==1).any():
                    return "C" if cn1 else "NC"
                else:
                    return "C" if cn2 else "NC"
            else:
                if (df[individual].loc[r4]==1).any():
                    return "C" if cn3 else "NC"
                else:
                    return "C" if cn4 else "NC"
        else:
            if (df[individual].loc[r2]==1).any():
                if (df[individual].loc[r5]==1).any():
                    return "C" if cn5 else "NC"
                else:
                    return "C" if cn6 else "NC"
            else:
                if (df[individual].loc[r6]==1).any():
                    return "C" if cn7 else "NC"
                else:
                    return "C" if cn8 else "NC"
    
    votes = []
    for k in range(num_trees):
        if individual in list(bootstrap_list[k].columns):
            r0 = forest[k][0][0]
            r1,r2 = forest[k][1]
            r3,r4,r5,r6 = forest[k][2]
            cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8 =conds[k]
            vot = classifier_depth2(bootstrap_list[k],individual,r0,r1,r2,r3,r4,r5,r6, cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8)
            votes.append(vot)
    majority_vote = max(set(votes), key=votes.count)
    
    return forest, oob_size, oob_size_avg, conds, votes, majority_vote

In [390]:
random_forest(df, gain_finder, 2, 10, "C10")

({0: [['PIK3CA_GRCh37_3:178916876-178916876_Missense-Mutation_SNP_G-G-A'],
   ["RPS6KB1_GRCh37_17:58024909-58024909_3'UTR_DEL_A-A--",
    'NES_GRCh37_1:156642804-156642804_Frame-Shift-Del_DEL_G-G--'],
   ["EML4_GRCh37_2:42558377-42558377_3'UTR_DEL_T-T--",
    'NAA16_GRCh37_13:41894959-41894959_Missense-Mutation_SNP_G-G-A',
    "CANX_GRCh37_5:179156881-179156881_3'UTR_DEL_T-T--",
    'NAV2_GRCh37_11:20117271-20117271_Silent_SNP_G-G-A']],
  1: [['TCF4_GRCh37_18:53068822-53068822_Intron_DEL_A-A--'],
   ['NEK10_GRCh37_3:27337131-27337131_Missense-Mutation_SNP_C-C-A',
    "MAF_GRCh37_16:79630857-79630857_3'UTR_DEL_A-A--"],
   ['C1R_GRCh37_12:7242010-7242010_Frame-Shift-Del_DEL_G-G--',
    'UGGT2_GRCh37_13:96506702-96506702_Nonsense-Mutation_SNP_G-G-A',
    'ZNF200_GRCh37_16:3283659-3283659_Nonsense-Mutation_SNP_G-G-A',
    'ATXN2L_GRCh37_16:28847350-28847350_Frame-Shift-Del_DEL_G-G--']],
  2: [['NSD1_GRCh37_5:176675269-176675269_Frame-Shift-Del_DEL_A-A--'],
   ['ZNF449_GRCh37_X:134494472-13

In [360]:
random_forest(df, gain_finder, 2, 10, "C1")

({0: [["ZFP91_GRCh37_11:58386428-58386428_3'UTR_DEL_T-T--"],
   ['ZAP70_GRCh37_2:98351785-98351785_Silent_SNP_C-C-T',
    "TMTC3_GRCh37_12:88590942-88590942_3'UTR_DEL_T-T--"],
   ['PTEN_GRCh37_10:89624267-89624267_Missense-Mutation_SNP_G-G-T',
    'ZAP70_GRCh37_2:98351785-98351785_Silent_SNP_C-C-T',
    'FBXW7_GRCh37_4:153244092-153244092_Missense-Mutation_SNP_G-G-A',
    'ALG5_GRCh37_13:37569725-37569725_Silent_SNP_G-G-A']],
  1: [["DRD5_GRCh37_4:9785349-9785349_3'UTR_SNP_G-G-C"],
   ['TENC1_GRCh37_12:53456223-53456223_Missense-Mutation_SNP_G-G-A',
    'VGLL3_GRCh37_3:87017961-87017961_Missense-Mutation_SNP_C-C-T'],
   ["WNT7A_GRCh37_3:13860282-13860282_3'UTR_DEL_T-T--",
    "KIAA1551_GRCh37_12:32145533-32145533_3'UTR_SNP_C-C-T",
    "KRTAP2-3_GRCh37_17:39215890-39215890_3'UTR_SNP_C-C-T",
    'VGLL3_GRCh37_3:87017961-87017961_Missense-Mutation_SNP_C-C-T']],
  2: [["ZNF644_GRCh37_1:91382293-91382293_3'UTR_DEL_A-A--"],
   ["PPP1CB_GRCh37_2:29023920-29023920_3'UTR_DEL_T-T--",
    "ISL1_G

In [361]:
random_forest(df, gain_finder, 2, 10, "C50")

({0: [['POTEG_GRCh37_14:19563232-19563232_RNA_SNP_T-T-C'],
   ['PNMA5_GRCh37_X:152159299-152159299_Missense-Mutation_SNP_G-G-A',
    'PRKAG2_GRCh37_7:151504031-151504031_RNA_DEL_A-A--'],
   ['MTSS1L_GRCh37_16:70698147-70698147_Frame-Shift-Del_DEL_G-G--',
    'ZKSCAN2_GRCh37_16:25252049-25252049_Silent_SNP_G-G-A',
    "BCL7A_GRCh37_12:122498697-122498698_3'UTR_INS_----T",
    'PRKAG2_GRCh37_7:151504031-151504031_RNA_DEL_A-A--']],
  1: [['KLF3_GRCh37_4:38691454-38691454_Frame-Shift-Del_DEL_C-C--'],
   ['GABRB3_GRCh37_15:26793203-26793203_Missense-Mutation_SNP_C-C-T',
    "RAB7A_GRCh37_3:128533255-128533255_3'UTR_DEL_T-T--"],
   ["ABHD13_GRCh37_13:108884458-108884458_3'UTR_DEL_A-A--",
    'TM9SF2_GRCh37_13:100191808-100191808_Silent_SNP_C-C-T',
    'TTN_GRCh37_2:179528226-179528226_Intron_SNP_C-C-A',
    'UBE2QL1_GRCh37_5:6491376-6491376_Missense-Mutation_SNP_C-C-T_C-C-A']],
  2: [["KDM5B_GRCh37_1:202697904-202697905_3'UTR_INS_----A"],
   ['HLA-DPB1_GRCh37_6:33048450-33048450_Missense-Mut

In [362]:
random_forest(df, gain_finder, 2, 10, "NC15")

({0: [['LMAN1_GRCh37_18:57013285-57013285_Splice-Site_DEL_T-T--'],
   ['PCDHGB3_GRCh37_5:140751754-140751754_Missense-Mutation_SNP_C-C-T',
    'PTENP1_GRCh37_9:33674810-33674810_RNA_SNP_A-A-G'],
   ['FPR3_GRCh37_19:52327411-52327411_Missense-Mutation_SNP_G-G-A',
    'CITED1_GRCh37_X:71522667-71522667_Missense-Mutation_SNP_G-G-A',
    'KIAA0825_GRCh37_5:93820513-93820513_Missense-Mutation_SNP_C-C-T',
    'KIAA0825_GRCh37_5:93820513-93820513_Missense-Mutation_SNP_C-C-T']],
  1: [['SYNJ2_GRCh37_6:158508009-158508009_Frame-Shift-Del_DEL_C-C--'],
   ['SPRED1_GRCh37_15:38643737-38643737_Nonsense-Mutation_SNP_C-C-T',
    "PDXK_GRCh37_21:45177870-45177870_3'UTR_SNP_T-T-A"],
   ["FMR1_GRCh37_X:147031603-147031603_3'UTR_DEL_T-T--",
    'SPRED1_GRCh37_15:38643737-38643737_Nonsense-Mutation_SNP_C-C-T',
    'ZCCHC4_GRCh37_4:25353268-25353268_Missense-Mutation_SNP_G-G-A',
    'AFF3_GRCh37_2:100199424-100199424_Missense-Mutation_SNP_G-G-A']],
  2: [["ZFP91_GRCh37_11:58386428-58386428_3'UTR_DEL_T-T--"

In [363]:
random_forest(df, gain_finder, 2, 10, "NC5")

({0: [['MGAM_GRCh37_7:141805626-141805626_Missense-Mutation_SNP_G-G-A'],
   ["KIAA2022_GRCh37_X:73958669-73958669_3'UTR_SNP_G-G-A",
    'CRELD2_GRCh37_22:50313429-50313429_Silent_SNP_C-C-T'],
   ["SIK1_GRCh37_21:44834508-44834508_3'UTR_DEL_A-A--",
    "KIAA2022_GRCh37_X:73958669-73958669_3'UTR_SNP_G-G-A",
    'ZBTB1_GRCh37_14:64989825-64989825_Nonsense-Mutation_SNP_C-C-T',
    'DIAPH2_GRCh37_X:96185790-96185790_Missense-Mutation_SNP_G-G-A']],
  1: [["SLMAP_GRCh37_3:57913670-57913670_3'UTR_DEL_T-T--"],
   ['POF1B_GRCh37_X:84559360-84559360_Missense-Mutation_SNP_G-G-A',
    'MB21D2_GRCh37_3:192516280-192516280_Frame-Shift-Del_DEL_T-T--'],
   ['TSSK1B_GRCh37_5:112769636-112769636_Missense-Mutation_SNP_C-C-T',
    'POF1B_GRCh37_X:84559360-84559360_Missense-Mutation_SNP_G-G-A',
    'BOD1L1_GRCh37_4:13610189-13610189_Frame-Shift-Del_DEL_T-T--',
    'BOD1L1_GRCh37_4:13610189-13610189_Frame-Shift-Del_DEL_T-T--']],
  2: [["RBFOX1_GRCh37_16:7762402-7762402_3'Flank_DEL_A-A--"],
   ["HMGN1_GRCh37_

In [398]:
def random_forest(df, func, max_depth, num_trees):
    bootstrap_list = []
    out_of_bag_list = []
    def bootstrap(df, num_samples):
        for i in range(num_samples):
            bootstrap_sample = df.sample(int(df.shape[1]), replace=True, axis=1)
            out_of_bag_sample = df[[j for j in df.columns if j not in bootstrap_sample.columns]]
            bootstrap_list.append(bootstrap_sample)
            out_of_bag_list.append(out_of_bag_sample)
    bootstrap(df,num_trees)
    oob_size = [out_of_bag_list[j].shape[1] for j in range(num_trees)]
    oob_size_avg = sum(oob_size)/len(oob_size)
    forest = {}
    conds = {}
    #nodes = [[] for index in range(max_depth+1)]
    #conditions = []
    def decision_tree_3(func,df,max_depth,depth=0): # df here is training dataset
        if depth > max_depth:
            return nodes, conditions
        
        df = df.sample(math.ceil(df.shape[0]**0.5))

        root_node = func(df).sort_values(ascending=False).head(1).index[0]
        nodes[depth].append(root_node)
        df1 = df[[ j for j in df.columns if (df[j].loc[root_node]==1).any()]]
        df1 = df1.drop(root_node)
        df2 = df[[ j for j in df.columns if (df[j].loc[root_node]==0).any()]]
        df1 = df1.sample(math.ceil(df1.shape[0]**0.5))
        df2 = df2.sample(math.ceil(df2.shape[0]**0.5))
    
        if depth==max_depth:
            c1 = [i for i in df1.columns if i[0]=='C']
            nc1 = [j for j in df1.columns if j[0]=='N']
            c2 = [i for i in df2.columns if i[0]=='C']
            nc2 = [j for j in df2.columns if j[0]=='N']
            con1 = len(c1)>len(nc1)
            con2 = len(c2)>len(nc2)
            conditions.append(con1)
            conditions.append(con2)
        leaf_node_1 = decision_tree_3(func,df1,max_depth,depth+1)
        leaf_node_2 = decision_tree_3(func,df2,max_depth,depth+1)
        
    for i in range(num_trees):
        nodes = [[] for index in range(max_depth+1)]
        conditions = []
        decision_tree_3(func, bootstrap_list[i], max_depth)
        forest[i]=nodes
        conds[i]=conditions
    
    def classifier_depth2(df, individual, r0,r1,r2,r3,r4,r5,r6, cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8):    
        # df here is test dataset
        #individual = list(df.columns)
        if (df[individual].loc[r0]==1).any():
            if (df[individual].loc[r1]==1).any():
                if (df[individual].loc[r3]==1).any():
                    return "C" if cn1 else "NC"
                else:
                    return "C" if cn2 else "NC"
            else:
                if (df[individual].loc[r4]==1).any():
                    return "C" if cn3 else "NC"
                else:
                    return "C" if cn4 else "NC"
        else:
            if (df[individual].loc[r2]==1).any():
                if (df[individual].loc[r5]==1).any():
                    return "C" if cn5 else "NC"
                else:
                    return "C" if cn6 else "NC"
            else:
                if (df[individual].loc[r6]==1).any():
                    return "C" if cn7 else "NC"
                else:
                    return "C" if cn8 else "NC"
    
    #votes = []
    #for k in range(num_trees):
    #    if individual in list(bootstrap_list[k].columns):
    #        r0 = forest[k][0][0]
    #        r1,r2 = forest[k][1]
    #        r3,r4,r5,r6 = forest[k][2]
    #        cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8 =conds[k]
    #        vot = classifier_depth2(bootstrap_list[k],individual,r0,r1,r2,r3,r4,r5,r6, cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8)
    #        votes.append(vot)
    #majority_vote = max(set(votes), key=votes.count)
    
    def evaluator(df, classifier, r0,r1,r2,r3,r4,r5,r6, cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8):
        # df here is test dataset
        TN = FN = TP = FP = 0
        individuals = list(df.columns)
        for i in range(df.shape[1]):
            if individuals[i][0] == classifier(df, individuals[i],r0,r1,r2,r3,r4,r5,r6, cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8)=="C":
                TP +=1
            elif individuals[i][0:2] == classifier(df, individuals[i], r0,r1,r2,r3,r4,r5,r6, cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8)=="NC":
                TN +=1
            elif individuals[i][0]=="N" and classifier(df, individuals[i],r0,r1,r2,r3,r4,r5,r6, cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8)=="C":
                FP +=1
            elif individuals[i][0]=="C" and classifier(df, individuals[i], r0,r1,r2,r3,r4,r5,r6, cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8)=="NC":
                FN +=1
            
        return [TP, TN, FP, FN]
    def metrics_calc(conf_mat):
        # Format of conf_mat should be: [TP, TN, FP, FN]
        TP = conf_mat[0]
        TN = conf_mat[1]
        FP = conf_mat[2]
        FN = conf_mat[3]
        accuracy = (TP+TN)/(TP+TN+FP+FN)
        sensitivity = TP/(TP+FN+1)
        specificity = TN/(TN+FP+1)
        precision = TP/(TP+FP+1)
        miss_rate = FN/(FN+TP+1)
        false_discovery_rate = FP/(FP+TP+1)
        false_omission_rate = FN/(FN+TN+1)

        return f""" 
        Accuracy is {accuracy:.2}
        Sensitivity is {sensitivity:.2}
        Specificity is {specificity:.2}
        Precision is {precision:.2}
        Miss Rate is {miss_rate:.2}
        False discovery rate is {false_discovery_rate:.2} 
        False omission rate is {false_omission_rate:.2} 
        """

    conf_mat = {}
    for k in range(num_trees):
        #if individual in list(out_of_bag_list[k].columns):
        r0 = forest[k][0][0]
        r1,r2 = forest[k][1]
        r3,r4,r5,r6 = forest[k][2]
        cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8 =conds[k]
        conf = evaluator(out_of_bag_list[k],classifier_depth2,r0,r1,r2,r3,r4,r5,r6, cn1,cn2,cn3,cn4,cn5,cn6,cn7,cn8)
        conf_mat[k] = conf
     
    for e in range(num_trees):
        print(metrics_calc(conf_mat[e]))
    
    return forest, conf_mat , oob_size#votes, majority_vote


In [399]:
random_forest(df, gain_finder, 2, 10)

 
        Accuracy is 0.46
        Sensitivity is 0.98
        Specificity is 0.0
        Precision is 0.46
        Miss Rate is 0.0
        False discovery rate is 0.53 
        False omission rate is 0.0 
        
 
        Accuracy is 0.61
        Sensitivity is 0.0
        Specificity is 0.98
        Precision is 0.0
        Miss Rate is 0.97
        False discovery rate is 0.0 
        False omission rate is 0.38 
        
 
        Accuracy is 0.59
        Sensitivity is 0.049
        Specificity is 0.96
        Precision is 0.5
        Miss Rate is 0.93
        False discovery rate is 0.25 
        False omission rate is 0.41 
        
 
        Accuracy is 0.42
        Sensitivity is 0.95
        Specificity is 0.0
        Precision is 0.42
        Miss Rate is 0.026
        False discovery rate is 0.57 
        False omission rate is 0.5 
        
 
        Accuracy is 0.58
        Sensitivity is 0.023
        Specificity is 0.98
        Precision is 0.5
        Miss Rate is 0

({0: [['ARID5B_GRCh37_10:63850705-63850705_Frame-Shift-Del_DEL_A-A--'],
   ['PKD1L1_GRCh37_7:47892822-47892822_Missense-Mutation_SNP_G-G-T',
    'PTEN_GRCh37_10:89692923-89692923_Missense-Mutation_SNP_G-G-A_G-G-T'],
   ['DHRS7C_GRCh37_17:9674865-9674865_Missense-Mutation_SNP_G-G-T',
    'NGF_GRCh37_1:115829175-115829175_Missense-Mutation_SNP_C-C-T_C-C-A',
    "SYDE2_GRCh37_1:85624415-85624415_3'UTR_DEL_A-A--",
    "SYDE2_GRCh37_1:85624415-85624415_3'UTR_DEL_A-A--"]],
  1: [['PLEKHG7_GRCh37_12:93135357-93135357_Missense-Mutation_SNP_G-G-A'],
   ["TBL1X_GRCh37_X:9684636-9684636_3'UTR_DEL_A-A--",
    'ZNRD1_GRCh37_6:30024687-30024687_Intron_DEL_A-A--'],
   ["C5orf30_GRCh37_5:102613439-102613439_3'UTR_DEL_A-A--",
    'ASB7_GRCh37_15:101188569-101188569_Nonsense-Mutation_SNP_C-C-T',
    'UBE4A_GRCh37_11:118239407-118239407_Silent_SNP_C-C-T',
    'ZNRD1_GRCh37_6:30024687-30024687_Intron_DEL_A-A--']],
  2: [["SHC1_GRCh37_1:154935863-154935863_3'UTR_DEL_G-G--"],
   ['PAQR9_GRCh37_3:142681357-1

In [402]:
df

class,NC1,C1,NC2,NC3,NC4,NC5,NC6,C2,C3,NC7,...,NC135,NC136,NC137,NC138,NC139,NC140,C107,NC141,C108,NC142
GOT1_GRCh37_10:101163586-101163586_Missense-Mutation_SNP_C-C-T,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TEX36_GRCh37_10:127371546-127371546_Nonsense-Mutation_SNP_G-G-A,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KIAA1217_GRCh37_10:24810824-24810824_Missense-Mutation_SNP_C-C-T,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SIRT1_GRCh37_10:69676051-69676051_Missense-Mutation_SNP_C-C-T,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MYPN_GRCh37_10:69881845-69881845_Missense-Mutation_SNP_C-C-T,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PTENP1_GRCh37_9:33674774-33674774_RNA_DEL_T-T--,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KMT2C_GRCh37_7:151836804-151836804_Nonsense-Mutation_SNP_G-G-A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
CDKN1C_GRCh37_11:2904846-2904846_3'UTR_DEL_T-T--,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PIK3R2_GRCh37_19:18278070-18278070_Missense-Mutation_SNP_A-A-G,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
