In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(rc={"lines.linewidth": 2}, palette  = "deep", style = "ticks")
from itertools import product, permutations, combinations, combinations_with_replacement
from sklearn.metrics import precision_recall_curve, roc_curve, auc
import matplotlib.pyplot as plt

In [3]:
model1 = "non_filter"  #non_filter OR filter OR multiply
model2 = "multiply"
hvg_version = "sct"  #sct OR normdata
data_version = "count"  #norm OR count
directory = "/home/zc354/network_2000/"

pred_path_1=directory+model1+'_'+hvg_version+'_'+data_version+'.csv'
pred_path_2=directory+model2+'_'+hvg_version+'_'+data_version+'.csv'

true_path='/gpfs/gibbs/pi/zhao/yw599/GRN/BEELINE-Networks/Networks/human/Non-specific-ChIP-seq-network.csv'

In [4]:
trueEdgesDF = pd.read_csv(true_path,sep = ',', header = 0, index_col = None)
predEdgesDF_1 = pd.read_csv(pred_path_1, sep = ',', header =  0, index_col = None)
predEdgesDF_2 = pd.read_csv(pred_path_2, sep = ',', header =  0, index_col = None)

In [5]:
#Unification changes lowercase letters to uppercase letters
trueEdgesDF['Gene1'] = trueEdgesDF['Gene1'].str.upper()
trueEdgesDF['Gene2'] = trueEdgesDF['Gene2'].str.upper()

predEdgesDF_1['Gene1'] = predEdgesDF_1['Gene1'].str.upper()
predEdgesDF_1['Gene2'] = predEdgesDF_1['Gene2'].str.upper()

predEdgesDF_2['Gene1'] = predEdgesDF_2['Gene1'].str.upper()
predEdgesDF_2['Gene2'] = predEdgesDF_2['Gene2'].str.upper()

In [6]:
#Drop index in predEdgesDF that Edgeweight=0
predEdgesDF_1.drop(predEdgesDF_1[predEdgesDF_1['Edgeweight'] == 0].index,inplace=True)

predEdgesDF_2.drop(predEdgesDF_2[predEdgesDF_2['Edgeweight'] == 0].index,inplace=True)

In [7]:
#Drop selfEdges
trueEdgesDF = trueEdgesDF.loc[(trueEdgesDF['Gene1'] != trueEdgesDF['Gene2'])]
predEdgesDF_1 = predEdgesDF_1.loc[(predEdgesDF_1['Gene1'] != predEdgesDF_1['Gene2'])]

predEdgesDF_2 = predEdgesDF_2.loc[(predEdgesDF_2['Gene1'] != predEdgesDF_2['Gene2'])]

In [8]:
#Drop duplicates in trueEdgesDF and predEdgesDF
trueEdgesDF.drop_duplicates(keep = 'first', inplace=True)
predEdgesDF_1.drop_duplicates(keep = 'first',subset=['Gene1','Gene2'],inplace=True)
predEdgesDF_2.drop_duplicates(keep = 'first',subset=['Gene1','Gene2'],inplace=True)

In [9]:
print(len(set(trueEdgesDF['Gene1'])))
print(len(set(trueEdgesDF['Gene2'])))

print(len(set(predEdgesDF_1['Gene1'])))
print(len(set(predEdgesDF_1['Gene2'])))

print(len(set(predEdgesDF_2['Gene1'])))
print(len(set(predEdgesDF_2['Gene2'])))

2157
23377
387
2000
383
1762


In [10]:
true_TF = set(trueEdgesDF['Gene1'])
true_tg = set(trueEdgesDF['Gene2'])

pred_TF_1 = set(predEdgesDF_1['Gene1'])
pred_tg_1 = set(predEdgesDF_1['Gene2'])

pred_TF_2 = set(predEdgesDF_2['Gene1'])
pred_tg_2 = set(predEdgesDF_2['Gene2'])

In [11]:
TFset_1 = (true_TF & pred_TF_1) 
tgset_1 = (true_tg & pred_tg_1)

TFset_2 = (true_TF & pred_TF_2) 
tgset_2 = (true_tg & pred_tg_2)

In [12]:
print(len(TFset_1))
print(len(TFset_2))
print(len(tgset_1))
print(len(tgset_2))

197
194
1783
1571


In [13]:
TFset = TFset_1 & TFset_2
tgset = tgset_1 & tgset_2

In [14]:
print(len(TFset))
print(len(tgset))

194
1571


In [15]:
#trueEdgesDF = trueEdgesDF[(trueEdgesDF['Gene1'].isin(TFset_2))&(trueEdgesDF['Gene2'].isin(tgset_2))]
predEdgesDF_1 = predEdgesDF_1[(predEdgesDF_1['Gene1'].isin(TFset_1))&(predEdgesDF_1['Gene2'].isin(tgset_1))]
predEdgesDF_2 = predEdgesDF_2[(predEdgesDF_2['Gene1'].isin(TFset_2))&(predEdgesDF_2['Gene2'].isin(tgset_2))]

In [16]:
TrueEdgesDF = pd.DataFrame(columns=['Edge'])
TrueEdgesDF['Edge'] = trueEdgesDF['Gene1'].map(str) +"|"+ trueEdgesDF['Gene2'].map(str)
PredEdgesDF_1 = pd.DataFrame(columns=['Edge','Weight'])
PredEdgesDF_1['Edge'] = predEdgesDF_1['Gene1'].map(str) +"|"+ predEdgesDF_1['Gene2'].map(str)
PredEdgesDF_1['Weight'] = predEdgesDF_1['Edgeweight']

PredEdgesDF_2 = pd.DataFrame(columns=['Edge','Weight'])
PredEdgesDF_2['Edge'] = predEdgesDF_2['Gene1'].map(str) +"|"+ predEdgesDF_2['Gene2'].map(str)
PredEdgesDF_2['Weight'] = predEdgesDF_2['Edgeweight']

PredEdgesDF_1 = PredEdgesDF_1.sort_values(by=['Weight'],ascending=False)
PredEdgesDF_2 = PredEdgesDF_2.sort_values(by=['Weight'],ascending=False)

In [17]:
print(len(PredEdgesDF_1))
print(len(PredEdgesDF_2))

345417
58093


In [18]:
trueEdgesset= set(TrueEdgesDF['Edge'])

In [19]:
def num_topk(predEdgesDF = predEdgesDF_1, k = 10000):
    maxk = predEdgesDF.shape[0]
    if k>maxk:
        print('k should be smaller than'+str(maxk))
    edgeweightTopk = predEdgesDF.iloc[k-1].Weight
    
    nonZeroMin = np.nanmin(predEdgesDF.Weight.replace(0, np.nan).values)
    bestVal = max(nonZeroMin, edgeweightTopk)
    dataset = {}
    newDF = predEdgesDF.loc[(predEdgesDF['Weight'] >= bestVal)]
    dataset = set(newDF['Edge'])
    
    num=len(dataset & trueEdgesset)
    return edgeweightTopk,num

In [20]:
num_topk(predEdgesDF = PredEdgesDF_2, k = 10000)

(0.0254940202455673, 282)

In [26]:
def topk(predEdgesDF = predEdgesDF_1,step=10):
    n = len(predEdgesDF)/10
    Truep = np.zeros(int(n))
    weight = np.zeros(int(n))
    for i in range(0,int(n)):
        weight[i],Truep[i] = num_topk(predEdgesDF = predEdgesDF, k = (i+1)*step)
    return weight,Truep

In [22]:
weight_1,Truep_1 = topk(predEdgesDF = PredEdgesDF_1,step=10)

In [91]:
Truep_1[1:100]

array([ 3.,  4.,  4.,  4.,  4.,  5.,  5.,  6.,  6.,  6.,  6.,  7.,  7.,
        7., 10., 10., 10., 12., 13., 13., 13., 13., 13., 14., 15., 16.,
       16., 17., 17., 17., 18., 19., 20., 21., 22., 22., 22., 23., 24.,
       24., 24., 24., 24., 25., 28., 29., 30., 30., 30., 31., 31., 32.,
       33., 34., 34., 35., 35., 36., 37., 37., 38., 38., 38., 38., 38.,
       38., 38., 40., 41., 41., 41., 41., 42., 44., 44., 47., 48., 51.,
       51., 51., 52., 53., 53., 53., 53., 53., 53., 53., 53., 53., 53.,
       54., 54., 55., 55., 55., 55., 55., 56.])

In [24]:
weight_2,Truep_2 = topk(predEdgesDF = PredEdgesDF_2,step=10)

In [104]:
Truep_2[0:100]

array([ 0.,  0.,  0.,  2.,  4.,  8.,  9., 11., 13., 14., 14., 15., 17.,
       17., 17., 18., 19., 20., 21., 21., 22., 24., 24., 26., 26., 27.,
       28., 29., 29., 30., 31., 32., 34., 35., 36., 36., 37., 37., 38.,
       38., 38., 39., 41., 41., 42., 42., 44., 44., 44., 47., 50., 51.,
       51., 51., 52., 54., 55., 55., 56., 56., 57., 57., 58., 59., 61.,
       62., 62., 62., 63., 63., 64., 65., 65., 65., 66., 67., 67., 67.,
       68., 68., 69., 70., 70., 70., 70., 71., 71., 71., 71., 72., 72.,
       72., 73., 73., 74., 74., 75., 75., 75., 76.])

In [29]:
k_1 = np.zeros(len(Truep_1))
for i in range(0,len(Truep_1)):
    k_1[i]=(i+1)*10
k_2 = np.zeros(len(Truep_2))
for i in range(0,len(Truep_2)):
    k_2[i]=(i+1)*10

In [30]:
precision_1 = Truep_1/k_1

In [56]:
precision_1[0:100]

array([0.3       , 0.15      , 0.13333333, 0.1       , 0.08      ,
       0.06666667, 0.07142857, 0.0625    , 0.06666667, 0.06      ,
       0.05454545, 0.05      , 0.05384615, 0.05      , 0.04666667,
       0.0625    , 0.05882353, 0.05555556, 0.06315789, 0.065     ,
       0.06190476, 0.05909091, 0.05652174, 0.05416667, 0.056     ,
       0.05769231, 0.05925926, 0.05714286, 0.05862069, 0.05666667,
       0.05483871, 0.05625   , 0.05757576, 0.05882353, 0.06      ,
       0.06111111, 0.05945946, 0.05789474, 0.05897436, 0.06      ,
       0.05853659, 0.05714286, 0.05581395, 0.05454545, 0.05555556,
       0.06086957, 0.06170213, 0.0625    , 0.06122449, 0.06      ,
       0.06078431, 0.05961538, 0.06037736, 0.06111111, 0.06181818,
       0.06071429, 0.06140351, 0.06034483, 0.06101695, 0.06166667,
       0.06065574, 0.06129032, 0.06031746, 0.059375  , 0.05846154,
       0.05757576, 0.05671642, 0.05588235, 0.05797101, 0.05857143,
       0.05774648, 0.05694444, 0.05616438, 0.05675676, 0.05866

In [103]:
precision_2[0:100]

array([0.        , 0.        , 0.        , 0.05      , 0.08      ,
       0.13333333, 0.12857143, 0.1375    , 0.14444444, 0.14      ,
       0.12727273, 0.125     , 0.13076923, 0.12142857, 0.11333333,
       0.1125    , 0.11176471, 0.11111111, 0.11052632, 0.105     ,
       0.1047619 , 0.10909091, 0.10434783, 0.10833333, 0.104     ,
       0.10384615, 0.1037037 , 0.10357143, 0.1       , 0.1       ,
       0.1       , 0.1       , 0.1030303 , 0.10294118, 0.10285714,
       0.1       , 0.1       , 0.09736842, 0.0974359 , 0.095     ,
       0.09268293, 0.09285714, 0.09534884, 0.09318182, 0.09333333,
       0.09130435, 0.09361702, 0.09166667, 0.08979592, 0.094     ,
       0.09803922, 0.09807692, 0.09622642, 0.09444444, 0.09454545,
       0.09642857, 0.09649123, 0.09482759, 0.09491525, 0.09333333,
       0.09344262, 0.09193548, 0.09206349, 0.0921875 , 0.09384615,
       0.09393939, 0.09253731, 0.09117647, 0.09130435, 0.09      ,
       0.09014085, 0.09027778, 0.0890411 , 0.08783784, 0.088  

In [33]:
precision_1[len(precision_1)-1]=1

In [97]:
precision_2 = Truep_2/k_2

In [102]:
Truep_2[2]/k_2[2]

0.0

In [37]:
precision_2[len(precision_2)-1]=1

In [170]:
recall_2 = Truep_2/5309

In [46]:
Truep_2

array([   0.,    0.,    0., ..., 1427., 1427., 1427.])

In [179]:
recall_1 = Truep_1/5994

In [48]:
Truep_1

array([3.000e+00, 3.000e+00, 4.000e+00, ..., 5.966e+03, 5.966e+03,
       5.966e+03])

In [49]:
recall_1[len(recall_1)-1]=1
recall_2[len(recall_2)-1]=1

In [78]:
recall_1.sort()
recall_1 = abs(np.sort(-recall_1)) 

In [52]:
recall_1 

array([1.00000000e+00, 9.95328662e-01, 9.95328662e-01, ...,
       6.67334001e-04, 5.00500501e-04, 5.00500501e-04])

In [79]:
recall_2.sort()
recall_2 = abs(np.sort(-recall_2)) 

In [54]:
auc(recall_1,precision_1)

0.02322314564246307

In [55]:
auc(recall_2,precision_2)

0.006891920460056722

In [192]:
topk_multiply_10 = pd.read_csv('/gpfs/gibbs/pi/zhao/zc354/GRN/output/topk_multiply_10.csv', sep = ',')

In [193]:
topk_multiply_10.sort_values(by=['k'])

Unnamed: 0,threshold,k,precision,recall
3258,3.800474e-01,10.0,0.000000,0.000000
3257,3.255091e-01,20.0,0.000000,0.000000
3256,2.501784e-01,40.0,0.062500,0.000377
3255,2.225428e-01,50.0,0.125000,0.000753
3254,2.094634e-01,60.0,0.163265,0.001507
...,...,...,...,...
4,2.300526e-08,58020.0,0.044004,0.268600
3,1.768529e-08,58030.0,0.043995,0.268600
2,1.119356e-08,58040.0,0.044017,0.268600
1,4.140573e-09,58060.0,0.044002,0.268789


In [194]:
topk_multiply = pd.DataFrame(weight_2)

In [195]:
topk_multiply['k']=k_2
topk_multiply['precision_k']=precision_2[0:len(precision_2)]
topk_multiply['recall_k']=recall_2[0:len(recall_2)]
topk_multiply['Truep']=Truep_2[0:len(Truep_2)]
topk_multiply.columns = ['threshold_k','k','precision_k','recall_k','Truep']

In [196]:
topk_multiply

Unnamed: 0,threshold_k,k,precision_k,recall_k,Truep
0,3.800474e-01,10.0,0.000000,0.000000,0.0
1,3.255091e-01,20.0,0.000000,0.000000,0.0
2,2.831121e-01,30.0,0.000000,0.000000,0.0
3,2.501784e-01,40.0,0.050000,0.000377,2.0
4,2.225428e-01,50.0,0.080000,0.000753,4.0
...,...,...,...,...,...
5804,7.536888e-09,58050.0,0.024582,0.268789,1427.0
5805,4.140573e-09,58060.0,0.024578,0.268789,1427.0
5806,2.667503e-09,58070.0,0.024574,0.268789,1427.0
5807,1.371241e-09,58080.0,0.024570,0.268789,1427.0


In [197]:
merge = topk_multiply_10.merge(topk_multiply, on='k', how='left')

In [198]:
merge.sort_values(by=['k']).iloc[0:20]

Unnamed: 0,threshold,k,precision,recall,threshold_k,precision_k,recall_k,Truep
3258,0.380047,10.0,0.0,0.0,0.380047,0.0,0.0,0.0
3257,0.325509,20.0,0.0,0.0,0.325509,0.0,0.0,0.0
3256,0.250178,40.0,0.0625,0.000377,0.250178,0.05,0.000377,2.0
3255,0.222543,50.0,0.125,0.000753,0.222543,0.08,0.000753,4.0
3254,0.209463,60.0,0.163265,0.001507,0.209463,0.133333,0.001507,8.0
3253,0.201559,70.0,0.160714,0.001695,0.201559,0.128571,0.001695,9.0
3252,0.197744,80.0,0.171875,0.002072,0.197744,0.1375,0.002072,11.0
3251,0.191848,90.0,0.180556,0.002449,0.191848,0.144444,0.002449,13.0
3250,0.185002,100.0,0.179487,0.002637,0.185002,0.14,0.002637,14.0
3249,0.17816,110.0,0.16092,0.002637,0.17816,0.127273,0.002637,14.0


In [199]:
merge.sort_values(by=['k']).iloc[len(merge)-20:len(merge)]

Unnamed: 0,threshold,k,precision,recall,threshold_k,precision_k,recall_k,Truep
19,2.638645e-07,57810.0,0.044198,0.2686,2.638645e-07,0.024667,0.2686,1426.0
18,2.307804e-07,57830.0,0.044183,0.2686,2.307804e-07,0.024658,0.2686,1426.0
17,2.133264e-07,57840.0,0.044172,0.2686,2.133264e-07,0.024654,0.2686,1426.0
16,1.577925e-07,57880.0,0.044129,0.2686,1.577925e-07,0.024637,0.2686,1426.0
15,1.477441e-07,57890.0,0.044123,0.2686,1.477441e-07,0.024633,0.2686,1426.0
14,1.312144e-07,57900.0,0.044113,0.2686,1.312144e-07,0.024629,0.2686,1426.0
13,1.199854e-07,57910.0,0.044106,0.2686,1.199854e-07,0.024624,0.2686,1426.0
12,1.101867e-07,57920.0,0.044097,0.2686,1.101867e-07,0.02462,0.2686,1426.0
11,1.032977e-07,57930.0,0.044086,0.2686,1.032977e-07,0.024616,0.2686,1426.0
10,8.462546e-08,57950.0,0.044069,0.2686,8.462546e-08,0.024607,0.2686,1426.0


In [200]:
merge.to_csv("/gpfs/gibbs/pi/zhao/zc354/GRN/output/topk_multiply_merge.csv",index=False,sep=',')

In [201]:
merge_multiply_non_filter = pd.DataFrame(k_2)

In [202]:
merge_multiply_non_filter ['Truep_multiply']=Truep_2
merge_multiply_non_filter ['Truep_non_filter']=Truep_1[0:len(Truep_2)]
merge_multiply_non_filter ['diff'] = merge_multiply_non_filter ['Truep_non_filter'] - merge_multiply_non_filter ['Truep_multiply']

In [203]:
merge_multiply_non_filter [800:1000]

Unnamed: 0,0,Truep_multiply,Truep_non_filter,diff
800,8010.0,249.0,374.0,125.0
801,8020.0,249.0,374.0,125.0
802,8030.0,249.0,375.0,126.0
803,8040.0,249.0,375.0,126.0
804,8050.0,250.0,375.0,125.0
...,...,...,...,...
995,9960.0,281.0,428.0,147.0
996,9970.0,282.0,429.0,147.0
997,9980.0,282.0,429.0,147.0
998,9990.0,282.0,429.0,147.0


In [204]:
merge_multiply_non_filter.to_csv("/gpfs/gibbs/pi/zhao/zc354/GRN/output/difftopk_multiply_non_filter.csv",index=False,sep=',')

In [205]:
topk_non_filter = pd.DataFrame(weight_1)
topk_non_filter['k']=k_1
topk_non_filter['precision_k']=precision_1[0:len(precision_1)]
topk_non_filter['recall_k']=recall_1[0:len(recall_1)]
topk_non_filter['Truep']=Truep_1[0:len(Truep_1)]
topk_non_filter.columns = ['threshold_k','k','precision_k','recall_k','Truep']

In [206]:
topk_non_filter_10 = pd.read_csv('/gpfs/gibbs/pi/zhao/zc354/GRN/output/topk_non_filter_10.csv', sep = ',')

In [207]:
merge_2 = topk_non_filter_10.merge(topk_non_filter, on='k', how='left')

In [208]:
merge_2.sort_values(by=['k']).iloc[0:20]

Unnamed: 0,threshold,k,precision,recall,threshold_k,precision_k,recall_k,Truep
19920,0.230108,10.0,0.333333,0.000501,0.230108,0.3,0.000501,3.0
19919,0.187355,20.0,0.157895,0.000501,0.187355,0.15,0.000501,3.0
19918,0.158202,40.0,0.117647,0.000667,0.158202,0.1,0.000667,4.0
19917,0.149167,50.0,0.1,0.000667,0.149167,0.08,0.000667,4.0
19916,0.142374,60.0,0.083333,0.000667,0.142374,0.066667,0.000667,4.0
19915,0.139339,70.0,0.087719,0.000667,0.139339,0.071429,0.000834,5.0
19914,0.131069,80.0,0.078125,0.000834,0.131069,0.0625,0.000834,5.0
19913,0.127875,90.0,0.081081,0.001001,0.127875,0.066667,0.001001,6.0
19912,0.12466,100.0,0.075,0.001001,0.12466,0.06,0.001001,6.0
19911,0.117513,110.0,0.068182,0.001001,0.117513,0.054545,0.001001,6.0


In [169]:
merge_2.sort_values(by=['k']).iloc[len(merge_2)-20:len(merge_2)]

Unnamed: 0,threshold,k,precision,recall,threshold_k,precision_k,recall_k,Truep
19,1.408824e-07,345050.0,0.029844,0.995329,1.408824e-07,0.01729,0.00367,5966.0
18,1.324932e-07,345070.0,0.029842,0.995329,1.324932e-07,0.017289,0.003504,5966.0
17,1.249213e-07,345080.0,0.029841,0.995329,1.249213e-07,0.017289,0.003337,5966.0
16,1.221999e-07,345090.0,0.02984,0.995329,1.221999e-07,0.017288,0.00317,5966.0
15,1.161635e-07,345110.0,0.029838,0.995329,1.161635e-07,0.017287,0.002836,5966.0
14,1.12148e-07,345120.0,0.029836,0.995329,1.12148e-07,0.017287,0.002836,5966.0
13,1.073529e-07,345130.0,0.029835,0.995329,1.073529e-07,0.017286,0.002836,5966.0
12,9.687267e-08,345160.0,0.029832,0.995329,9.687267e-08,0.017285,0.002503,5966.0
11,9.278334e-08,345170.0,0.029831,0.995329,9.278334e-08,0.017284,0.002336,5966.0
10,8.425933e-08,345190.0,0.029829,0.995329,8.425933e-08,0.017283,0.002169,5966.0


In [209]:
merge_2.to_csv("/gpfs/gibbs/pi/zhao/zc354/GRN/output/topk_non_filter_merge.csv",index=False,sep=',')