In [1]:
from __future__ import division, absolute_import

import sys
import random
import pickle

import pandas as pd
import numpy as np
import h5py
from plotnine import *
import seaborn as sns

#root
absPath = '/home/angela3/imbalance_pcm_benchmark/'
sys.path.insert(0, absPath)

from src.Target import Target

np.random.seed(8)
random.seed(8)

Using TensorFlow backend.


In [2]:
protein_type = "kinases"
nfolds = 10

In [3]:
#Loading list of unique proteins (only those were SMOTE could be applied)
with open("".join((absPath, "data/", protein_type, "/smote_prots.pickle")), 'rb') as handle:
    unique_prots = pickle.load(handle)

In [4]:
strategies = ["resampling_before_clustering", "no_resampling", "resampling_after_clustering", "semi_resampling"]

ratios_df_completo = pd.read_csv("".join((absPath, "data/", protein_type, "/ratios_df_complete.csv")))

In [5]:
ratios_df_completo.drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1, inplace=True)

In [6]:
ratios_df_completo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9701 entries, 0 to 9700
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   DeepAffinity Protein ID  9701 non-null   object 
 1   ratio_training           9503 non-null   float64
 2   ratio_test               9701 non-null   float64
 3   ratio_test_predicted     9701 non-null   float64
 4   acc                      9701 non-null   float64
 5   auroc                    7525 non-null   float64
 6   f1                       9701 non-null   float64
 7   balanced_acc             9701 non-null   float64
 8   mcc                      9701 non-null   float64
 9   strategy                 9701 non-null   object 
 10  fold                     9701 non-null   int64  
 11  Sequence                 9701 non-null   object 
 12  family                   9701 non-null   object 
 13  Uniprot ID               9701 non-null   object 
 14  n_interactions          

In [7]:
ratios_df_completo.head()

Unnamed: 0,DeepAffinity Protein ID,ratio_training,ratio_test,ratio_test_predicted,acc,auroc,f1,balanced_acc,mcc,strategy,fold,Sequence,family,Uniprot ID,n_interactions,len_seq
0,DP71,0.521739,0.0,1.0,0.0,,0.0,0.0,0.0,resampling_before_clustering,0,MATITCTRFTEEYQLFEELGKGAFSVVRRCVKVLAGQEYAAKIINT...,PK,P11275,32,478
1,R7P7,0.536913,0.354167,0.458333,0.395833,0.368121,0.25641,0.372865,-0.244063,resampling_before_clustering,0,MADSGLDKKSTKCPDCSSASQKDVLCVCSSKTRVPPVLVVEMSQTS...,PK,Q9BYT3,217,514
2,5HXY,0.498744,0.821429,0.821429,0.857143,0.947826,0.913043,0.756522,0.513043,resampling_before_clustering,0,MAPFLRIAFNSYELGSLQAEDEANQPFCAVKMKEALSTERGKTLVQ...,PK,Q05655,578,676
3,5JM5,0.441327,0.571429,1.0,0.571429,0.583333,0.727273,0.5,0.0,resampling_before_clustering,0,MVVFNGLLKIKICEAVSLKPTAWSLRHAVGPRPQTFLLDPYIALNV...,PK,Q02156,336,737
4,8XS2,0.566613,0.222997,0.655052,0.54007,0.824797,0.47619,0.681754,0.318318,resampling_before_clustering,0,MSPFLRIGLSNFDCGSCQSCQGEAVNPYCAVLVKEYVESENGQMYI...,PK,Q04759,692,706


In [8]:
# we should not use n_interactions because that the TOTAL NUMBER of interactions, but we are interested only in test!
# we have to load te corresponding prediction on test file, check the shape and take this number as n_interactions

In [9]:
def computing_random_baseline(ratios_df_completo, strategy, prot, fold):
    subdf = ratios_df_completo[(ratios_df_completo.strategy==strategy) & (ratios_df_completo.fold == fold)]
    if subdf[subdf["DeepAffinity Protein ID"]==prot].shape[0] == 0:
        return None, None
    ratio_training = subdf.loc[subdf["DeepAffinity Protein ID"]==prot, "ratio_training"].dropna().values.mean()
    if np.isnan(ratio_training):
        return None, None
    pred_test_path = "".join((absPath, "data/", protein_type, "/", strategy, "/predictions/", str(fold), "/test.csv"))
    predtest_loaded = pd.read_csv(pred_test_path, index_col=False)
    predtest_loaded["DeepAffinity Protein ID"] = predtest_loaded["DeepAffinity Protein ID"].str.replace("b\'", "")
    predtest_loaded["DeepAffinity Protein ID"] = predtest_loaded["DeepAffinity Protein ID"].str.replace("\'", "")
    n_interactions = predtest_loaded[predtest_loaded["DeepAffinity Protein ID"]==prot].shape[0]
    #print(n_interactions)
    #n_interactions = int(subdf.loc[subdf["DeepAffinity Protein ID"]==prot, "n_interactions"].dropna().values.mean())
    #comps = predtest_loaded.loc[predtest_loaded["DeepAffinity Protein ID"]==prot, "comp_ID"].values.tolist()
    n_actives = int(round(ratio_training*n_interactions, 0))
    #print(n_actives)
    n_inactives = n_interactions-n_actives  
    #print(n_inactives)
    active_rdm_comps = [random.uniform(0.5, 1) for r in range(n_actives)]
    inactive_rdm_comps = [random.uniform(0, 0.49) for r in range(n_inactives)]
    rdm_comps = active_rdm_comps + inactive_rdm_comps
    random.shuffle(rdm_comps) 
    return rdm_comps#, comps

In [10]:
strategy = "no_resampling"
fold = 0
pred_test_path = "".join((absPath, "data/", protein_type, "/", strategy, "/predictions/", str(fold), "/test.csv"))
predtest_loaded = pd.read_csv(pred_test_path, index_col=False)
predtest_loaded

Unnamed: 0.1,Unnamed: 0,y_test,y_prob,y_pred,comp_ID,DeepAffinity Protein ID
0,0,1,0.608706,1,b'35fs',b'K57Q'
1,1,1,0.624620,1,b'dj3m',b'K57Q'
2,2,1,0.636248,1,b'n4d6',b'K57Q'
3,3,1,0.647965,1,b'w2t3',b'K57Q'
4,4,0,0.634567,1,b'au3u',b'K57Q'
...,...,...,...,...,...,...
13895,13895,1,0.788921,1,b'vs4t',b'OZTK'
13896,13896,1,0.603204,1,b'4g5m',b'5XFM'
13897,13897,0,0.675204,1,b'q2e5',b'5XFM'
13898,13898,0,0.718276,1,b'w7qw',b'5XFM'


In [11]:
#predtest_loaded.comp_ID.values.tolist()

In [12]:
list_rdm = []
for strategy in strategies:
    for fold in range(nfolds):
        for prot in unique_prots:
            dict_prot = {}
            rdm_comps = computing_random_baseline(ratios_df_completo, strategy, prot, fold)
            dict_prot["prot"] = prot
            dict_prot["strategy"] = strategy
            dict_prot["random_baseline"] = rdm_comps 
            dict_prot["fold"] = fold
            #dict_prot["comps"] = comps
            dict_prot["idx"] = range(len(rdm_comps))
            list_rdm.append(dict_prot)

  """
  ret = ret.dtype.type(ret / rcount)


In [13]:
random_baseline_df = pd.DataFrame(list_rdm)

In [14]:
list_rdm[5]

{'prot': 'K57Q',
 'strategy': 'resampling_before_clustering',
 'random_baseline': [0.3671664805805803,
  0.15073394529928802,
  0.12125762227469301,
  0.8775340739518169,
  0.7852949221515722,
  0.5114816245166505,
  0.707697027612084,
  0.787084933040394,
  0.2712862261177119,
  0.9342640388293186,
  0.34122525333764325,
  0.22062539542961393,
  0.614584177204365,
  0.820591357021335,
  0.9204948782441222,
  0.6290843635674361,
  0.3944142765230226,
  0.11110724751246348,
  0.06666042107617173,
  0.05627512866666778,
  0.06757374542611476,
  0.14146767113227096,
  0.515021742105011,
  0.4515430362234574,
  0.33746914337759737,
  0.01799978796403743,
  0.7856153501612323,
  0.11013752126377008,
  0.9738687638280399,
  0.1039340731960251,
  0.94353411099005,
  0.9807506545221643,
  0.3312167700083158,
  0.5262913097526168,
  0.25925390540200766,
  0.9930853929199023,
  0.5001219048411167,
  0.8210975439667838,
  0.7794513368990068,
  0.7794568908230504,
  0.7432169278645635,
  0.4312070

In [15]:
random_baseline_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12560 entries, 0 to 12559
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   prot             12560 non-null  object
 1   strategy         12560 non-null  object
 2   random_baseline  12560 non-null  object
 3   fold             12560 non-null  int64 
 4   idx              12560 non-null  object
dtypes: int64(1), object(4)
memory usage: 490.8+ KB


In [16]:
random_baseline_df.head()

Unnamed: 0,prot,strategy,random_baseline,fold,idx
0,DP71,resampling_before_clustering,"[0.4715245675588476, 0.6133529296905245]",0,"(0, 1)"
1,R7P7,resampling_before_clustering,"[0.11486540532661438, 0.29464282391840274, 0.5...",0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,5HXY,resampling_before_clustering,"[0.5400697536338446, 0.14671137975311047, 0.57...",0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
3,5JM5,resampling_before_clustering,"[0.024953842884140917, 0.9082306934923231, 0.7...",0,"(0, 1, 2, 3, 4, 5, 6)"
4,8XS2,resampling_before_clustering,"[0.47564515092160253, 0.9033899342321716, 0.62...",0,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


In [17]:
list_cols = ["random_baseline", "idx"]
other_cols = list(set(random_baseline_df.columns) - set(list_cols))
exploded = [random_baseline_df[col].explode() for col in list_cols]
exploded_rdm_bsl = pd.DataFrame(dict(zip(list_cols, exploded)))
exploded_rdm_bsl = random_baseline_df[other_cols].merge(exploded_rdm_bsl, how="right", 
                                                        left_index=True, right_index=True)
exploded_rdm_bsl

Unnamed: 0,prot,strategy,fold,random_baseline,idx
0,DP71,resampling_before_clustering,0,0.471525,0
0,DP71,resampling_before_clustering,0,0.613353,1
1,R7P7,resampling_before_clustering,0,0.114865,0
1,R7P7,resampling_before_clustering,0,0.294643,1
1,R7P7,resampling_before_clustering,0,0.548584,2
...,...,...,...,...,...
12557,HHD3,semi_resampling,9,,0
12557,HHD3,semi_resampling,9,,1
12558,T0K7,semi_resampling,9,,0
12558,T0K7,semi_resampling,9,,1


In [18]:
exploded_rdm_bsl.head()

Unnamed: 0,prot,strategy,fold,random_baseline,idx
0,DP71,resampling_before_clustering,0,0.471525,0
0,DP71,resampling_before_clustering,0,0.613353,1
1,R7P7,resampling_before_clustering,0,0.114865,0
1,R7P7,resampling_before_clustering,0,0.294643,1
1,R7P7,resampling_before_clustering,0,0.548584,2


In [19]:
exploded_rdm_bsl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 658548 entries, 0 to 12559
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   prot             658548 non-null  object
 1   strategy         658548 non-null  object
 2   fold             658548 non-null  int64 
 3   random_baseline  652374 non-null  object
 4   idx              658548 non-null  object
dtypes: int64(1), object(4)
memory usage: 30.1+ MB


In [20]:
exploded_rdm_bsl.to_csv("".join((absPath, "/data/", protein_type, "/random_baseline.csv")))