# Generate graph with all ROC curves

In [None]:
# import libraries
import pandas as pd
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
import numpy as np
import prolif as plf
import math
import json
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

import matplotlib.pyplot as plt


In [None]:
# Per-interaction weights passed to get_score() as `bonds`: `bonds_vina` is used
# for the Vina interaction-fingerprint CSVs and `bonds_glide` for the Glide ones.
# Keys are matched against the column labels of those CSVs.
# NOTE(review): keys look like "<residue>.<chain>_<interaction type>" -- confirm
# against the fingerprint files; how the weights were derived is not shown here.
bonds_vina = {'MET153.A_Hydrophobic': 4.485348493097691, 'MET156.A_VdWContact': -0.0005225441441997642, 'PHE368.A_Hydrophobic': -1.472953537108647, 'LYS105.A_VdWContact': 5.179949668510255, 'TYR155.A_Hydrophobic': 3.0517960554018715, 'LEU205.A_Hydrophobic': 1.3990129846678374, 'MET153.A_VdWContact': -1.0208939152633199, 'PHE87.A_VdWContact': -4.256997923875001, 'GLU154.A_VdWContact': -5.249035130690076, 'LYS105.A_PiCation': 0.8233262096908224, 'ASP216.A_VdWContact': -2.2292151597256336, 'PHE87.A_Hydrophobic': 0.8349236708553064, 'VAL137.A_Hydrophobic': 0.4925871518824879, 'PHE368.A_VdWContact': 1.9889970949281492, 'GLY88.A_VdWContact': -0.38220713404859974, 'ALA215.A_VdWContact': 1.6887577179767912, 'PHE120.A_Hydrophobic': 2.737170669142102, 'PHE120.A_VdWContact': 0.2, 'VAL90.A_Hydrophobic': -1.5213595337085197, 'VAL137.A_VdWContact': 2.24091184166182, 'PHE120.A_PiStacking': 4.060579265482197, 'ASP160.A_VdWContact': -1.1894967803254866, 'ALA86.A_VdWContact': 3.3919021465071424, 'VAL90.A_VdWContact': 0.4028038880884586, 'ASP202.A_VdWContact': 0.16800468220803028, 'ASN203.A_VdWContact': 0.3563418978956986, 'GLU89.A_VdWContact': 0.08, 'ILE82.A_VdWContact': -0.9645744605196063, 'ALA103.A_VdWContact': 3.6451498899135246, 'MET128.A_Hydrophobic': 0.04, 'ILE82.A_Hydrophobic': -1.5229050515789935, 'LYS200.A_VdWContact': 0.04, 'SER116.A_VdWContact': 2.9333053112036795, 'SER118.A_VdWContact': -3.158856124832565, 'LEU205.A_VdWContact': -0.07686935506500725}
bonds_glide = {'MET153.A_Hydrophobic': 0.84, 'MET156.A_VdWContact': 4.38647000624677, 'PHE368.A_Hydrophobic': 0.84, 'LYS105.A_VdWContact': 5.945898068663053, 'TYR155.A_Hydrophobic': 1.7772209283676828, 'LEU205.A_Hydrophobic': 3.6575552174288277, 'MET153.A_VdWContact': -1.64681891801749, 'PHE87.A_VdWContact': -0.9715120305947635, 'GLU154.A_VdWContact': 2.525721827497242, 'LYS105.A_PiCation': 8.28901702882621, 'ASP216.A_VdWContact': 0.37748685695342554, 'PHE87.A_Hydrophobic': 1.3372209283676828, 'VAL137.A_Hydrophobic': -1.8709829965975266, 'PHE368.A_VdWContact': -1.3194280624162824, 'GLY88.A_VdWContact': 1.2636909128609273, 'ALA215.A_VdWContact': 0.2, 'PHE120.A_Hydrophobic': -1.3994280624162825, 'PHE120.A_VdWContact': 2.4688780984537737, 'VAL90.A_Hydrophobic': -4.0099294855315035, 'VAL137.A_VdWContact': 0.16, 'PHE120.A_PiStacking': 0.16, 'ASP160.A_VdWContact': -2.450179005857511, 'ALA86.A_VdWContact': 0.8379453487529984, 'VAL90.A_VdWContact': 0.12, 'ASP202.A_VdWContact': -3.109598015506755, 'ASN203.A_VdWContact': -1.0942219755365032, 'GLU89.A_VdWContact': 0.08, 'ILE82.A_VdWContact': -2.490179005857511, 'ALA103.A_VdWContact': 3.7203398885308387, 'MET128.A_Hydrophobic': 0.04, 'ILE82.A_Hydrophobic': 0.04, 'LYS200.A_VdWContact': 2.693172001621254, 'SER116.A_VdWContact': 0.04, 'SER118.A_VdWContact': 0.04, 'LEU205.A_VdWContact': 0.04}

In [None]:
# Read the protein molecule that interaction fingerprints (IFs) are calculated on.
# The PDB file is parsed with RDKit (removeHs=False, i.e. hydrogens are not
# stripped) and the result is wrapped in a ProLIF Molecule.
# NOTE(review): `protein` is not referenced again in the visible cells, and
# nothing is saved here -- confirm whether this cell is still needed.
protein_file = "../materials/2etr.pdb"
rdkit_prot = Chem.MolFromPDBFile(protein_file, removeHs=False)
protein = plf.Molecule(rdkit_prot)

In [None]:

def get_score(row, bonds, docked_score, coef1, coef2):
    """Combine an interaction-fingerprint match term with a docking score.

    The interaction term is the summed weight of the interactions in ``bonds``
    that are present in ``row``, divided by the summed weight of all
    interactions in ``bonds`` and scaled by 10. Interactions present in
    ``row`` but missing from ``bonds`` do not influence the result.

    Args:
        row: pandas Series; every label whose value compares equal to ``True``
            is treated as an interaction that was found.
        bonds: Mapping of interaction label to numeric weight.
        docked_score: Docking score of the pose; it enters the result negated.
        coef1: Weight of the interaction term.
        coef2: Weight of the negated docking score.

    Returns:
        ``coef1 * interaction_term - coef2 * docked_score``. The interaction
        term is taken as 0.0 when the weights in ``bonds`` sum to zero (for
        example an empty mapping), because the ratio is undefined there.
    """
    # Element-wise comparison on purpose: the row may also carry non-boolean
    # cells (ids, scores), which only count if they compare equal to True.
    bonds_found = set(row[row == True].index.tolist())  # noqa: E712

    # Plain running sums, in the iteration order of `bonds`.
    common_sum = 0
    total_sum = 0
    for bond, num in bonds.items():
        total_sum = total_sum + num
        if bond in bonds_found:
            common_sum = common_sum + num

    if total_sum == 0:
        # No usable reference weights: fall back to the docking term only
        # instead of failing with ZeroDivisionError.
        score_temp = 0.0
    else:
        score_temp = (common_sum / total_sum) * 10

    return coef1 * score_temp - coef2 * docked_score


### Vina scores

In [None]:
# actives
# Best (highest) score per molecule over all of its docked Vina poses, computed
# for two weightings. Each CSV is read once and every row is scored twice.
# Assumes the files actives_ifs_0.csv ... actives_ifs_8.csv exist.
actives_best_original = {}  # coef1=0.0, coef2=1.0: negated docking score only
actives_best_rescored = {}  # coef1=0.3, coef2=0.7: IF term plus negated docking score
for x in range(9):
    df_actives = pd.read_csv(f"../materials/actives_decoys/ifs_vina/actives/actives_ifs_{x}.csv")
    for _, row in df_actives.iterrows():
        mol_id = row["idx"]
        docked_score = row["docked_score"]
        for best, coef1, coef2 in (
            (actives_best_original, 0.0, 1.0),
            (actives_best_rescored, 0.3, 0.7),
        ):
            score = get_score(row, bonds_vina, docked_score, coef1, coef2)
            # keep only the highest-scoring pose of each molecule
            if mol_id not in best or best[mol_id] < score:
                best[mol_id] = score

a_scores_vina_original = [float(best_score) for best_score in actives_best_original.values()]
print(a_scores_vina_original)

a_scores_vina_rescored = [float(best_score) for best_score in actives_best_rescored.values()]
print(a_scores_vina_rescored)

In [None]:
# decoys
# Best (highest) score per molecule over all of its docked Vina poses, computed
# for two weightings. Each CSV is read once and every row is scored twice.
# Assumes the files decoys_ifs_0.csv ... decoys_ifs_56.csv exist.
decoys_best_original = {}  # coef1=0.0, coef2=1.0: negated docking score only
decoys_best_rescored = {}  # coef1=0.3, coef2=0.7: IF term plus negated docking score
for x in range(57):
    df_decoys = pd.read_csv(f"../materials/actives_decoys/ifs_vina/decoys/decoys_ifs_{x}.csv")
    for _, row in df_decoys.iterrows():
        mol_id = row["idx"]
        docked_score = row["docked_score"]
        for best, coef1, coef2 in (
            (decoys_best_original, 0.0, 1.0),
            (decoys_best_rescored, 0.3, 0.7),
        ):
            score = get_score(row, bonds_vina, docked_score, coef1, coef2)
            # keep only the highest-scoring pose of each molecule
            if mol_id not in best or best[mol_id] < score:
                best[mol_id] = score

d_scores_vina_original = [float(best_score) for best_score in decoys_best_original.values()]
print(d_scores_vina_original)

d_scores_vina_rescored = [float(best_score) for best_score in decoys_best_rescored.values()]
print(d_scores_vina_rescored)

## Glide

In [None]:
# actives
# Best (highest) score per molecule over all of its docked Glide poses, computed
# for two weightings. Each CSV is read once and every row is scored twice.
# Assumes the files actives_0.csv ... actives_23.csv exist.
actives_best_original = {}  # coef1=0.0, coef2=1.0: negated docking score only
actives_best_rescored = {}  # coef1=0.1, coef2=0.9: IF term plus negated docking score
for x in range(24):
    df_actives = pd.read_csv(f"../materials/actives_decoys/ifs_glide/actives/actives_{x}.csv")
    for _, row in df_actives.iterrows():
        mol_id = row["mol_id"]
        docked_score = row["docked_score"]
        for best, coef1, coef2 in (
            (actives_best_original, 0.0, 1.0),
            (actives_best_rescored, 0.1, 0.9),
        ):
            score = get_score(row, bonds_glide, docked_score, coef1, coef2)
            # keep only the highest-scoring pose of each molecule
            if mol_id not in best or best[mol_id] < score:
                best[mol_id] = score

a_scores_glide_original = [float(best_score) for best_score in actives_best_original.values()]
print(a_scores_glide_original)

a_scores_glide_rescored = [float(best_score) for best_score in actives_best_rescored.values()]
print(a_scores_glide_rescored)

In [None]:
# decoys
# Best (highest) score per molecule over all of its docked Glide poses, computed
# for two weightings. Each CSV is read once and every row is scored twice.
# Assumes the files decoys_0.csv ... decoys_92.csv exist.
decoys_best_original = {}  # coef1=0.0, coef2=1.0: negated docking score only
decoys_best_rescored = {}  # coef1=0.1, coef2=0.9: IF term plus negated docking score
for x in range(93):
    df_decoys = pd.read_csv(f"../materials/actives_decoys/ifs_glide/decoys/decoys_{x}.csv")
    for _, row in df_decoys.iterrows():
        mol_id = row["mol_id"]
        docked_score = row["docked_score"]
        for best, coef1, coef2 in (
            (decoys_best_original, 0.0, 1.0),
            (decoys_best_rescored, 0.1, 0.9),
        ):
            score = get_score(row, bonds_glide, docked_score, coef1, coef2)
            # keep only the highest-scoring pose of each molecule
            if mol_id not in best or best[mol_id] < score:
                best[mol_id] = score

d_scores_glide_original = [float(best_score) for best_score in decoys_best_original.values()]
print(d_scores_glide_original)

d_scores_glide_rescored = [float(best_score) for best_score in decoys_best_rescored.values()]
print(d_scores_glide_rescored)

## Graph

In [None]:
plt.figure(0).clf()

# One entry per curve, in legend order:
# (legend name, scores of actives, scores of decoys, line colour).
roc_curves = [
    ("Vina-Orig", a_scores_vina_original, d_scores_vina_original, "#f0e442"),
    ("Vina-Resc", a_scores_vina_rescored, d_scores_vina_rescored, "#0072b2"),
    ("Glide-Orig", a_scores_glide_original, d_scores_glide_original, "#d55e00"),
    ("Glide-Resc", a_scores_glide_rescored, d_scores_glide_rescored, "#009e73"),
]

for name, active_scores, decoy_scores, color in roc_curves:
    # Actives are labelled 1 and decoys 0. The scores are passed unchanged:
    # scikit-learn treats a larger score as stronger evidence for label 1.
    total_scores = active_scores + decoy_scores
    labels = [1] * len(active_scores) + [0] * len(decoy_scores)
    auc = round(roc_auc_score(labels, total_scores), 3)
    fpr, tpr, _ = roc_curve(labels, total_scores)
    plt.plot(fpr, tpr, label=f"{name}, AUC = {auc}", color=color)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], label="random", color="#808080")

# The axes hold fractions in [0, 1]; the tick labels show them as percentages.
tick_values_x = [1e-2, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0]
tick_labels_x = [f"{100 * x:.4g}" for x in tick_values_x]
plt.xticks(tick_values_x, tick_labels_x)

tick_values_y = [1e-2, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0]
tick_labels_y = [f"{100 * x:.4g}" for x in tick_values_y]
plt.yticks(tick_values_y, tick_labels_y)

plt.grid()

plt.xlabel("FPR (%)")
plt.ylabel("TPR (%)")
plt.title("ROC curve")
plt.legend()
plt.show()



In [None]:
# Settings for the enrichment-factor (EF) version of the ROC plot in this cell.
percent = 0.01  # fraction of the score-ranked molecules passed to get_ef()
rounding = 4  # number of decimals shown for the EF values in the legend
plt.figure(0).clf()  # clear figure 0 before drawing the new curves

def get_ef(scores, labels, percent):
    """Return the enrichment factor (EF) of the top-scoring fraction.

    Molecules are ranked by descending score. The EF is the share of actives
    (label ``1``) among the top ``int(len(scores) * percent)`` molecules
    divided by the share of actives in the whole set.

    Args:
        scores: Sequence of numeric scores, one per molecule; larger ranks first.
        labels: Sequence parallel to ``scores``; entries equal to ``1`` count
            as actives.
        percent: Fraction of the ranked list to inspect, e.g. ``0.01`` for the
            top 1 %.

    Returns:
        The enrichment factor as a float.

    Raises:
        ValueError: If the requested fraction selects no molecule, or if
            ``labels`` contains no active; the EF is undefined in both cases.
    """
    scores = np.asarray(scores)
    labels = np.asarray(labels)
    # argsort is ascending; reverse it so the highest score comes first.
    ranked_labels = labels[np.argsort(scores)[::-1]]

    n_total = len(ranked_labels)
    n_actives = int(np.count_nonzero(ranked_labels == 1))

    # int() truncates, so a small data set or fraction can select nothing.
    n_top = int(n_total * percent)
    if n_top < 1:
        raise ValueError(
            f"top fraction {percent} of {n_total} molecules selects no molecule"
        )
    if n_actives == 0:
        raise ValueError("labels contain no actives; enrichment factor is undefined")

    n_top_actives = int(np.count_nonzero(ranked_labels[:n_top] == 1))
    return (n_top_actives / n_top) / (n_actives / n_total)


# One entry per curve, in legend order:
# (legend name, scores of actives, scores of decoys, line colour).
ef_curves = [
    ("Vina-Orig", a_scores_vina_original, d_scores_vina_original, "#f0e442"),
    ("Vina-Resc", a_scores_vina_rescored, d_scores_vina_rescored, "#0072b2"),
    ("Glide-Orig", a_scores_glide_original, d_scores_glide_original, "#d55e00"),
    ("Glide-Resc", a_scores_glide_rescored, d_scores_glide_rescored, "#009e73"),
]

for name, active_scores, decoy_scores, color in ef_curves:
    # Actives are labelled 1 and decoys 0. The scores are passed unchanged:
    # scikit-learn treats a larger score as stronger evidence for label 1.
    total_scores = active_scores + decoy_scores
    labels = [1] * len(active_scores) + [0] * len(decoy_scores)
    fpr, tpr, _ = roc_curve(labels, total_scores)

    ef = get_ef(total_scores, labels, percent)
    # ":g" avoids the truncation of int(percent * 100), e.g. 0.29 -> "28".
    plt.plot(
        fpr,
        tpr,
        label=f"{name}, EF({100 * percent:g}%) = {round(ef, rounding)}",
        color=color,
    )

# Diagonal reference line.
plt.plot([0, 1], [0, 1], label="random", color="#808080")

# The axes hold fractions in [0, 1]; the tick labels show them as percentages.
tick_values_x = [1e-2, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0]
tick_labels_x = [f"{100 * x:.4g}" for x in tick_values_x]
plt.xticks(tick_values_x, tick_labels_x)

tick_values_y = [1e-2, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0]
tick_labels_y = [f"{100 * x:.4g}" for x in tick_values_y]
plt.yticks(tick_values_y, tick_labels_y)

plt.grid()

# Zoom into the low-FPR corner: x_min, x_max, y_min, y_max
plt.axis([-0.01, 0.2, 0.0, 0.5])

plt.xlabel("FPR (%)")
plt.ylabel("TPR (%)")
plt.title("ROC curve")
plt.legend()
plt.show()

