In [29]:
import os
import subprocess
import sys

import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from rdkit import Chem
from rdkit import DataStructs
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [30]:
# ---- Path configuration: every entry below must be set correctly before running ----

# chemprop checkout (its predict.py is invoked further down) and the trained
# checkpoint directory that is passed to it via --checkpoint_dir.
chemprop_path = '/data/rsg/chemistry/cbilod/chemprop'
fold_path = '/data/rsg/chemistry/cbilod/chemprop/checkpoints/NoFeat_OneMol/sol_noopt-1597338722/fold_0'

# Input data file (not referenced by the cells visible in this notebook section).
data_path = 'data/solvation/bottom.txt'

# Folder holding the results file to evaluate; intermediate CSVs are written here too.
results_folder = 'checkpoints/solvation/solvation_1600177098/results_bottom'
results_path = os.path.join(results_folder, 'results.csv')

In [31]:
# Read the space-separated, header-less results file, write each of its first two
# columns to its own single-column CSV, and run chemprop's predict.py on each one.
results = pd.read_csv(results_path, sep=' ', header=None)

for col_idx, col_name in ((0, 'col1'), (1, 'col2')):
    smiles_csv = os.path.join(results_folder, col_name + '.csv')
    preds_csv = os.path.join(results_folder, 'preds_' + col_name + '.csv')

    # to_csv writes the integer column label ("0" / "1") as the header line.
    # NOTE(review): the loading cell renames columns "0"/"1", so predict.py is
    # presumably expected to carry that header through -- confirm.
    results[col_idx].to_csv(smiles_csv, index=False)

    # Argument list instead of a concatenated shell string: paths containing
    # spaces or shell metacharacters are passed through verbatim.
    # check=True raises CalledProcessError on a non-zero exit status, so a failed
    # prediction run can no longer leave stale preds_*.csv files to be read later.
    subprocess.run(
        [
            'python', chemprop_path + '/predict.py',
            '--test_path', smiles_csv,
            '--checkpoint_dir', fold_path,
            '--preds_path', preds_csv,
        ],
        check=True,
    )

0

In [32]:
# Load the two prediction files and give their columns side-specific names
# ("0"/"1" -> Mol1/Mol2, "Solubility" -> Sol1/Sol2).
preds1 = pd.read_csv(os.path.join(results_folder, 'preds_col1.csv')).rename(
    columns={"0": "Mol1", "Solubility": "Sol1"}
)
preds2 = pd.read_csv(os.path.join(results_folder, 'preds_col2.csv')).rename(
    columns={"1": "Mol2", "Solubility": "Sol2"}
)

# Column-wise concatenation: rows are matched on the index of each frame.
preds_tot = pd.concat([preds1, preds2], axis=1)

In [33]:
# Statistics
def avg_improvement(df):
    """Return the mean of ``Sol2 - Sol1`` over all rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``Sol1`` and ``Sol2`` columns.

    Returns
    -------
    float
        Mean row-wise difference; positive when ``Sol2`` is larger on average.
    """
    delta = df['Sol2'] - df['Sol1']
    return np.mean(delta)
def percent_improved(df):
    """Return the percentage of rows of ``df`` where ``Sol2`` exceeds ``Sol1``.

    Fix: the statistic is now computed from the ``df`` argument. Previously the
    parameter was ignored and the module-level ``preds_tot`` frame was read
    instead, so calling this on any other frame gave the wrong answer (or a
    ``NameError`` when ``preds_tot`` was not defined).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``Sol1`` and ``Sol2`` columns.

    Returns
    -------
    float
        Value in [0, 100]; ``nan`` for an empty frame. Rows whose difference is
        NaN count as not improved (the comparison is False).
    """
    delta = df['Sol2'] - df['Sol1']
    # Strict comparison: a zero difference is not an improvement.
    return (delta > 0).mean() * 100
def percent_improved_mae(df, mae=0.788):
    """Return the percentage of rows where ``Sol2 - Sol1`` is strictly above ``mae``.

    Fix: the statistic is now computed from the ``df`` argument. Previously the
    parameter was ignored and the module-level ``preds_tot`` frame was read
    instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``Sol1`` and ``Sol2`` columns.
    mae : float, optional
        Threshold the difference must exceed.
        NOTE(review): the default 0.788 is presumably the property predictor's
        mean absolute error -- confirm its origin.

    Returns
    -------
    float
        Value in [0, 100]; ``nan`` for an empty frame.
    """
    delta = df['Sol2'] - df['Sol1']
    # Strict comparison: a difference exactly equal to ``mae`` does not count.
    return (delta > mae).mean() * 100
def avg_tanimoto(df):
    """Return the mean fingerprint similarity between ``Mol1`` and ``Mol2`` per row.

    Each SMILES string is parsed with ``Chem.MolFromSmiles``, fingerprinted with
    ``Chem.RDKFingerprint``, and the row's pair is compared using
    ``DataStructs.FingerprintSimilarity`` with its default metric (Tanimoto
    according to the RDKit documentation).

    Fix: the pairs are now taken from the ``df`` argument. Previously the
    parameter was ignored and the module-level ``preds_tot`` frame was read
    instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Mol1`` and ``Mol2`` columns of SMILES strings.

    Returns
    -------
    float
        Mean similarity over all rows; ``nan`` for an empty frame.

    NOTE(review): no validity check is done on the parsed molecules; a SMILES
    string RDKit cannot parse is expected to make the fingerprint call fail.
    """
    similarities = []
    for smiles_a, smiles_b in zip(df['Mol1'].values, df['Mol2'].values):
        fp_a = Chem.RDKFingerprint(Chem.MolFromSmiles(smiles_a))
        fp_b = Chem.RDKFingerprint(Chem.MolFromSmiles(smiles_b))
        similarities.append(DataStructs.FingerprintSimilarity(fp_a, fp_b))
    return np.mean(similarities)

In [34]:
# Print one line per summary statistic, each evaluated on the combined frame.
print('-------Summary Statistics-------')
summary_rows = (
    ('Average Improvement: {}', avg_improvement),
    ('Percent Improved: {}%', percent_improved),
    ('Percent Improved Above MAE: {}%', percent_improved_mae),
    ('Average Tanimoto: {}', avg_tanimoto),
)
for template, statistic in summary_rows:
    print(template.format(statistic(preds_tot)))

-------Summary Statistics-------
Average Improvement: 2.6364552373599692
Percent Improved: 83.6%
Percent Improved Above MAE: 81.39999999999999%
Average Tanimoto: 0.25807367296254335
