In [1]:
# import packages
import pandas as pd
from scipy import stats

In [2]:
# Load the ROBERT PREDICT csv files (test / validation / training splits) from
# the DFT and the Semi-Empirical result folders.
dft_test_df = pd.read_csv('../Data/ROBERT_Results/DFT/PREDICT/MVL_85_test_No_PFI.csv')
dft_valid_df = pd.read_csv('../Data/ROBERT_Results/DFT/PREDICT/MVL_85_valid_No_PFI.csv')
dft_train_df = pd.read_csv('../Data/ROBERT_Results/DFT/PREDICT/MVL_85_train_No_PFI.csv')

# Each variable reads the file whose split name matches its own name
# (the valid/train paths were crossed for the Semi-Empirical set).
aqme_test_df = pd.read_csv('../Data/ROBERT_Results/Semi-Empirical/PREDICT/MVL_85_test_No_PFI.csv')
aqme_valid_df = pd.read_csv('../Data/ROBERT_Results/Semi-Empirical/PREDICT/MVL_85_valid_No_PFI.csv')
aqme_train_df = pd.read_csv('../Data/ROBERT_Results/Semi-Empirical/PREDICT/MVL_85_train_No_PFI.csv')

In [3]:
# Stack the test, training, and validation splits into one frame per data set,
# then order the rows by that frame's identifier column
# ('Cofactor' for the DFT frame, 'code_name' for the Semi-Empirical frame).
dft_df = (
    pd.concat([dft_test_df, dft_train_df, dft_valid_df])
    .sort_values(by=['Cofactor'], ascending=True)
)
aqme_df = (
    pd.concat([aqme_test_df, aqme_train_df, aqme_valid_df])
    .sort_values(by=['code_name'], ascending=True)
)

In [4]:
# Start from the DFT frame: keep the identifier, the dG_C5 column, and its
# prediction column, and give the two value columns descriptive names.
dG_db = dft_df[['Cofactor', 'dG_C5', 'dG_C5_pred']]
dG_df = dG_db.rename(
    columns={
        "dG_C5": "Calculated dG_C5",
        "dG_C5_pred": "DFT Predicted dG_C5",
    }
)

# Duplicate the Semi-Empirical predictions under their own column name, then
# attach them to dG_df as a plain list. A list is matched by row position, not
# by index label, so this relies on both frames being sorted into the same
# row order with the same number of rows.
semi_empirical_col = 'Semi-Empirical Predicted dG_C5'
aqme_df[semi_empirical_col] = aqme_df['dG_C5_pred']
aqme_preds = aqme_df[semi_empirical_col].tolist()
dG_df[semi_empirical_col] = aqme_preds

In [5]:
# Build the labels for our naming scheme: every combination of one entry from
# r1, one from r2, and one from r3, concatenated in that order. r3 cycles
# fastest, then r2, then r1; the listed (non-alphabetical) orders are kept.
r1 = ['A', 'B', 'D', 'C', 'E', 'J', 'F', 'G', 'H', 'K', 'I']
r2 = ['1', '2', '5', '3', '4', '6']
r3 = ['a', 'b']

names = [
    first + second + third
    for first in r1
    for second in r2
    for third in r3
]

# Swap the original 'Cofactor' values for the generated labels. The labels are
# attached by row position, and the new 'Cofactor' column ends up last.
dG_df_named = dG_df.drop(columns=['Cofactor']).assign(Cofactor=names)

In [6]:
# Spearman rank correlation of each prediction column against the
# 'Calculated dG_C5' column.
calculated_dG = dG_df['Calculated dG_C5']

dft_pred_corr = stats.spearmanr(dG_df['DFT Predicted dG_C5'], calculated_dG)
aqme_pred_corr = stats.spearmanr(dG_df['Semi-Empirical Predicted dG_C5'], calculated_dG)

print(f'DFT Model: {dft_pred_corr}')
print(f'Semi-Empirical Model: {aqme_pred_corr}')

DFT Model: SignificanceResult(statistic=0.97378595691171, pvalue=1.7976473699451722e-85)
Semi-Empirical Model: SignificanceResult(statistic=0.9145956494289156, pvalue=5.8005918422643295e-53)


In [8]:
# Spearman correlation between hand-entered rank lists: the "pred" ranking is
# compared with the "exp" ranking for each of the two sets (Tan and Nowak).
# NOTE(review): the *_names and *_dft_rank lists are defined here but are not
# used by anything in this cell - confirm whether a DFT-vs-experiment
# correlation was also intended.
# NOTE(review): these samples have only 5 and 4 points; treat the reported
# p-values with caution (see the scipy.stats.spearmanr notes on small samples).

# --- Tan set ---
tan_names = ['D1a', 'B1a', 'B1b', 'F1a', 'F1b']
tan_exp_rank = [2, 1, 4, 3, 5]
tan_dft_rank = [2, 1, 4, 3, 5]
tan_pred_rank = [2, 1, 4, 3, 5]
tan_spearman = stats.spearmanr(tan_pred_rank, tan_exp_rank)

# --- Nowak set ---
nowak_names = ['A1a', 'F1a', 'G1a', 'H1a']
nowak_exp_rank = [1, 3, 4, 2]
nowak_dft_rank = [2, 4, 1, 3]
nowak_pred_rank = [2, 4, 3, 1]
nowak_spearman = stats.spearmanr(nowak_pred_rank, nowak_exp_rank)

print(f'Tan: {tan_spearman}')
print(f'Nowak: {nowak_spearman}')

Tan: SignificanceResult(statistic=0.9999999999999999, pvalue=1.4042654220543672e-24)
Nowak: SignificanceResult(statistic=0.6000000000000001, pvalue=0.3999999999999999)
