In [15]:
import scipy.stats
from sklearn.metrics import mean_squared_error
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Read the stored regression predictions; the first CSV column is used as the row index.
regression_results = pd.read_csv(
    "../results/regr_results_fingerprints_plus_feat.csv",
    index_col=0,
)
regression_results

Unnamed: 0,name_short,adduct,matrix,polarity,regressor,observed_value,prediction
0,2-Oxoglutaric acid,+Cl,9AA,negative,Lin_reg,0.000000,8.921198e+14
1,3-Hydroxymethylglutaric acid,+Cl,9AA,negative,Lin_reg,0.000000,9.770028e+17
2,3-Phosphoglyceric acid,+Cl,9AA,negative,Lin_reg,0.000000,-2.075961e+17
3,"4,5-Dihydroorotic acid",+Cl,9AA,negative,Lin_reg,160.795550,-5.418741e+18
4,4-Hydroxyproline,+Cl,9AA,negative,Lin_reg,46.253326,4.089315e+18
...,...,...,...,...,...,...,...
212935,Urocanic acid,[M]-,pNA,positive,GaussianProcess,0.000000,3.578398e+03
212936,Valine,[M]-,pNA,positive,GaussianProcess,0.000000,3.793842e+03
212937,Xanthine,[M]-,pNA,positive,GaussianProcess,0.000000,3.498633e+03
212938,alpha-tocopherol,[M]-,pNA,positive,GaussianProcess,0.000000,3.863232e+03


In [13]:
# Compute Spearman's/Pearson's correlation and the root mean squared error (RMSE)
# for each matrix/polarity/regressor combination.
metric_columns = ['matrix', 'polarity', 'regressor', "Spearman's R",
                  'S pval', "Pearson's R", 'P pval', 'RMSE', 'RMSE/std', 'non-zero obs']
metric_records = []
for (matrix, polarity, regressor), rows in regression_results.groupby(['matrix', 'polarity', 'regressor']):
    # remove zero intensity molecules if needed (optional)
    rows = rows[rows['observed_value'] != 0]

    # Correlations and the standard deviation need at least two observations
    # (scipy.stats.pearsonr rejects shorter input), so smaller groups are left out
    # instead of aborting the whole loop.
    if len(rows) < 2:
        continue

    spearman_r, spearman_pval = scipy.stats.spearmanr(rows.observed_value, rows.prediction)
    pearson_r, pearson_pval = scipy.stats.pearsonr(rows.observed_value, rows.prediction)

    # Take the square root explicitly: the `squared=False` keyword is not accepted
    # by recent scikit-learn releases, whereas this form works on every version.
    rmse = mean_squared_error(rows.observed_value, rows.prediction) ** 0.5
    # RMSE relative to the spread of the observed values.
    rmse_std = rmse / rows['observed_value'].std()

    # `rows` is already restricted to non-zero observations, so its length is the count.
    metric_records.append([matrix, polarity, regressor, spearman_r, spearman_pval,
                           pearson_r, pearson_pval, rmse, rmse_std, len(rows)])

# Build the frame once from the collected records instead of enlarging it row by row;
# the default integer index numbers the rows 0..n-1 in group order.
regression_metrics = pd.DataFrame(metric_records, columns=metric_columns)

In [14]:
# Pick, for every matrix/polarity pair, the regressor that scores best.
per_condition = regression_metrics.groupby(['matrix', 'polarity'])

# Row labels of the lowest RMSE/std within each pair.
lowest_error_idx = per_condition["RMSE/std"].idxmin()
best_RMSE = regression_metrics.loc[lowest_error_idx]

# Row labels of the highest Spearman's R within each pair, listed strongest first.
highest_rank_idx = per_condition["Spearman's R"].idxmax()
best_spear = regression_metrics.loc[highest_rank_idx].sort_values("Spearman's R", ascending=False)
best_spear

Unnamed: 0,matrix,polarity,regressor,Spearman's R,S pval,Pearson's R,P pval,RMSE,RMSE/std,non-zero obs
5,9AA,negative,RandomForest,0.692942,3.219078e-25,0.530198,1.723145e-13,148592.854824,0.86031,167
158,NOR,positive,RandomForest,0.580814,8.211518e-16,0.227513,0.003811961,57574.114521,1.067602,160
14,9AA,positive,RandomForest,0.57214,2.731983e-10,0.256327,0.008962025,105883.817809,1.05487,103
149,NOR,negative,RandomForest,0.565692,5.598231e-14,0.104896,0.2029631,65607.306746,1.17638,149
61,ClCCA,negative,SVR_poly,0.549635,3.079092e-06,0.300261,0.01680353,2359.122166,1.073378,63
140,NEDC,positive,RandomForest,0.542057,1.443889e-07,-0.002423,0.9827601,10113.246541,1.102239,82
41,CMBT,negative,RandomForest,0.537216,1.642883e-05,0.237864,0.07479523,4338.398374,1.024182,57
72,DAN,negative,DecisionTree,0.527756,1.711288e-12,0.144588,0.07265787,24225.584574,1.142047,155
86,DAN,positive,RandomForest,0.526301,2.742956e-16,0.340233,4.646221e-07,25459.089768,1.032479,209
115,DHB,negative,SVR_poly,0.504713,9.768259e-06,0.382905,0.001165261,13597.651925,1.12254,69
