In [8]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def calculate_metrics(true_values, pred_values):
    mse = round(mean_squared_error(true_values, pred_values),3)
    mae = round(mean_absolute_error(true_values, pred_values),3)
    r_score = round(r2_score(true_values, pred_values),3)

    return {"mse": mse,
            "mae": mae,
            "r^2": r_score,}

In [1]:
PKA_FEATURES = ['RPCS', 'PBF', 'mol_weight', 'dipole_moment', 'PPSA5',
                'avg_atoms_in_cycle', 'nHRing', 'cis/trans', 'FPSA3', 'nF', 'chirality',
                'sasa', 'PNSA5', 'GeomShapeIndex', 'TASA', 'mol_num_cycles',
                'f_freedom', 'nFRing', 'identificator', 'nO', 'nARing', 'nC', 'nFHRing',
                'f_to_fg']

In [10]:
import pandas as pd

train = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKA_logP_minmaxscaler\pKa\train_pka_minmax.csv', index_col=0)

y = train['pKa']
X=train[PKA_FEATURES]

test = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKA_logP_minmaxscaler\pKa\test_pka_minmax.csv', index_col=0)
y_test = test['pKa']
X_test = test[PKA_FEATURES]

In [3]:
cv_indices_dict = {0: [], 1: []}
index = 0
for _, row in train.iterrows():
    cv_indices_dict[row['fold_id']].append(index)
    index += 1
cv_indices = [[cv_indices_dict[0], cv_indices_dict[1]], [cv_indices_dict[1], cv_indices_dict[0]]]

In [22]:
from sklearn import linear_model

amount_of_cv = 2
for cv_number in range(amount_of_cv):
    cv_X_train = X.iloc[cv_indices[cv_number][0]]
    cv_y_train = y.iloc[cv_indices[cv_number][0]]

    cv_X_test = X.iloc[cv_indices[cv_number][1]]
    cv_y_test = y.iloc[cv_indices[cv_number][1]]

    regr = linear_model.LinearRegression()

    regr.fit(cv_X_train, cv_y_train)

    y_pred = regr.predict(cv_X_test)
    print("OOS:", calculate_metrics(y_pred, cv_y_test))

    y_pred = regr.predict(cv_X_train)
    print("Train:", calculate_metrics(y_pred, cv_y_train))

OOS: {'mse': 0.712, 'mae': 0.628, 'r^2': 0.875}
Train: {'mse': 0.295, 'mae': 0.441, 'r^2': 0.953}
OOS: {'mse': 0.644, 'mae': 0.601, 'r^2': 0.911}
Train: {'mse': 0.261, 'mae': 0.379, 'r^2': 0.958}


In [20]:
from sklearn import linear_model

regr = linear_model.LinearRegression()

regr.fit(X, y)

y_pred = regr.predict(X_test)

print("Test:", calculate_metrics(y_pred, y_test))

y_pred = regr.predict(X)
print("Train:", calculate_metrics(y_pred, y))

Test: {'mse': 0.728, 'mae': 0.698, 'r^2': 0.887}
Train: {'mse': 0.342, 'mae': 0.464, 'r^2': 0.944}


logP

In [31]:
import pandas as pd

train = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKA_logP_minmaxscaler\logP\train_logp_minmax.csv', index_col=0)

y = train['logP']
X=train.copy()
X = X.drop(['fold_id', 'logP'], axis=1)

test = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKA_logP_minmaxscaler\logP\test_logp_minmax.csv', index_col=0)
y_test = test['logP']
X_test = test.copy()
X_test = X_test.drop(['logP'], axis=1)

In [33]:
cv_indices_dict = {0: [], 1: []}
index = 0
for _, row in train.iterrows():
    cv_indices_dict[row['fold_id']].append(index)
    index += 1
cv_indices = [[cv_indices_dict[0], cv_indices_dict[1]], [cv_indices_dict[1], cv_indices_dict[0]]]

In [39]:
from sklearn import linear_model

amount_of_cv = 2
r_score, mse, mae = 0, 0, 0
for cv_number in range(amount_of_cv):
    cv_X_train = X.iloc[cv_indices[cv_number][0]]
    cv_y_train = y.iloc[cv_indices[cv_number][0]]

    cv_X_test = X.iloc[cv_indices[cv_number][1]]
    cv_y_test = y.iloc[cv_indices[cv_number][1]]

    regr = linear_model.LinearRegression()

    regr.fit(cv_X_train, cv_y_train)

    y_pred = regr.predict(cv_X_test)
    print("OOS:", calculate_metrics(y_pred, cv_y_test))

    r_score += calculate_metrics(y_pred, cv_y_test)['r^2']
    mse += calculate_metrics(y_pred, cv_y_test)['mse']
    mae += calculate_metrics(y_pred, cv_y_test)['mae']

    y_pred = regr.predict(cv_X_train)
    print("Train:", calculate_metrics(y_pred, cv_y_train))

print(f"cv r^2: {r_score / 2}") 
print(f"cv mse: {mse / 2}") 
print(f"cv mae: {mae / 2}") 

OOS: {'mse': 0.061, 'mae': 0.189, 'r^2': 0.812}
Train: {'mse': 0.045, 'mae': 0.16, 'r^2': 0.846}
OOS: {'mse': 0.078, 'mae': 0.214, 'r^2': 0.742}
Train: {'mse': 0.017, 'mae': 0.103, 'r^2': 0.939}
cv r^2: 0.777
cv mse: 0.0695
cv mae: 0.2015


In [36]:
from sklearn import linear_model

regr = linear_model.LinearRegression()

regr.fit(X, y)

y_pred = regr.predict(X_test)

print("Test:", calculate_metrics(y_pred, y_test))

y_pred = regr.predict(X)
print("Train:", calculate_metrics(y_pred, y))

Test: {'mse': 0.025, 'mae': 0.119, 'r^2': 0.908}
Train: {'mse': 0.038, 'mae': 0.148, 'r^2': 0.865}
