In [1]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def calculate_metrics(true_values, pred_values):
    mse = round(mean_squared_error(true_values, pred_values),3)
    mae = round(mean_absolute_error(true_values, pred_values),3)
    r_score = round(r2_score(true_values, pred_values),3)

    return {"mse": mse,
            "mae": mae,
            "r^2": r_score,}

In [2]:
PKA_FEATURES = ['RPCS', 'angle_R1X1R2', 'angle_X2X1R1', 'angle_X1X2R2', 'angle_R2X2R1', 
                'distance_between_atoms_in_cycle_and_f_group', 
                'distance_between_atoms_in_f_group_centers', 'PBF', 'mol_weight', 'dipole_moment', 'PPSA5',
                'avg_atoms_in_cycle', 'nHRing', 'cis/trans', 'FPSA3', 'nF', 'chirality',
                'sasa', 'PNSA5', 'GeomShapeIndex', 'TASA', 'mol_num_cycles',
                'f_freedom', 'nFRing', 'identificator', 'nO', 'nARing', 'nC', 'nFHRing',
                'f_to_fg']

In [5]:
import pandas as pd

train = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKA_logP_minmaxscaler_canon_smiles\pKa\train_pka_minmax_only_non_categorizal_features_scaled.csv', index_col=0)

y = train['pKa']
X=train[PKA_FEATURES]

test = pd.read_csv(r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKA_logP_minmaxscaler_canon_smiles\pKa\test_pka_minmax_only_non_categorizal_features_scaled.csv', index_col=0)
y_test = test['pKa']
X_test = test[PKA_FEATURES]

In [7]:
cv_indices_dict = {0: [], 1: []}
index = 0
for _, row in train.iterrows():
    cv_indices_dict[row['fold_id']].append(index)
    index += 1
cv_indices = [[cv_indices_dict[0], cv_indices_dict[1]], [cv_indices_dict[1], cv_indices_dict[0]]]

In [8]:
from sklearn import linear_model

amount_of_cv = 2
for cv_number in range(amount_of_cv):
    cv_X_train = X.iloc[cv_indices[cv_number][0]]
    cv_y_train = y.iloc[cv_indices[cv_number][0]]

    cv_X_test = X.iloc[cv_indices[cv_number][1]]
    cv_y_test = y.iloc[cv_indices[cv_number][1]]

    regr = linear_model.LinearRegression()

    regr.fit(cv_X_train, cv_y_train)

    y_pred = regr.predict(cv_X_test)
    print("OOS:", calculate_metrics(y_pred, cv_y_test))

    y_pred = regr.predict(cv_X_train)
    print("Train:", calculate_metrics(y_pred, cv_y_train))

OOS: {'mse': 0.785, 'mae': 0.719, 'r^2': 0.879}
Train: {'mse': 0.288, 'mae': 0.438, 'r^2': 0.956}
OOS: {'mse': 0.797, 'mae': 0.718, 'r^2': 0.856}
Train: {'mse': 0.235, 'mae': 0.36, 'r^2': 0.962}


In [9]:
from sklearn import linear_model

regr = linear_model.LinearRegression()

regr.fit(X, y)

y_pred = regr.predict(X_test)

print("Test:", calculate_metrics(y_pred, y_test))

y_pred = regr.predict(X)
print("Train:", calculate_metrics(y_pred, y))

Test: {'mse': 0.527, 'mae': 0.608, 'r^2': 0.891}
Train: {'mse': 0.342, 'mae': 0.475, 'r^2': 0.945}
