In [1]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def calculate_metrics(true_values, pred_values):
    mse = round(mean_squared_error(true_values, pred_values),3)
    mae = round(mean_absolute_error(true_values, pred_values),3)
    r_score = round(r2_score(true_values, pred_values),3)

    return {"mse": mse,
            "mae": mae,
            "r^2": r_score,}

In [2]:
import pandas as pd

pKa_test_csv = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\csv_for_rulefit\test_pKa_v5_features_canon_smiles.csv'
pKa_train_csv = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\csv_for_rulefit\train_pKa_v5_features_canon_smiles.csv'

In [3]:
df_pka_train = pd.read_csv(pKa_train_csv, index_col=0)
df_pka_test = pd.read_csv(pKa_test_csv, index_col=0)

In [4]:
df_pka_test.columns

Index(['f_to_fg', 'chirality', 'angle_R2X2R1', 'TASA', 'nC', 'nO', 'nHRing',
       'angle_X2X1R1', 'dipole_moment', 'nF', 'PPSA5', 'angle_R1X1R2', 'RPCS',
       'PBF', 'nFRing', 'avg_atoms_in_cycle', 'FPSA3', 'identificator',
       'nARing', 'PNSA5', 'f_freedom', 'sasa', 'nN', 'GeomShapeIndex',
       'angle_X1X2R2', 'mol_num_cycles',
       'distance_between_atoms_in_cycle_and_f_group',
       'distance_between_atoms_in_f_group_centers', 'mol_weight', 'cis/trans',
       'dihedral_angle', 'nFHRing', 'no angle and distance', 'pKa'],
      dtype='object')

In [5]:
LOGP_FEATURES = ['f_freedom', 'PPSA5', 'mol_num_cycles', 'nFRing', 'nF', 'identificator',
                 'mol_weight', 'dipole_moment', 'nHRing', 'nO', 'PBF', 'nC', 'nARing',
                 'cis/trans', 'PNSA5', 'FPSA3', 'mol_volume', 'RPCS', 'GeomShapeIndex',
                 'WPSA5', 'TASA', 'f_to_fg', 'avg_atoms_in_cycle', 'nFHRing',
                 'chirality']

PKA_FEATURES = ['RPCS', 'angle_R1X1R2', 'angle_X2X1R1', 'angle_X1X2R2', 'angle_R2X2R1', 
                'distance_between_atoms_in_cycle_and_f_group', 
                'distance_between_atoms_in_f_group_centers', 'PBF', 'mol_weight', 'dipole_moment', 'PPSA5',
                'avg_atoms_in_cycle', 'nHRing', 'cis/trans', 'FPSA3', 'nF', 'chirality',
                'sasa', 'PNSA5', 'GeomShapeIndex', 'TASA', 'mol_num_cycles',
                'f_freedom', 'nFRing', 'identificator', 'nO', 'nARing', 'nC', 'nFHRing',
                'f_to_fg']

PKA_FEATURES_NOT_CATEGORICAL = [
    'RPCS', 'PBF', 'mol_weight', 'dipole_moment', 'PPSA5',
    'avg_atoms_in_cycle', 'FPSA3', 'chirality',
    'sasa', 'PNSA5', 'GeomShapeIndex', 'TASA', 
    'f_freedom', 'f_to_fg'
]

PKA_FEATURES_CATEGORICAL = [
    'angle_R1X1R2', 'angle_X2X1R1', 'angle_X1X2R2', 'angle_R2X2R1', 
    'distance_between_atoms_in_cycle_and_f_group', 
    'distance_between_atoms_in_f_group_centers', 'nHRing', 'cis/trans', 'nF', 'mol_num_cycles',
    'nFRing', 'identificator', 'nO', 'nARing', 'nC', 'nFHRing'
]

In [6]:
y_pka_train = df_pka_train[['pKa']]
y_pka_test = df_pka_test[['pKa']]

X_pka_train = df_pka_train[PKA_FEATURES]
X_pka_test = df_pka_test[PKA_FEATURES]

In [7]:
from sklearn import linear_model

regr = linear_model.LinearRegression()

regr.fit(X_pka_train, y_pka_train)

y_pred = regr.predict(X_pka_test)

calculate_metrics(y_pred, y_pka_test)

{'mse': 0.527, 'mae': 0.608, 'r^2': 0.891}

MinMaxScaler

In [8]:
X_pka_train_not_categorical = X_pka_train[PKA_FEATURES_NOT_CATEGORICAL]
X_pka_test_not_categorical = X_pka_test[PKA_FEATURES_NOT_CATEGORICAL]

In [9]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler().fit(X_pka_test_not_categorical)

X_pka_train_norm = min_max_scaler.transform(X_pka_train_not_categorical)
X_pka_test_norm = min_max_scaler.transform(X_pka_test_not_categorical)

y_pka_train_norm = y_pka_train.values
y_pka_test_norm = y_pka_test.values

In [10]:
df_train_norm = pd.DataFrame(X_pka_train_norm, columns=PKA_FEATURES_NOT_CATEGORICAL)
df_test_norm = pd.DataFrame(X_pka_test_norm, columns=PKA_FEATURES_NOT_CATEGORICAL)

for feature in PKA_FEATURES_CATEGORICAL:
    df_train_norm[feature] = X_pka_train[feature].values
    df_test_norm[feature] = X_pka_test[feature].values

df_train_norm_ = df_train_norm.copy()
df_test_norm_ = df_test_norm.copy()

df_train_norm['fold_id'] = df_pka_train.copy()['fold_id'].tolist()

df_train_norm['pKa'] = df_pka_train.copy()['pKa'].tolist()
df_test_norm['pKa'] = df_pka_test.copy()['pKa'].tolist()

In [11]:
import os

save_path = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKA_logP_minmaxscaler'

# df_train_norm.to_csv(os.path.join(save_path, 'train_pka_minmax.csv'))
# df_test_norm.to_csv(os.path.join(save_path, 'test_pka_minmax.csv'))

In [12]:
from sklearn import linear_model

regr = linear_model.LinearRegression()

regr.fit(df_train_norm_, y_pka_train_norm)

y_pred = regr.predict(df_test_norm_)

In [13]:
calculate_metrics(y_pred, y_pka_test_norm)

{'mse': 0.527, 'mae': 0.608, 'r^2': 0.891}

Standartization

In [15]:
X_pka_train_not_categorical = X_pka_train[PKA_FEATURES_NOT_CATEGORICAL]
X_pka_test_not_categorical = X_pka_test[PKA_FEATURES_NOT_CATEGORICAL]

In [16]:
from sklearn.preprocessing import StandardScaler

standart_scaler = StandardScaler().fit(X_pka_test_not_categorical)

X_pka_train_norm = standart_scaler.transform(X_pka_train_not_categorical)
X_pka_test_norm = standart_scaler.transform(X_pka_test_not_categorical)

y_pka_train_norm = y_pka_train.values
y_pka_test_norm = y_pka_test.values

In [17]:
df_train_norm = pd.DataFrame(X_pka_train_norm, columns=PKA_FEATURES_NOT_CATEGORICAL)
df_test_norm = pd.DataFrame(X_pka_test_norm, columns=PKA_FEATURES_NOT_CATEGORICAL)

for feature in PKA_FEATURES_CATEGORICAL:
    df_train_norm[feature] = X_pka_train[feature].values
    df_test_norm[feature] = X_pka_test[feature].values

df_train_norm_ = df_train_norm.copy()
df_test_norm_ = df_test_norm.copy()

df_train_norm['fold_id'] = df_pka_train.copy()['fold_id'].tolist()

df_train_norm['pKa'] = df_pka_train.copy()['pKa'].tolist()
df_test_norm['pKa'] = df_pka_test.copy()['pKa'].tolist()

In [19]:
import os

save_path = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_logP_minmaxscaler_canon_smiles\pKa'

df_train_norm.to_csv(os.path.join(save_path, 'train_pka_standart_scaler_non_categorizal_features_scaled.csv'))
df_test_norm.to_csv(os.path.join(save_path, 'test_pka_standart_scaler_non_categorizal_features_scaled.csv'))

In [20]:
from sklearn import linear_model

regr = linear_model.LinearRegression()

regr.fit(X_pka_train, y_pka_train_norm)

y_pred = regr.predict(X_pka_test)

calculate_metrics(y_pred, y_pka_test_norm)

{'mse': 0.527, 'mae': 0.608, 'r^2': 0.891}