In [1]:
import os
import sys
# Make the project-local `ml_part` package importable: os.path.dirname() strips the
# trailing `ml_part` component, so the directory that contains it is put first on sys.path.
# The path is a raw string because it contains backslashes; as a normal literal,
# sequences such as `\w`, `\D`, `\m` and `\X` are invalid escapes (a warning today,
# slated to become an error in a future Python release).
sys.path.insert(0, os.path.dirname(r'C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part'))

import pandas as pd

from ml_part.random_forest.data_prep.preparation import DataPreparation
from ml_part.random_forest.train import RFTrain

# Input artefacts: the feature CSV handed to DataPreparation and the pickle file
# that is later passed to RFTrain as `smiles_filepath`.
CSV_PATH = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\remained_features_pKa_07.05_v5_canonical_smiles.csv'
smiles_filepath = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\smiles_to_index_canon_smiles.pkl'

dataPreparation = DataPreparation(CSV_PATH)

# Passed through as `outliers_features_to_skip`; presumably these columns are
# exempt from outlier removal -- confirm against DataPreparation.
outliers_features_to_skip = ['dipole_moment', 'distance_between_atoms_in_f_group_centers']

# All preparation switches gathered in one mapping so they can be read at a glance.
rf_preparation_options = dict(
    is_pKa=True,
    use_mandatory_features=True,
    is_remove_outliers=True,
    is_remove_nan=False,
    outliers_features_to_skip=outliers_features_to_skip,
    is_convert_angles_to_category=True,
)
X, y = dataPreparation.prepare_data_for_RF(**rf_preparation_options)

# Columns to discard by explicit name.
# NOTE(review): 'naHRing', 'nFaRing' and 'nFaHRing' differ in letter case from the
# names in the column list printed in this cell's output ('nAHRing', 'nFARing',
# 'nFAHRing'). Those columns are still removed by the "ring" rule below -- confirm
# which spelling was intended.
correlated_features = ['f_atom_fraction', 'naHRing', 'nFaRing', 'nFaHRing', 'tpsa+f']
# Ring-count columns that must survive the "ring" rule.
ring_features_to_remain = ['nFRing', 'nHRing', 'nARing', 'nFHRing']

features_to_drop = []
for feature_name in X.columns:
    listed_as_correlated = feature_name in correlated_features
    unwanted_ring_feature = (
        "ring" in feature_name.lower()
        and feature_name not in ring_features_to_remain
    )
    if listed_as_correlated or unwanted_ring_feature:
        features_to_drop.append(feature_name)
        continue

    # A column holding a single distinct value (NaN counts as a value for
    # `unique()`) is reported and dropped as well.
    if len(X[feature_name].unique()) == 1:
        print(f"feature without unique values: {feature_name}")
        features_to_drop.append(feature_name)

    # Disabled rule, kept for reference:
    # elif "angle" in feature_name or "distance" in feature_name:
    #     features_to_drop.append(feature_name)

X = X.drop(columns=features_to_drop)

# Build the trainer; the train/test partitions are read from its attributes below.
rf_train = RFTrain(
    X=X,
    y=y,
    smiles_column_name='Smiles',
    smiles_filepath=smiles_filepath,
    is_pKa=True,
    k_folds=2,
)

# Expose the partitions held by the trainer as notebook-level names.
y_train, X_train = rf_train.y_train, rf_train.X_train
y_test, X_test = rf_train.y_test, rf_train.X_test

# Put features and target side by side (column-wise concatenation).
train_df = pd.concat([X_train, y_train], axis="columns")
test_df = pd.concat([X_test, y_test], axis="columns")

# Row counts of the two partitions.
print(train_df.shape[0], test_df.shape[0])


True
174
['f_to_fg', 'chirality', 'tpsa+f', 'angle_R2X2R1', 'nFARing', 'nFAHRing', 'TASA', 'nC', 'nO', 'nHRing', 'angle_X2X1R1', 'dipole_moment', 'nF', 'PPSA5', 'angle_R1X1R2', 'RPCS', 'f_atom_fraction', 'PBF', 'nFRing', 'avg_atoms_in_cycle', 'FPSA3', 'identificator', 'nARing', 'PNSA5', 'f_freedom', 'sasa', 'nN', 'GeomShapeIndex', 'angle_X1X2R2', 'mol_num_cycles', 'naRing', 'distance_between_atoms_in_cycle_and_f_group', 'distance_between_atoms_in_f_group_centers', 'mol_weight', 'nAHRing', 'cis/trans', 'dihedral_angle', 'nFHRing', 'pKa', 'logP']
f_atom_fraction outliers indexes: [117, 118, 130]
PBF outliers indexes: [39, 120]
PNSA5 outliers indexes: [36, 37]
sasa outliers indexes: [120]
mol_weight outliers indexes: [120]
logP outliers indexes: [81, 82]
Remains rows:159, amount of features: 40
133 26


In [2]:
import pickle

# Same pickle file as the `smiles_filepath` assigned in the first cell.
smiles_filepath = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\smiles_to_index_canon_smiles.pkl'

# NOTE(review): unpickling can execute arbitrary code -- only load files from a
# trusted source.
with open(smiles_filepath, mode='rb') as handle:
    smiles_to_index = pickle.loads(handle.read())

def smiles_by_index(index, smiles_to_index):
    """Reverse lookup: find the key of ``smiles_to_index`` stored under ``index``.

    Args:
        index: Value to search for among the mapping's values.
        smiles_to_index: Mapping whose keys are returned and whose values are
            compared against ``index`` with ``==``.

    Returns:
        The first key (in the mapping's iteration order) whose value equals
        ``index``, or ``None`` when no value matches.
    """
    matching_smiles = (
        smiles
        for smiles, index_ in smiles_to_index.items()
        if index_ == index
    )
    return next(matching_smiles, None)

test acid

In [31]:
# Test rows whose `identificator` equals 0 (the "acid" subset, per the heading of this section).
test_df_pKa_acid = test_df[test_df['identificator'].eq(0)]

# Every column except the target `pKa` is removed.
all_features = test_df_pKa_acid.columns.drop('pKa')
test_df_acid_for_gnn = test_df_pKa_acid.drop(columns=all_features)

# Translate each row label into its SMILES string; a label with no entry in
# `smiles_to_index` yields None.
Smiles = [
    smiles_by_index(row_label, smiles_to_index)
    for row_label in test_df_acid_for_gnn.index
]
test_df_acid_for_gnn['Smiles'] = Smiles

In [33]:
# Write the acid test subset to disk; no `index` argument is given, so pandas'
# default of writing the row index as the first column applies.
test_df_acid_for_gnn.to_csv(
    path_or_buf=r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_basicity_data\gnn_cv_canon_smiles\test_acid.csv'
)

train acid

In [9]:
# Training rows whose `identificator` equals 0 (the "acid" subset, per the heading of this section).
train_df_pKa_acid = train_df[train_df['identificator'].eq(0)]

# Only the target `pKa` and the `fold_id` column are retained.
all_features = train_df_pKa_acid.columns.drop(['pKa', 'fold_id'])
train_df_acid_for_gnn = train_df_pKa_acid.drop(columns=all_features)

# Translate each row label into its SMILES string; a label with no entry in
# `smiles_to_index` yields None.
Smiles = [
    smiles_by_index(row_label, smiles_to_index)
    for row_label in train_df_acid_for_gnn.index
]
train_df_acid_for_gnn['Smiles'] = Smiles

In [11]:
# Write the acid training subset to disk; no `index` argument is given, so pandas'
# default of writing the row index as the first column applies.
train_df_acid_for_gnn.to_csv(
    path_or_buf=r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_basicity_data\gnn_cv_canon_smiles\train_acid.csv'
)

test amine

In [38]:
# Test rows whose `identificator` is anything other than 0 (the "amine" subset,
# per the heading of this section).
test_df_pKa_amine = test_df[test_df['identificator'].ne(0)]

# Every column except the target `pKa` is removed.
all_features = test_df_pKa_amine.columns.drop('pKa')
test_df_amine_for_gnn = test_df_pKa_amine.drop(columns=all_features)

# Translate each row label into its SMILES string; a label with no entry in
# `smiles_to_index` yields None.
Smiles = [
    smiles_by_index(row_label, smiles_to_index)
    for row_label in test_df_amine_for_gnn.index
]
test_df_amine_for_gnn['Smiles'] = Smiles

In [41]:
# Write the amine test subset to disk (the file is named `test_basic.csv`); no
# `index` argument is given, so pandas' default of writing the row index applies.
test_df_amine_for_gnn.to_csv(
    path_or_buf=r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_basicity_data\gnn_cv_canon_smiles\test_basic.csv'
)

train amine

In [12]:
# Training rows whose `identificator` is anything other than 0 (the "amine"
# subset, per the heading of this section).
train_df_pKa_amine = train_df[train_df['identificator'].ne(0)]

# Only the target `pKa` and the `fold_id` column are retained.
all_features = train_df_pKa_amine.columns.drop(['pKa', 'fold_id'])
train_df_amine_for_gnn = train_df_pKa_amine.drop(columns=all_features)

# Translate each row label into its SMILES string; a label with no entry in
# `smiles_to_index` yields None.
Smiles = [
    smiles_by_index(row_label, smiles_to_index)
    for row_label in train_df_amine_for_gnn.index
]
train_df_amine_for_gnn['Smiles'] = Smiles

In [17]:
# Write the amine training subset to disk (the file is named `train_basic.csv`); no
# `index` argument is given, so pandas' default of writing the row index applies.
train_df_amine_for_gnn.to_csv(
    path_or_buf=r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\pKa_basicity_data\gnn_cv_canon_smiles\train_basic.csv'
)