In [37]:
import os
import sys
sys.path.insert(0, os.path.dirname('C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part'))

import pandas as pd

from ml_part.random_forest.data_prep.preparation import DataPreparation
from ml_part.random_forest.train import RFTrain

from hyperopt import space_eval

CSV_PATH = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\remained_features_pKa_08.02_v4_fixed_distances_chirality.csv'
smiles_filepath = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\smiles_to_index.pkl'

dataPreparation = DataPreparation(CSV_PATH)

unimportant_features_to_drop = ['dipole_moment']
X, y = dataPreparation.prepare_data_for_RF(is_pKa=True,
                                           use_mandatory_features=True,
                                           is_remove_outliers=True,
                                           is_remove_nan=False,
                                           outliers_features_to_skip=unimportant_features_to_drop)

correlated_features = ['f_atom_fraction', 'naHRing', 'nFaRing', 'nFaHRing', 'tpsa+f']
ring_features_to_remain = ['nFRing', 'nHRing', 'nARing', 'nFHRing']
features_to_drop = []
for feature_name in X.columns:
    if feature_name in correlated_features:
        features_to_drop.append(feature_name)
    elif "ring" in feature_name.lower() and feature_name not in ring_features_to_remain:
        features_to_drop.append(feature_name)
    elif len(X[feature_name].unique()) == 1:
        print(f"feature without unique values: {feature_name}")
        features_to_drop.append(feature_name)
    elif "angle" in feature_name or "distance" in feature_name:
        features_to_drop.append(feature_name)

X = X.drop(features_to_drop, axis=1)

rf_train = RFTrain(X=X, 
                   y=y,
                   smiles_filepath=smiles_filepath,
                   is_pKa=True,
                   k_folds=2)

True
183
['RPCS', 'PBF', 'mol_weight', 'dipole_moment', 'PPSA5', 'avg_atoms_in_cycle', 'nHRing', 'cis/trans', 'f_atom_fraction', 'dihedral_angle', 'FPSA3', 'distance_between_atoms_in_cycle_and_f_group', 'angle_X1X2R2', 'nF', 'angle_R1X1R2', 'nFAHRing', 'nAHRing', 'chirality', 'sasa', 'PNSA5', 'GeomShapeIndex', 'TASA', 'angle_R2X2R1', 'mol_num_cycles', 'naRing', 'nN', 'f_freedom', 'tpsa+f', 'nFRing', 'identificator', 'nO', 'distance_between_atoms_in_f_group_centers', 'angle_X2X1R1', 'nARing', 'nFARing', 'nC', 'nFHRing', 'f_to_fg', 'pKa', 'logP']
PBF outliers indexes: [40, 71, 127]
f_atom_fraction outliers indexes: [124]
FPSA3 outliers indexes: [40]
sasa outliers indexes: [127]
PNSA5 outliers indexes: [37, 38]
distance_between_atoms_in_f_group_centers outliers indexes: [35]
logP outliers indexes: [82, 83]
Remains rows:169, amount of features: 40


In [32]:
X_train = rf_train.X_train
y_train = rf_train.y_train

X_y = pd.concat([X_train, y_train], axis=1)
X_y

Unnamed: 0,RPCS,PBF,mol_weight,dipole_moment,PPSA5,avg_atoms_in_cycle,nHRing,cis/trans,FPSA3,nF,...,nFRing,identificator,nO,nARing,nC,nFHRing,f_to_fg,bin,fold_id,pKa
0,20.923542,0.599068,164.152,0.364516,14.226840,6.0,0,0,0.043079,2,...,0,0,2,1,7,0,2.0,1,0,4.32
1,16.720730,0.580702,128.171,0.253413,13.888098,6.0,0,0,0.046453,0,...,0,0,2,1,7,0,0.0,2,1,4.85
2,15.197764,0.812504,126.155,0.244237,14.422664,3.0,0,2,0.047023,0,...,1,0,2,2,7,0,0.0,2,1,4.35
3,17.778516,0.631131,126.155,0.246117,14.472009,3.0,0,1,0.047613,0,...,1,0,2,2,7,0,0.0,1,0,4.24
4,20.595646,0.796492,162.136,0.596781,14.920075,3.0,0,1,0.045186,2,...,1,0,2,2,7,0,2.0,0,0,3.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,4.833689,0.696740,107.104,0.528761,11.743145,3.0,0,1,0.042360,2,...,0,1,0,1,4,0,2.0,4,1,7.16
176,98.958912,0.552567,71.123,0.150657,8.670753,3.0,0,2,0.041147,0,...,0,1,0,1,4,0,0.0,7,1,9.44
177,78.177540,0.678788,71.123,0.145194,7.931815,3.0,0,1,0.035464,0,...,0,1,0,1,4,0,0.0,6,1,9.15
180,71.616824,0.603329,83.134,0.177919,8.165825,2.5,0,0,0.033184,0,...,0,1,0,2,5,0,0.0,6,0,9.41


In [46]:
X_train = rf_train.X_train
y_train = rf_train.y_train
for bin in sorted(X_train['bin'].unique()):
    rows_with_same_bin = X_train[X_train['bin'] == bin]

    y_values = y_train.loc[rows_with_same_bin.index]
    print(f"Bin number: {bin}")
    print(rows_with_same_bin['fold_id'].values)
    print(y_values.values)
    print("-"*30)

Bin number: 0
[0 1 1 0 1 1 0 1 0 0 0 1 0 0 1]
[3.8  3.76 3.66 3.08 2.9  3.83 3.25 3.62 3.76 3.81 3.78 3.59 3.87 3.76
 3.85]
------------------------------
Bin number: 1
[0 0 1 0 0 1 1 0 1 0 1 1 1 0]
[4.32 4.24 3.88 4.11 3.92 4.18 4.12 3.98 4.09 4.17 4.26 3.88 4.18 4.17]
------------------------------
Bin number: 2
[1 1 0 1 0 0 0 1 1 0 0 1 1 0]
[4.85 4.35 4.5  4.36 4.38 4.6  4.5  4.69 4.75 4.59 4.38 4.35 4.79 4.83]
------------------------------
Bin number: 3
[0 1 0 0 0 1 0 0 1 1 1 0 1 1 0]
[5.08 6.52 6.06 6.23 6.56 6.41 5.96 6.37 6.44 6.22 5.   6.62 6.41 6.06
 5.84]
------------------------------
Bin number: 4
[0 1 0 1 1 0 0 1 1 0 0 1 1 0]
[7.72 7.69 7.75 6.88 7.07 6.74 7.01 7.02 7.2  7.45 7.18 6.92 7.16 7.44]
------------------------------
Bin number: 5
[0 0 0 0 0 1 0 1 1 0 1 1 1 1]
[8.05 8.55 8.64 8.74 8.6  7.99 7.78 7.83 8.67 8.37 8.83 8.13 7.95 7.76]
------------------------------
Bin number: 6
[1 1 0 1 0 1 0 0 0 0 1 1 0 1 0]
[8.89 8.95 9.41 8.87 9.05 9.17 9.04 8.96 8.88 9.02 9.35 

--------

In [41]:

from statistics import mean, stdev
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn import linear_model
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
	 

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.