In [3]:
import os
import sys
sys.path.insert(0, os.path.dirname('C:\work\DrugDiscovery\main_git\XAI_Chem\ml_part'))

import pandas as pd

from ml_part.random_forest.data_prep.preparation import DataPreparation
from ml_part.random_forest.train import RFTrain

CSV_PATH = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\remained_features_pKa_01.02_v2.csv'
smiles_filepath = r'C:\work\DrugDiscovery\main_git\XAI_Chem\data\updated_features\smiles_to_index.pkl'

dataPreparation = DataPreparation(CSV_PATH)

unimportant_features_to_drop = ['logP']
X, y = dataPreparation.prepare_data_for_RF(is_pKa=True,
                                           molecule_type="amine",
                                           use_mandatory_features=True,
                                           is_remove_outliers=True,
                                           is_remove_nan=False,
                                           outliers_features_to_skip=unimportant_features_to_drop,)

correlated_features = ['f_atom_fraction', 'naHRing', 'nFaRing', 'nFaHRing', 'tpsa+f']
features_to_drop = []
for feature_name in X.columns:
    if feature_name in correlated_features:
        features_to_drop.append(feature_name)

X = X.drop(features_to_drop, axis=1)

rf_train = RFTrain(X=X, 
                   y=y,
                   smiles_filepath=smiles_filepath,
                   is_pKa=True,
                   k_folds=2)

y_train = rf_train.y_train
X_train = rf_train.X_train

y_test = rf_train.y_test
X_test = rf_train.X_test

train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

print(len(train_df), len(test_df))

True
128
['avg_atoms_in_cycle', 'nFaHRing', 'chirality', 'PPSA5', 'tpsa+f', 'RPCS', 'mol_num_cycles', 'GeomShapeIndex', 'angle_R2X2R1', 'nN', 'distance_between_atoms_in_f_group_centers', 'nC', 'nFARing', 'angle_R1X1R2', 'f_freedom', 'naHRing', 'nFAHRing', 'cis/trans', 'dipole_moment', 'f_to_fg', 'identificator', 'f_atom_fraction', 'nFRing', 'nFaRing', 'naRing', 'nFHRing', 'PBF', 'nARing', 'nF', 'dihedral_angle', 'nAHRing', 'nO', 'TASA', 'angle_X2X1R1', 'mol_volume', 'FPSA3', 'PNSA5', 'angle_X1X2R2', 'nHRing', 'pKa', 'logP']
dipole_moment outliers indexes: [27]
f_atom_fraction outliers indexes: [69]
mol_volume outliers indexes: [72]
Remains rows:121, amount of features: 41
102 19


In [9]:
from factor_analyzer import FactorAnalyzer
from sklearn.preprocessing import StandardScaler
import pandas as pd
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns

merged_dataframe = pd.concat([train_df, test_df], axis=0)

# merged_dataframe.rename(columns={"amine_type": "identificator"}, inplace=True)

features_to_analyse = []
for feature_name in merged_dataframe.columns:
    if len(merged_dataframe[feature_name].unique()) > 1:
        features_to_analyse.append(feature_name)

features_to_drop = ['nFARing', 'nFAHRing', 'nFHRing', 'nAHRing', 'fold_id']

for feature_to_drop in features_to_drop:
    features_to_analyse.remove(feature_to_drop)

fa = FactorAnalyzer(rotation=None)


df_features = merged_dataframe[features_to_analyse]
df_features_darray = StandardScaler().fit_transform(df_features)
fa.fit(df_features_darray)

ev, _ = fa.get_eigenvalues()
for index, en_value in enumerate(ev):
    if en_value < 1:
        break
index += 1

n_factors = 7

fa = FactorAnalyzer(n_factors=n_factors, rotation="varimax")
fa.fit(df_features)

fa_load = pd.DataFrame(fa.loadings_,index=df_features.columns)
fa_load.style.background_gradient(cmap="coolwarm")

Unnamed: 0,0,1,2,3,4,5,6
avg_atoms_in_cycle,0.559514,-0.128207,0.009668,0.313833,-0.016495,0.492427,-0.194931
chirality,0.069613,0.029592,0.043625,-0.074205,0.880531,-0.0586,0.02037
PPSA5,0.181451,0.873914,-0.037794,-0.017925,-0.003019,0.270724,0.094578
RPCS,-0.25868,-0.692308,-0.044495,0.122418,0.146268,0.096657,0.188542
mol_num_cycles,0.10073,-0.138818,0.895782,0.218132,0.068037,-0.014289,-0.141784
GeomShapeIndex,-0.090977,0.280549,-0.059909,-0.342687,-0.020933,-0.177356,0.001055
angle_R2X2R1,0.945721,0.30417,0.024254,0.004577,0.054207,0.050896,0.0082
distance_between_atoms_in_f_group_centers,0.726726,0.281742,0.093868,0.11345,0.022852,0.044012,-0.044925
nC,0.168151,0.015929,0.766467,0.041017,-0.102299,0.5948,-0.120563
angle_R1X1R2,0.928906,0.308957,0.003559,0.147516,0.088919,-0.002026,0.020943


In [20]:
import pandas as pd
from factor_analyzer import ConfirmatoryFactorAnalyzer, ModelSpecificationParser

features = ["angle_R2X2R1", "angle_R1X1R2", "angle_X2X1R1", "angle_X1X2R2", "cis/trans", "chirality", "pKa"]

model_dict = {
    "F1": ["angle_R2X2R1", "angle_R1X1R2", "angle_X2X1R1", "angle_X1X2R2", "pKa"],
    "F2": ["cis/trans", "chirality", "pKa"]
}

df_features_temp = df_features[features]

model_spec = ModelSpecificationParser.parse_model_specification_from_dict(df_features_temp, model_dict)

cfa = ConfirmatoryFactorAnalyzer(model_spec, disp=False)

cfa.fit(df_features_temp)

print(cfa.loadings_)


[[ 2.83008973e+02  0.00000000e+00]
 [ 2.69448315e+02  0.00000000e+00]
 [ 2.94899535e+02  0.00000000e+00]
 [ 2.83366194e+02  0.00000000e+00]
 [ 3.81433366e-01 -1.67051589e+02]
 [ 0.00000000e+00 -2.74041755e+03]
 [ 0.00000000e+00 -2.41265654e+03]]


