In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import mean_squared_error, r2_score

### Corneal permeability
Loading dataset

In [3]:
df = pd.read_csv('corneal.csv')
df.head(5)

Unnamed: 0,SMILES,logPerm
0,CC1CC2C3CCC(C3(CC(C2(C4(C1=CC(=O)C=C4)C)F)O)C)...,5.135798
1,CC(C1=CC(=C(C=C1)C2=CC=CC=C2)F)C(=O)O,5.347108
2,CC(C1=CC(=C(C=C1)C2=CC=CC=C2)F)C(=O)N,5.393628
3,C(C(CO)O)O,3.806662
4,CC12CCC(=O)C=C1CCC3C2C(CC4(C3CCC4(C(=O)CO)O)C)O,4.442651


MACCS fingerprints

In [4]:
smiles = df['SMILES'].to_list()

In [5]:
mols = [Chem.MolFromSmiles(i) for i in smiles]
MACCS_list = []
header = ['bit' + str(i) for i in range(167)]
for i in range(len(mols)):
    ds = list(MACCSkeys.GenMACCSKeys(mols[i]).ToBitString())
    MACCS_list.append(ds)
df3 = pd.DataFrame(MACCS_list,columns=header)
df3.insert(loc=0, column='smiles', value=smiles)
df3.head(3)

Unnamed: 0,smiles,bit0,bit1,bit2,bit3,bit4,bit5,bit6,bit7,bit8,...,bit157,bit158,bit159,bit160,bit161,bit162,bit163,bit164,bit165,bit166
0,CC1CC2C3CCC(C3(CC(C2(C4(C1=CC(=O)C=C4)C)F)O)C)...,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,0,1,1,1,0
1,CC(C1=CC(=C(C=C1)C2=CC=CC=C2)F)C(=O)O,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
2,CC(C1=CC(=C(C=C1)C2=CC=CC=C2)F)C(=O)N,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0


In [8]:
X = df3.iloc[:, 1:168].astype(int)
y = df['logPerm'] 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 10)

In [10]:
from xgboost import XGBRegressor

In [11]:
model = XGBRegressor(random_state=10)
model.fit(X_train, y_train)

In [12]:
y_pred = model.predict(X_test)

In [13]:
r2 = r2_score(y_test, y_pred)
r2

0.7401609350384324

In [14]:
mse = mean_squared_error(y_test, y_pred)
mse

0.7236072669855155

## Reduction dimensionality

### Feature importance method

In [15]:
model.fit(X, y)

In [16]:
importance = model.feature_importances_

In [17]:
features = df3.iloc[:, 1:168].columns.values.tolist()

In [18]:
# Create a DataFrame for visualization
importance_df = pd.DataFrame({'Feature': features, 'Importance': importance})
importance_df = importance_df.sort_values('Importance', ascending=False)

In [19]:
filtered_importance_df = importance_df[importance_df['Importance'] >= 0.005]

In [20]:
filtered_importance_df.index

Index([ 28, 123, 158, 142,  38,  69,  80,  97, 122, 149, 163, 156, 131,  41,
       103, 148, 132, 117,  84, 139, 118, 128, 101, 133, 140,  34,  54,  79,
        86,  46, 121,  31,  98, 115],
      dtype='int64')

In [21]:
X_2 = df3.iloc[:, [28, 123, 158, 142,  38,  69,  80,  97, 122, 149, 163, 156, 131,  41,
       103, 148, 132, 117,  84, 139, 118, 128, 101, 133, 140,  34,  54,  79,
        86,  46, 121,  31,  98, 115]].astype(float)

In [22]:
y = df['logPerm']

In [23]:
X_train2, X_test2, y_train, y_test = train_test_split(X_2, y, test_size=0.2, random_state = 10)

In [24]:
model2 = XGBRegressor(random_state=10)
model2.fit(X_train2,y_train)

In [25]:
y_pred2 = model2.predict(X_test2)

In [26]:
r2_FI = r2_score(y_test, y_pred2)
r2_FI

0.6703044849351889

In [27]:
mse_FI = mean_squared_error(y_test, y_pred2)
mse_FI

0.9181455091393446

Reduction dimensionality leads to decreasing of model efficiency

## PCA

In [28]:
from sklearn.decomposition import PCA

In [29]:
#Perform PCA with specified variance of 95%
pca = PCA(n_components = 0.95)
X_reduced = pca.fit_transform(X)

#Print results
print('Original Dimensions: ',X.shape)
print('Reduced Dimensions: ',X_reduced.shape)
print("Explained variance: ", pca.explained_variance_ratio_.sum())

Original Dimensions:  (120, 167)
Reduced Dimensions:  (120, 35)
Explained variance:  0.9512454628486214


In [30]:
X_reduced

array([[-2.85429965, -2.1130067 , -2.91169705, ..., -0.4605446 ,
        -0.25241682, -0.17390966],
       [-1.35900306,  1.86071285, -1.95403989, ..., -0.17655115,
        -0.09181565,  0.01549242],
       [-0.49290216,  2.73976581, -1.20506284, ...,  0.21379918,
         0.07722089,  0.38271824],
       ...,
       [-1.4403942 , -0.38753448,  1.2407486 , ...,  0.37435559,
         0.2612092 ,  0.42218984],
       [ 1.30300545,  0.58008577,  0.59047973, ...,  0.40416616,
        -0.45597279,  0.10387437],
       [-0.0400131 , -0.01646398,  0.97477909, ..., -0.06140715,
         0.23310443,  0.05174859]])

In [31]:
X_pca_train, X_pca_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=10)

In [32]:
model3 = XGBRegressor(random_state=10)
model3.fit(X_pca_train,y_train)

In [33]:
y_pred3 = model3.predict(X_pca_test)

In [34]:
r2_pca = r2_score(y_test, y_pred3)
r2_pca

0.6998481951348655

In [35]:
mse_pca = mean_squared_error(y_test, y_pred3)
mse_pca

0.8358713391743239

PCA method of reduction dimensionality does not lead to increasing of model efficiency

### Unsupervised method
Drop features with high correlation

In [36]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_std = stdsc.fit_transform(X)
X_std = pd.DataFrame(X_std, columns=X.columns)

In [37]:
cov_mat =np.cov(X_std.T)

In [39]:
# Filter out highly correlated features

FILTER_THRESHOLD = 0.9

cols = X.columns
cov_mat_df = pd.DataFrame(cov_mat, columns=cols)

upper_tri = cov_mat_df.where(
    np.triu(
        np.ones(cov_mat_df.shape), k=1).astype(bool)
        )

to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > FILTER_THRESHOLD)]

df_after_FS = X.drop(to_drop, axis=1)

In [40]:
df_after_FS.columns.to_list()

['bit0',
 'bit1',
 'bit2',
 'bit3',
 'bit4',
 'bit5',
 'bit6',
 'bit7',
 'bit8',
 'bit9',
 'bit10',
 'bit12',
 'bit13',
 'bit14',
 'bit15',
 'bit16',
 'bit17',
 'bit18',
 'bit19',
 'bit20',
 'bit21',
 'bit22',
 'bit23',
 'bit24',
 'bit25',
 'bit26',
 'bit27',
 'bit28',
 'bit29',
 'bit30',
 'bit31',
 'bit32',
 'bit34',
 'bit35',
 'bit36',
 'bit37',
 'bit38',
 'bit39',
 'bit40',
 'bit41',
 'bit42',
 'bit43',
 'bit44',
 'bit45',
 'bit46',
 'bit48',
 'bit52',
 'bit53',
 'bit54',
 'bit57',
 'bit62',
 'bit65',
 'bit66',
 'bit68',
 'bit72',
 'bit74',
 'bit75',
 'bit77',
 'bit78',
 'bit79',
 'bit82',
 'bit83',
 'bit84',
 'bit85',
 'bit86',
 'bit87',
 'bit89',
 'bit90',
 'bit91',
 'bit92',
 'bit93',
 'bit95',
 'bit96',
 'bit97',
 'bit98',
 'bit99',
 'bit100',
 'bit101',
 'bit103',
 'bit104',
 'bit108',
 'bit109',
 'bit111',
 'bit112',
 'bit113',
 'bit114',
 'bit115',
 'bit116',
 'bit117',
 'bit118',
 'bit120',
 'bit121',
 'bit122',
 'bit123',
 'bit125',
 'bit126',
 'bit127',
 'bit128',
 'bit129

In [45]:
X_4 = df_after_FS
y = df['logPerm']

In [46]:
X_train4, X_test4, y_train, y_test = train_test_split(X_4, y, test_size=0.2, random_state=10)

In [47]:
model4 = XGBRegressor(random_state=10)
model4.fit(X_train4,y_train)

In [48]:
y_pred4 = model4.predict(X_test4)

In [51]:
r2_4 = r2_score(y_test, y_pred4)
r2_4

0.7700143322020816

Method based on Pearson correlation allows to increase the R2 value ​​from 0.74 to 0.77

# Melanin binding

In [53]:
melanin = pd.read_csv('melanin.csv')
melanin.head(3)

Unnamed: 0,SMILES,Class
0,CCN(CC)CCNC(=O)c1ccc(cc1)N.Cl,1
1,COCCNC(=O)CN1C2CCC1CC(C2)(c3cccnc3)O,1
2,CC1=NN=C(c2cc3c(cc2C1)OCO3)c4ccc(cc4)N,1


### RDKit descriptors

In [54]:
from rdkit.Chem import rdMolDescriptors

In [55]:
descriptor_names = list(rdMolDescriptors.Properties.GetAvailableProperties())
get_descriptors = rdMolDescriptors.Properties(descriptor_names)

In [56]:
def smi_to_descriptors(smile):
        mol = Chem.MolFromSmiles(smile)
        if mol:
            descriptors = np.array(get_descriptors.ComputeProperties(mol))
            return descriptors
        else:
            return np.zeros(len(descriptor_names))

In [57]:
melanin['DS'] = melanin['SMILES'].apply(smi_to_descriptors)

In [58]:
melanin.head(3)

Unnamed: 0,SMILES,Class,DS
0,CCN(CC)CCNC(=O)c1ccc(cc1)N.Cl,1,"[271.145140004, 271.792, 4.0, 3.0, 6.0, 2.0, 3..."
1,COCCNC(=O)CN1C2CCC1CC(C2)(c3cccnc3)O,1,"[319.18959166, 319.405, 6.0, 2.0, 6.0, 2.0, 5...."
2,CC1=NN=C(c2cc3c(cc2C1)OCO3)c4ccc(cc4)N,1,"[293.11642672000005, 293.32599999999996, 5.0, ..."


In [69]:
X_mel = np.stack(melanin.DS.apply(lambda x: np.array(x)).to_numpy()).astype(np.float64)
y_mel = melanin['Class']

In [60]:
X_mel_train, X_mel_test, y_mel_train, y_mel_test = train_test_split(X_mel, y_mel, test_size=0.2, random_state=10)

In [61]:
from sklearn.ensemble import GradientBoostingClassifier

In [62]:
model = GradientBoostingClassifier(random_state=10).fit(X_mel_train, y_mel_train)

In [70]:
y_pred_mel = model.predict(X_mel_test)

In [71]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [72]:
f1 = f1_score(y_mel_test, y_pred_mel)
f1

0.875968992248062

In [73]:
acc = accuracy_score(y_mel_test, y_pred_mel)
acc

0.7948717948717948

## Feature importance

In [75]:
importance = model.feature_importances_

In [77]:
features = pd.DataFrame(X_mel).columns.values.tolist()

In [78]:
# Create a DataFrame for visualization
importance_df = pd.DataFrame({'Feature': features, 'Importance': importance})
importance_df = importance_df.sort_values('Importance', ascending=False)

In [100]:
filtered_importance_df = importance_df[importance_df['Importance'] >= 0.01]

In [101]:
filtered_importance_df.index

Index([12, 27, 26, 11, 37, 13, 38, 42, 40, 25, 16, 17, 32, 29,  3, 24,  6, 41,
       18, 30, 35,  0,  4, 39,  1, 34,  5, 31, 28],
      dtype='int64')

In [102]:
X_mel_FI = pd.DataFrame(X_mel).iloc[:, [12, 27, 26, 11, 37, 13, 38, 42, 40, 25, 16, 17, 32, 29,  3, 24,  6, 41,
       18, 30, 35,  0,  4, 39,  1, 34,  5, 31, 28]].astype(float)

In [103]:
X_mel_FI_train, X_mel_FI_test, y_mel_train, y_mel_test = train_test_split(X_mel_FI, y_mel, test_size=0.2, random_state=10)

In [104]:
X_mel_FI_train.shape

(624, 29)

In [105]:
model_FI = GradientBoostingClassifier(random_state=10)
model_FI.fit(X_mel_FI_train, y_mel_train)

In [106]:
y_pred_mel = model_FI.predict(X_mel_FI_test)

In [107]:
f1_FI = f1_score(y_mel_test, y_pred_mel)
f1_FI

0.8725868725868726

Feature importance method did not lead to a significant improvement in the efficiency of the model

## Statistical method for melanin binding

In [108]:
stdsc = StandardScaler()
X_std_mel = stdsc.fit_transform(pd.DataFrame(X_mel))
X_std_mel = pd.DataFrame(X_std_mel, columns=pd.DataFrame(X_mel).columns)

In [109]:
cov_mat =np.cov(X_std_mel.T)

In [111]:
# Filter out highly correlated features

FILTER_THRESHOLD = 0.9

cols = pd.DataFrame(X_mel).columns
cov_mat_df = pd.DataFrame(cov_mat, columns=cols)

upper_tri = cov_mat_df.where(
    np.triu(
        np.ones(cov_mat_df.shape), k=1).astype(bool)
        )

to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > FILTER_THRESHOLD)]

df_after_FS = pd.DataFrame(X_mel).drop(to_drop, axis=1)

In [112]:
df_after_FS

Unnamed: 0,0,2,3,4,6,8,9,10,11,12,...,17,18,19,20,21,22,23,26,30,38
0,271.145140,4.0,3.0,6.0,3.0,40.0,5.0,1.0,0.461538,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.76220,2.780888,-1.26
1,319.189592,6.0,2.0,6.0,5.0,48.0,6.0,1.0,0.647059,3.0,...,1.0,2.0,2.0,0.0,2.0,2.0,2.0,0.65860,5.071192,-1.50
2,293.116427,5.0,2.0,1.0,5.0,37.0,5.0,0.0,0.176471,4.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.76690,3.804652,-2.82
3,257.177964,2.0,1.0,2.0,2.0,42.0,2.0,0.0,0.529412,3.0,...,0.0,1.0,1.0,0.0,2.0,3.0,3.0,3.10240,5.470921,-1.28
4,282.089209,4.0,0.0,3.0,4.0,35.0,4.0,0.0,0.117647,3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.47720,3.382798,-2.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,288.112344,6.0,3.0,2.0,5.0,34.0,6.0,0.0,0.000000,4.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,2.66410,3.311088,-3.28
776,507.032555,9.0,2.0,7.0,7.0,51.0,12.0,1.0,0.190476,4.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.43590,7.874713,-3.15
777,339.169525,7.0,1.0,7.0,7.0,46.0,7.0,0.0,0.277778,3.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.61100,3.791787,-2.93
778,289.136117,5.0,1.0,3.0,6.0,39.0,6.0,0.0,0.500000,3.0,...,2.0,1.0,1.0,0.0,0.0,0.0,0.0,2.79432,4.506615,-1.53


In [113]:
X_mel_SM = df_after_FS

In [114]:
X_mel_SM_train, X_mel_SM_test, y_mel_train, y_mel_test = train_test_split(X_mel_SM, y_mel, test_size=0.2, random_state=10)

In [115]:
model_SM = GradientBoostingClassifier(random_state=10)
model_SM.fit(X_mel_SM_train, y_mel_train)

In [116]:
y_pred_mel = model_SM.predict(X_mel_SM_test)

In [117]:
f1_SM = f1_score(y_mel_test, y_pred_mel)
f1_SM

0.8715953307392996

Statistical method did not lead to a significant improvement in the efficiency of the model

## PCA

In [118]:
#Perform PCA with specified variance of 95%
pca = PCA(n_components = 0.95)
X_reduced = pca.fit_transform(X_mel)

#Print results
print('Original Dimensions: ',X_mel.shape)
print('Reduced Dimensions: ',X_reduced.shape)
print("Explained variance: ", pca.explained_variance_ratio_.sum())

Original Dimensions:  (780, 43)
Reduced Dimensions:  (780, 2)
Explained variance:  0.9913493061293999


In [119]:
X_pca_train, X_pca_test, y_train, y_test = train_test_split(X_reduced, y_mel, test_size=0.2, random_state=10)

In [120]:
model_pca = GradientBoostingClassifier(random_state=10)
model_pca.fit(X_pca_train, y_train)

In [121]:
y_pred = model_pca.predict(X_pca_test)

In [122]:
f1_pca = f1_score(y_test, y_pred)
f1

0.875968992248062

PCA method did not lead to a significant improvement in the efficiency of the model