In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
import rdkit.Chem.rdMolDescriptors as d
import rdkit.Chem.Fragments as f
import rdkit.Chem.Lipinski as l
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
m = Chem.MolFromSmiles('Cc1ccccc1')
m.GetNumAtoms()
# rdMolDescriptors
d.CalcExactMolWt(m)
# Fragments
f.fr_Al_COO(m)
# Lipinski 
l.HeavyAtomCount(m)
# Fingerprints
fp = AllChem.GetMorganFingerprintAsBitVect(m,2,nBits=124)
np.array(fp)

In [None]:
data = pd.read_csv('training_smiles.csv')
data.head()

In [None]:
df = data.copy()
df['mol'] = df['SMILES'].apply(Chem.MolFromSmiles)
# df['molwt'] = df['mol'].apply(d.CalcExactMolWt)
# df['fr_Al_COO'] = df['mol'].apply(f.fr_Al_COO)
# df['HeavyAtomCount'] = df['mol'].apply(l.HeavyAtomCount)
# df['fp'] = df['mol'].apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=124))

### FEATURES WITHIN EACH ATOM
https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2523-5/tables/1

In [None]:
# df['most_common_atom'] = df['mol'].apply(lambda x: x.Chem.GetAtomWithIdx(x.GetMostCommonAtomIdx()).GetSymbol())
df['number_atoms'] = df['mol'].apply(Chem.GetNumAtoms)
df['number_hydrogen'] = df['mol'].GetNumHeavyAtoms()
df['unsaturation'] = df['mol'].GetNumHeteroatoms()
df['formal_charge'] = df['mol'].GetFormalCharge()
df['total_valence'] = df['mol'].GetTotalValence()
df['ring'] = df['mol'].IsInRing()
df['aromatic_atoms'] = df['mol'].GetAromaticAtoms()
df['chirality_atoms'] = df['mol'].GetChiralAtoms()
df['hybridization_atoms'] = df['mol'].GetHybridizationAtoms()

df.to_csv('training_processed_2.csv', index=False)

### FEATURES WITHIN EACH SMILE

https://aip.scitation.org/doi/pdf/10.1063/1.5062773 --> 11 features extracted out of the number of elements of a certain type in the bond

In [None]:
df['smiles_length'] = df['SMILES'].apply(len)
df['number_atoms'] = df['mol'].apply(lambda x: x.GetNumAtoms()) # number of atoms in the molecule
df['number_bonds'] = df['mol'].apply(lambda x: x.GetNumBonds())
df['number_carbon_atoms'] = df['mol'].apply(lambda x: len([atom for atom in x.GetAtoms() if atom.GetAtomicNum() == 6]))
df['number_nitrogen_atoms'] = df['mol'].apply(lambda x: len([atom for atom in x.GetAtoms() if atom.GetAtomicNum() == 7]))
df['number_potasium_atoms'] = df['mol'].apply(lambda x: len([atom for atom in x.GetAtoms() if atom.GetAtomicNum() == 19]))
df['number_sulfur_atoms'] = df['mol'].apply(lambda x: len([atom for atom in x.GetAtoms() if atom.GetAtomicNum() == 16]))
df['number_clorine_atoms'] = df['mol'].apply(lambda x: len([atom for atom in x.GetAtoms() if atom.GetAtomicNum() == 17]))
df['number_bromine_atoms'] = df['mol'].apply(lambda x: len([atom for atom in x.GetAtoms() if atom.GetAtomicNum() == 35]))
df['number_iodine_atoms'] = df['mol'].apply(lambda x: len([atom for atom in x.GetAtoms() if atom.GetAtomicNum() == 53]))
df['number_oxygen_atoms'] = df['mol'].apply(lambda x: len([atom for atom in x.GetAtoms() if atom.GetAtomicNum() == 8]))

# --- not included in paper
df['number_hydrogen_atoms'] = df['mol'].apply(lambda x: len([atom for atom in x.GetAtoms() if atom.GetAtomicNum() == 1]))
df['number_fluorine_atoms'] = df['mol'].apply(lambda x: len([atom for atom in x.GetAtoms() if atom.GetAtomicNum() == 9]))


df.to_csv('training_preprocessed.csv', index=False)

###  DIVIDE TRAINING AND VALIDATION

In [7]:
from sklearn.model_selection import train_test_split

# get the features and labels
data = pd.read_csv('training_preprocessed.csv')
features = data.drop(['SMILES', 'mol', 'ACTIVE', 'INDEX'], axis=1)
labels = data['ACTIVE']

features = features.loc[:, (features != 0).any(axis=0)] # drop columns with all zeros
features = features.dropna(axis=1, how='all') # drop columns with all Nan

data_train, data_valid, labels_train, labels_valid = train_test_split(features, labels, test_size=0.25, random_state=20) # change random_state to get different results


### APPLY MINMAX SCALER; STANDART SCALER AND PCA

After dividing the training data into training and validation, we applied a minmax scaler, followed by a standart scaler and pca into the training badge. After we used those scalers and pca to apply the same transformation used for training into the validation set. 
The scalers and pca were fitted only with the training data, and then applied also to validation.

In [8]:
# Add functions from previous assignments
import functions as f
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import preprocessing


mm = preprocessing.MinMaxScaler()
ss = StandardScaler()
pca = PCA(n_components=0.95) #scikit-learn choose the minimum number of principal components such that 95% of the variance is retained


def pre_process(df, labels, mm, ss, pca, train=True):
    features_processed = df.loc[:, (df != 0).any(axis=0)] # drop columns with all zeros
    features_processed = features_processed.dropna(axis=1, how='all') # drop columns with all Nan
    # save index from features_processed
    index = features_processed.index

    if train==True:
        features_processed = mm.fit_transform(features_processed)
        features_processed = ss.fit_transform(features_processed)
        features_processed = pca.fit_transform(features_processed)
    else :
        features_processed = mm.transform(features_processed)
        features_processed = ss.transform(features_processed)
        features_processed = pca.transform(features_processed)

    features_processed = pd.DataFrame(features_processed, index)
    features_processed.columns = ['feature_' + str(i) for i in range(1, len(features_processed.columns)+1)]

    return features_processed, labels

features_processed_train, labels_train = pre_process(data_train, labels_train, mm, ss, pca, train=True)
features_processed_valid, labels_valid = pre_process(data_valid, labels_valid, mm, ss, pca, train=False) #apply pca to data_valid


Saved the training processed data with its corresponding labels.
Saved the validation processed data (with the processing tools fitted to the training set) and saved with the corresponding labels, which should be used as ground truth.

In [9]:
training_processed = features_processed_train
training_processed['ACTIVE'] = labels_train
training_processed.to_csv('training_processed.csv', index=False)

validation_processed = features_processed_valid
validation_processed['ACTIVE'] = labels_valid
validation_processed.to_csv('validation_processed.csv', index=False)


# TOTAL PROCESSED WITH TRAINING AND VALIDATION PCA applied only on training data, then applied on validation data
validation_true = validation_processed.copy()
validation_true['ACTIVE'] = labels_valid
total_processed = training_processed.append(validation_true)
total_processed.to_csv('total_processed.csv', index=False)


### (only to check) LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score   

data_train = pd.read_csv('training_processed.csv')
data_valid = pd.read_csv('validation_processed.csv')

labels_train = data_train['ACTIVE']# ground truth labels
features_train = data_train.drop(['ACTIVE'], axis=1) 
labels_valid = data_valid['ACTIVE']# ground truth labels
features_valid = data_valid.drop(['ACTIVE'], axis=1)

logisticRegr = LogisticRegression(solver = 'lbfgs')

# Train 
logisticRegr.fit(features_train, labels_train)

# Predict
log_labels = logisticRegr.predict(features_valid)# predicted labels
log_score = logisticRegr.score(features_valid, labels_valid)
log_pred = logisticRegr.predict_proba(features_valid)

log_auc = roc_auc_score(labels_valid, log_pred[:,1])

# print the results
print('Logistic Regression Accuracy: ', log_score)
print('Logistic Regression AUC: ', log_auc)

In [3]:
from sklearn.metrics import roc_auc_score   

data_train = pd.read_csv('training_processed.csv')
data_valid = pd.read_csv('validation_processed.csv')

labels_train = data_train['ACTIVE']# ground truth labels
features_train = data_train.drop(['ACTIVE'], axis=1) 
labels_valid = data_valid['ACTIVE']# ground truth labels
features_valid = data_valid.drop(['ACTIVE'], axis=1)



In [4]:
#logistic regression
from sklearn.linear_model import LogisticRegression


logisticRegr = LogisticRegression(solver = 'lbfgs')

# Train 
logisticRegr.fit(features_train, labels_train)

# Predict
log_labels = logisticRegr.predict(features_valid)# predicted labels
log_score = logisticRegr.score(features_valid, labels_valid)
log_pred = logisticRegr.predict_proba(features_valid)

log_auc = roc_auc_score(labels_valid, log_pred[:,1])

# print the results
print('Logistic Regression Accuracy: ', log_score)
print('Logistic Regression AUC: ', log_auc)

Logistic Regression Accuracy:  0.9886855241264559
Logistic Regression AUC:  0.7578277337651416


In [None]:
#svm
from sklearn import svm

svm_clf = svm.SVC(probability=True)

# Train
svm_clf.fit(features_train, labels_train)

# Predict
svm_labels = svm_clf.predict(features_valid)# predicted labels
svm_score = svm_clf.score(features_valid, labels_valid)
svm_pred = svm_clf.predict_proba(features_valid)

svm_auc = roc_auc_score(labels_valid, svm_pred[:,1])

# print the results
print('SVM Accuracy: ', svm_score)
print('SVM AUC: ', svm_auc)

svm_clf = svm.SVC(probability=True)
SVM Accuracy:  0.9887879175732753
SVM AUC:  0.5164718458815745

In [5]:
# naive bayes

from sklearn.naive_bayes import GaussianNB

naive_bayes = GaussianNB()

# Train 
naive_bayes.fit(features_train, labels_train)

# Predict
naive_bayes_labels = naive_bayes.predict(features_valid)# predicted labels
naive_bayes_score = naive_bayes.score(features_valid, labels_valid)
naive_bayes_pred = naive_bayes.predict_proba(features_valid)


naive_bayes_auc = roc_auc_score(labels_valid, naive_bayes_pred[:,1])

# print the results

print('Naive Bayes Accuracy: ', naive_bayes_score)
print('Naive Bayes AUC: ', naive_bayes_auc)

Naive Bayes Accuracy:  0.9685396134647383
Naive Bayes AUC:  0.737215835375757


naive_bayes = GaussianNB()
Naive Bayes Accuracy:  0.9685396134647383
Naive Bayes AUC:  0.737215835375757

In [None]:
#knn
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(features_train, labels_train)
knn_labels = knn.predict(features_valid)# predicted labels
knn_score = knn.score(features_valid, labels_valid)
knn_pred = knn.predict_proba(features_valid)

knn_auc = roc_auc_score(labels_valid, knn_pred[:,1])

# print the results
print('KNN Accuracy: ', knn_score)
print('KNN AUC: ', knn_auc)




neighbors = 3
KNN Accuracy:  0.9879175732753104
KNN AUC:  0.5483442627078582

neighbors = 5
KNN Accuracy:  0.9886343274030462
KNN AUC:  0.5604776061602166

neighbors = 7
KNN Accuracy:  0.9887623192115704
KNN AUC:  0.5723251994576865

In [None]:
#ann
from sklearn.neural_network import MLPClassifier

ann = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=100, alpha=0.0001)
ann.fit(features_train, labels_train)
ann_labels = ann.predict(features_valid)# predicted labels
ann_score = ann.score(features_valid, labels_valid)
ann_pred = ann.predict_proba(features_valid)

ann_auc = roc_auc_score(labels_valid, ann_pred[:,1])

# print the results
print('ANN Accuracy: ', ann_score)
print('ANN AUC: ', ann_auc)



hidden_layer_sizes=(100, 100, 100), max_iter=100, alpha=0.0001)
ANN Accuracy:  0.9881223601689492
ANN AUC:  0.7085456584949629



In [None]:
#decision tree
from sklearn.tree import DecisionTreeClassifier

dec_tree = DecisionTreeClassifier()
dec_tree.fit(features_train, labels_train)
dec_tree_labels = dec_tree.predict(features_valid)# predicted labels
dec_tree_score = dec_tree.score(features_valid, labels_valid)
dec_tree_pred = dec_tree.predict_proba(features_valid)

dec_tree_auc = roc_auc_score(labels_valid, dec_tree_pred[:,1])

# print the results 
print('Decision Tree Accuracy: ', dec_tree_score)
print('Decision Tree AUC: ', dec_tree_auc)



dec_tree = DecisionTreeClassifier()
Decision Tree Accuracy:  0.9817227697427364
Decision Tree AUC:  0.5279052211450267

In [6]:
#xgboost
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(features_train, labels_train)
xgb_labels = xgb.predict(features_valid)# predicted labels
xgb_score = xgb.score(features_valid, labels_valid)
xgb_pred = xgb.predict_proba(features_valid)

xgb_auc = roc_auc_score(labels_valid, xgb_pred[:,1])

# print the results
print('XGBoost Accuracy: ', xgb_score)
print('XGBoost AUC: ', xgb_auc)


XGBoost Accuracy:  0.9886855241264559
XGBoost AUC:  0.7504152523969736


xgb = XGBClassifier()
XGBoost Accuracy:  0.9886855241264559
XGBoost AUC:  0.7504152523969736

In [7]:
#random forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(features_train, labels_train)
rf_labels = rf.predict(features_valid)# predicted labels
rf_score = rf.score(features_valid, labels_valid)
rf_pred = rf.predict_proba(features_valid)

rf_auc = roc_auc_score(labels_valid, rf_pred[:,1])

# print the results
print('Random Forest Accuracy: ', rf_score)
print('Random Forest AUC: ', rf_auc)


Random Forest Accuracy:  0.98712402406246
Random Forest AUC:  0.679924037566644


n_estimators = 100
Random Forest Accuracy:  0.9869960322539357
Random Forest AUC:  0.6720564956043119