In [178]:
# Now using cross validation
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, ShuffleSplit, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# Pre-processing
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier

In [97]:
# Load the descriptor CSV, located relative to the notebook's working directory.
HERE = Path(_dh[-1])  # most recent entry of IPython's directory history
DATA = HERE.parent.joinpath(
    'data',
    'fda_approved',
    'fda_rdkit_pre_processed_descriptors.csv',
)

drugs_and_descriptors = pd.read_csv(DATA)
# Show the column index to see which columns are available.
print(drugs_and_descriptors.columns)

Index(['name', 'chembl_id', 'clean_smiles', 'first_approval_year',
       'indication_class', 'molecule_type', 'withdrawn_flag',
       'therapeutic_flag', 'polymer_flag', 'inorganic_flag',
       ...
       'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone',
       'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiophene',
       'fr_unbrch_alkane', 'fr_urea'],
      dtype='object', length=215)


In [118]:
# Columns excluded from the feature matrix; every other column of the table
# is kept as a model input.
not_descriptors = [
    'name', 'chembl_id', 'clean_smiles',
    'first_approval_year', 'indication_class', 'molecule_type',
    'withdrawn_flag', 'therapeutic_flag', 'polymer_flag',
    'inorganic_flag', 'natural_product_flag',
    'parenteral', 'topical', 'oral',
]

# Feature matrix: the full table minus the excluded columns.
X = drugs_and_descriptors.drop(not_descriptors, axis='columns')
# Target: the 'oral' column (it is also in the excluded list, so it is absent from X).
y = drugs_and_descriptors.loc[:, 'oral']

### Cross-validation

Comparing the models using Logistic Regression, SVM (polynomial, RBF and linear kernels), KNN, Random Forest, SGD and Gradient Boosting:

In [189]:
def compare_models(models, X, y, cv=10):
    """Cross-validate every model and collect ROC AUC, accuracy and balanced accuracy.

    Each model is cross-validated once with all three scorers, so the three
    metrics of a model are computed on the same fitted estimators and the
    model is not refitted separately for every metric.

    Parameters
    ----------
    models : iterable of estimators
        Estimators to evaluate; ``str(model)`` is used as the key prefix.
    X : array-like
        Feature matrix handed to ``cross_validate``.
    y : array-like
        Target values handed to ``cross_validate``.
    cv : int or cross-validation splitter, default=10
        Forwarded as the ``cv`` argument of ``cross_validate``.

    Returns
    -------
    dict
        For every model, three entries keyed ``"<model>_roc"``,
        ``"<model>_acc"`` and ``"<model>_bal_acc"`` (in that order), each
        holding the per-fold test scores of the corresponding metric.
    """
    # Key suffix -> scikit-learn scorer name. cross_validate reports the
    # results of a dict of scorers under "test_<key>".
    scorers = {
        "roc": "roc_auc",
        "acc": "accuracy",
        "bal_acc": "balanced_accuracy",
    }

    scores = {}
    for model in models:
        cv_results = cross_validate(model, X, y, cv=cv, scoring=scorers)
        for suffix in scorers:
            scores[f"{str(model)}_{suffix}"] = cv_results[f"test_{suffix}"]

    return scores

In [190]:
# Candidate classifiers to be compared; this list is passed to compare_models,
# which uses str(model) of each entry as the prefix of its result keys.
# NOTE(review): RandomForestClassifier, SGDClassifier and
# GradientBoostingClassifier are created without random_state, so their scores
# presumably differ between runs. Setting a seed would also change str(model)
# and therefore the result keys — confirm no later cell indexes those keys
# literally before adding one.
models = [
    LogisticRegression(max_iter=500), 
    SVC(kernel='poly'),
    SVC(kernel='rbf'),
    SVC(kernel='linear'),
    KNeighborsClassifier(n_neighbors=5), 
    RandomForestClassifier(n_estimators=100), 
    SGDClassifier(loss="hinge", penalty="l2", max_iter=500), 
    GradientBoostingClassifier()]

In [191]:
# Pre-processing: standardise the descriptor columns, then cross-validate
# every candidate model on the scaled matrix.
# NOTE(review): the scaler is fitted on all of X before cross-validation, so
# the rows of each validation fold contribute to the scaling statistics
# (data leakage). Wrapping each model in a Pipeline with StandardScaler would
# fit the scaler on the training folds only, but it would also change
# str(model) and hence the keys of compared_scores — confirm before changing.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
compared_scores = compare_models(models=models, X=X_scaled, y=y)

# Dump the raw result dict (one array of per-fold scores per key).
print(compared_scores)

{'LogisticRegression(max_iter=500)_roc': array([0.55008239, 0.61940913, 0.66996234, 0.63265066, 0.63053202,
       0.56743853, 0.6440678 , 0.76923077, 0.75997151, 0.77801519]), 'LogisticRegression(max_iter=500)_acc': array([0.50526316, 0.62105263, 0.65263158, 0.65263158, 0.61578947,
       0.58730159, 0.67195767, 0.74603175, 0.71428571, 0.67724868]), 'LogisticRegression(max_iter=500)_bal_acc': array([0.52860169, 0.5622646 , 0.57144539, 0.5768597 , 0.55261299,
       0.54326808, 0.64472428, 0.71741453, 0.70512821, 0.71794872]), "SVC(kernel='poly')_roc": array([0.66448917, 0.61164077, 0.68502825, 0.66784369, 0.63735876,
       0.63248985, 0.71556457, 0.76863723, 0.81635802, 0.6938509 ]), "SVC(kernel='poly')_acc": array([0.65263158, 0.61578947, 0.63157895, 0.63684211, 0.63684211,
       0.64021164, 0.67195767, 0.73544974, 0.77248677, 0.64021164]), "SVC(kernel='poly')_bal_acc": array([0.54166667, 0.51471281, 0.51659605, 0.5289548 , 0.52624765,
       0.53795655, 0.59142994, 0.68482906, 0.7