In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from functions import scale_variables
from time import perf_counter, gmtime, strftime

from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.model_selection import RepeatedKFold, train_test_split, cross_val_score, GridSearchCV, RepeatedStratifiedKFold

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay


In [2]:
data = pd.read_csv("../data/fda_approved/fda_rdkit_pre_processed_descriptors.csv")
data.head()

Unnamed: 0,name,chembl_id,clean_smiles,first_approval_year,indication_class,molecule_type,withdrawn_flag,therapeutic_flag,polymer_flag,inorganic_flag,...,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
0,GUANIDINE HYDROCHLORIDE,CHEMBL1200728,N=C(N)N,1939,,Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ACETOHYDROXAMIC ACID,CHEMBL734,CC(=O)NO,1983,Enzyme Inhibitor (urease),Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,HYDROXYUREA,CHEMBL467,NC(=O)NO,1967,Antineoplastic,Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,CYSTEAMINE,CHEMBL602,NCCS,1994,CYSTEAMINE HYDROCHLORIDE,Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DIMETHYL SULFOXIDE,CHEMBL504,C[S+](C)[O-],1978,Anti-Inflammatory (topical),Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
Y = data["oral"]
Y = Y.astype(float)
Y.head()

0    1.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: oral, dtype: float64

In [4]:
X = data.iloc[:, 14:]
X.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
0,6.055556,-0.333333,6.055556,0.333333,0.242956,59.072,54.032,59.048347,24.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9.451389,-0.439815,9.451389,0.439815,0.301333,75.067,70.027,75.032028,30.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9.229167,-0.939815,9.229167,0.939815,0.256644,76.055,72.023,76.027277,30.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4.951389,0.683642,4.951389,0.683642,0.419021,77.152,70.096,77.02992,26.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.555556,-0.611111,9.555556,0.611111,0.373753,78.136,72.088,78.013936,26.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# separate train an test datasets

np.random.seed(42)
x_train, x_test, ytrain, ytest = train_test_split(X, Y, stratify=Y, test_size=0.25)

In [None]:
def best_params_grid(x, y, model_params, n_splits=5, n_repeats=10):
    '''DataFrame, DataFrame, Dictionary, int, int --> DataFrame

    -----------------------------------------------------------------------------

    This function recieves the X and Y dataframes. It also recieves a dictionary, 
    with the model and parameters to be testes. As the function uses RepeatedStratifiedKFold,
    it can also recieve the n_splits and n_repeats parameters.
    Tha function will execute GridSearchCV on all parameters for the model.
    It returns two datasets, the first one is formatted to contain only the relevant information.
    The second one constains all information about the models, in case it's necessary'''

    t0 = perf_counter()
    np.random.seed(1428)

    cv = RepeatedStratifiedKFold(n_splits = n_splits, n_repeats = n_repeats)
    scores = []
    full_results = {}

    for model_name, mp in model_params.items():
        now = strftime("%H:%M", gmtime())
        print(f"Starting Grid Search for {model_name}: {now}")
        clf =  GridSearchCV(mp['model'], mp['params'], cv=cv,
                                return_train_score=False, scoring = 'roc_auc')
        clf.fit(x, y)
        scores.append({
            'model': model_name,
            'best_score': clf.best_score_,
            'best_params': clf.best_params_
        })
        now = strftime("%H:%M", gmtime())
        print(f"Finished Grid Search for {model_name}: {now}")

        full_results[model_name] = pd.DataFrame(clf.cv_results_)
        
    important_result = pd.DataFrame(scores,columns=['model','best_score','best_params'])
    t1 = perf_counter() - t0
    print(f'Tempo de execução: {(t1/60): 0.1f} minutos')
    return important_result, full_results

In [None]:
model_params = {
    'BART': {
        'model': BART(),
        'params': {
            'num_trees': [100, 200],
            'num_particles': [10, 15],
        }
    }
}

In [None]:
grid_search_result, grid_search_full_results = best_params_grid(x_train, y_train, model_params)

In [None]:
grid_search_result

In [None]:
bart_params = grid_search_result.iloc[0,2]
bart_params

In [None]:
# TRAIN

x_train_scaled = scale_variables(x_train)

# adding constant
x_train_scaled = sm.add_constant(x_train_scaled)
7
x_train_scaled.head()

In [None]:
# TEST

x_test_scaled = scale_variables(x_test)

# adding constant
x_test_scaled = sm.add_constant(x_test_scaled)

x_test_scaled.head()

In [None]:
bart_params = {'num_particles': 10, 'num_trees': 100}

In [None]:
bart = BART(**bart_params)

In [None]:
bart = bart.fit(x_train_scaled, y_train)

In [None]:
x_test_float32 = x_test.astype(np.float32)

In [None]:
bart.predict(x_test_scaled)

# TESTE

In [7]:

# model = MS(X.columns, intercept=False)
# D = model.fit_transform(data)
# feature_names = list(D.columns)
X_1 = np.asarray(x_train)
x_2 = np.asarray(x_test)

In [None]:
(X_train,
X_test,
y_train,
y_test) = skm.train_test_split(X_1,
Y,
test_size=0.3,
random_state=0)

In [14]:
bart_boston = BART(random_state=0, burnin=5, ndraw=15)
bart_boston.fit(X_1, ytrain)

In [13]:
yhat_test = bart_boston.predict(x_test.astype(np.float32))
np.mean((ytest - yhat_test)**2)

IndexError: string index out of range

In [16]:
bart_boston.predict(x_2)

array([0.60731062, 0.42684545, 0.78745153, 0.76121864, 0.72188703,
       0.65032171, 0.54274969, 0.54440211, 0.65268966, 0.79276572,
       0.63289054, 0.72639464, 0.45840439, 0.63982636, 0.47552263,
       0.59683131, 0.8098377 , 0.68739877, 0.8341657 , 0.82123839,
       0.56951761, 0.71197668, 0.71778001, 0.50010644, 0.69742593,
       0.60296447, 0.69878391, 0.60388863, 0.67507918, 0.65184922,
       0.59721382, 0.6636022 , 0.64137948, 0.51397423, 0.66224026,
       0.70991171, 0.6910883 , 0.68231795, 0.52832857, 0.50745753,
       0.58805835, 0.85208705, 0.6286525 , 0.77968699, 0.75790965,
       0.56882797, 0.62400727, 0.56421737, 0.63536511, 0.47690345,
       0.56768179, 0.73284772, 0.68431504, 0.57111368, 0.47676696,
       0.7634618 , 0.70744213, 0.66694478, 0.49157343, 0.71666577,
       0.45055064, 0.75412016, 0.72684049, 0.66284235, 0.34010358,
       0.68404295, 0.66788786, 0.60798995, 0.59626335, 0.57311024,
       0.33499644, 0.71198159, 0.43921887, 0.77188714, 0.78468