In [None]:
#obligatorios
import pandas as pd
import numpy as np


#modelos
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

#crear modelos custom
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances
from sklearn.utils.estimator_checks import check_estimator


from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

In [None]:
#wrappers 
class LogisticRegression_Threshold(ClassifierMixin, BaseEstimator):
    """Logistic regression with a tunable decision threshold.

    Wraps ``sklearn.linear_model.LogisticRegression``: every constructor
    argument except ``threshold`` is forwarded unchanged to the wrapped model.
    ``predict`` labels a sample with the second class in ``classes_`` when the
    probability in column 1 of ``predict_proba`` is strictly greater than
    ``threshold``, and with the first class otherwise. The thresholding only
    looks at column 1, so the wrapper is meant for binary targets.

    The wrapped model is trained once, inside ``fit`` (earlier revisions
    re-trained it on every ``predict`` / ``predict_proba`` call).

    ``get_params`` / ``set_params`` are inherited from ``BaseEstimator``, which
    derives the parameter names from the ``__init__`` signature and rejects
    unknown names instead of silently setting a stray attribute.

    Attributes set by ``fit``
    -------------------------
    classes_ : ndarray
        Labels seen during ``fit`` (as returned by ``unique_labels``).
    X_, y_ : ndarray
        Validated training data, kept for backward compatibility.
    estimator_ : LogisticRegression or None
        The fitted wrapped model; ``None`` when only one class was seen.
    """

    def __init__(self,
                 threshold=0.5,
                 C=1.0,
                 class_weight=None,
                 dual=False,
                 fit_intercept=True,
                 intercept_scaling=1,
                 l1_ratio=None,
                 max_iter=100,
                 multi_class='auto',
                 n_jobs=None,
                 penalty='l2',
                 random_state=None,
                 solver='lbfgs',
                 tol=0.0001,
                 verbose=0,
                 warm_start=False):
        # Store every argument verbatim under its own name: the inherited
        # get_params()/set_params() (and therefore clone/GridSearchCV) rely
        # on attribute names matching the __init__ parameter names.
        self.threshold = threshold
        self.C = C
        self.class_weight = class_weight
        self.dual = dual
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.l1_ratio = l1_ratio
        self.max_iter = max_iter
        self.multi_class = multi_class
        self.n_jobs = n_jobs
        self.penalty = penalty
        self.random_state = random_state
        self.solver = solver
        self.tol = tol
        self.verbose = verbose
        self.warm_start = warm_start

    def _make_estimator(self):
        """Build an unfitted LogisticRegression from the current parameters."""
        params = self.get_params(deep=False)
        # `threshold` belongs to the wrapper only; LogisticRegression
        # does not accept it.
        params.pop('threshold')
        return LogisticRegression(**params)

    def fit(self, X, y):
        """Validate the data and train the wrapped logistic regression.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y

        if len(self.classes_) > 1:
            self.estimator_ = self._make_estimator().fit(X, y)
        else:
            # Degenerate training set with a single label: there is nothing
            # to train, predict() simply returns that label.
            self.estimator_ = None
        return self

    def predict(self, X):
        """Predict class labels using ``self.threshold`` as the cut-off.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Members of ``classes_``.
        """
        check_is_fitted(self)
        X = check_array(X)

        if len(self.classes_) == 1:
            return np.full(X.shape[0], self.classes_[0])

        above_threshold = self.estimator_.predict_proba(X)[:, 1] > self.threshold
        # Map the boolean decision back to the original labels so that the
        # predictions always live in the same label space as `y`.
        return self.classes_[above_threshold.astype(int)]

    def predict_proba(self, X):
        """Return the class probabilities of the wrapped model.

        The threshold plays no role here.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Columns follow the order of ``classes_``.
        """
        check_is_fitted(self)
        X = check_array(X)

        if len(self.classes_) == 1:
            # The only class seen during fit has probability 1.
            return np.ones((X.shape[0], 1))

        return self.estimator_.predict_proba(X)
class XGB_Threshold(ClassifierMixin, BaseEstimator):
    """XGBoost classifier with a tunable decision threshold.

    Wraps ``xgb.XGBClassifier``: every constructor argument except
    ``threshold`` is forwarded unchanged to the wrapped model. ``predict``
    labels a sample with the second class in ``classes_`` when the probability
    in column 1 of ``predict_proba`` is strictly greater than ``threshold``,
    and with the first class otherwise. The thresholding only looks at
    column 1, so the wrapper is meant for binary targets.

    The wrapped model is trained once, inside ``fit`` (earlier revisions
    re-trained it on every ``predict`` / ``predict_proba`` call).

    ``get_params`` / ``set_params`` are inherited from ``BaseEstimator``, which
    derives the parameter names from the ``__init__`` signature and rejects
    unknown names instead of silently setting a stray attribute.

    Attributes set by ``fit``
    -------------------------
    classes_ : ndarray
        Labels seen during ``fit`` (as returned by ``unique_labels``).
    X_, y_ : ndarray
        Validated training data, kept for backward compatibility.
    estimator_ : xgb.XGBClassifier or None
        The fitted wrapped model; ``None`` when only one class was seen.
    """

    def __init__(self,
                 threshold=0.5,
                 base_score=0.5,
                 booster='gbtree',
                 colsample_bylevel=1,
                 colsample_bynode=1,
                 colsample_bytree=1,
                 gamma=0,
                 learning_rate=0.1,
                 max_delta_step=0,
                 max_depth=3,
                 min_child_weight=1,
                 missing=None,
                 n_estimators=100,
                 n_jobs=1,
                 nthread=None,
                 objective='binary:logistic',
                 random_state=0,
                 reg_alpha=0,
                 reg_lambda=1,
                 scale_pos_weight=1,
                 seed=None,
                 silent=None,
                 subsample=1,
                 verbosity=1):
        # Store every argument verbatim under its own name: the inherited
        # get_params()/set_params() (and therefore clone/GridSearchCV) rely
        # on attribute names matching the __init__ parameter names.
        self.threshold = threshold
        self.base_score = base_score
        self.booster = booster
        self.colsample_bylevel = colsample_bylevel
        self.colsample_bynode = colsample_bynode
        self.colsample_bytree = colsample_bytree
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.max_delta_step = max_delta_step
        self.max_depth = max_depth
        self.min_child_weight = min_child_weight
        self.missing = missing
        self.n_estimators = n_estimators
        self.n_jobs = n_jobs
        self.nthread = nthread
        self.objective = objective
        self.random_state = random_state
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.seed = seed
        self.silent = silent
        self.subsample = subsample
        self.verbosity = verbosity

    def _make_estimator(self):
        """Build an unfitted XGBClassifier from the current parameters."""
        params = self.get_params(deep=False)
        # `threshold` belongs to the wrapper only; it is not forwarded
        # to XGBClassifier.
        params.pop('threshold')
        return xgb.XGBClassifier(**params)

    def fit(self, X, y):
        """Validate the data and train the wrapped XGBoost classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y

        if len(self.classes_) > 1:
            booster_model = self._make_estimator()
            booster_model.fit(X, y)
            self.estimator_ = booster_model
        else:
            # Degenerate training set with a single label: there is nothing
            # to train, predict() simply returns that label.
            self.estimator_ = None
        return self

    def predict(self, X):
        """Predict class labels using ``self.threshold`` as the cut-off.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Members of ``classes_``.
        """
        check_is_fitted(self)
        X = check_array(X)

        if len(self.classes_) == 1:
            return np.full(X.shape[0], self.classes_[0])

        above_threshold = self.estimator_.predict_proba(X)[:, 1] > self.threshold
        # Map the boolean decision back to the original labels so that the
        # predictions always live in the same label space as `y`.
        return self.classes_[above_threshold.astype(int)]

    def predict_proba(self, X):
        """Return the class probabilities of the wrapped model.

        The threshold plays no role here.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Columns follow the order of ``classes_``.
        """
        check_is_fitted(self)
        X = check_array(X)

        if len(self.classes_) == 1:
            # The only class seen during fit has probability 1.
            return np.ones((X.shape[0], 1))

        return self.estimator_.predict_proba(X)

In [None]:
# Columns of GridSearchCV.cv_results_ that are copied into df_resultados.
# NOTE(review): the metric names (accuracy, sensitivity, specificity, j_score)
# are presumably the keys of `scoring`, which is defined in another cell.
columnas_cv = ['params',
               'mean_test_accuracy', 'std_test_accuracy',
               'mean_test_sensitivity', 'std_test_sensitivity',
               'mean_test_specificity', 'std_test_specificity',
               'mean_test_j_score', 'std_test_j_score']

# XGBoost + PCA
pca = PCA()
modelo3 = XGB_Threshold()
pipe = Pipeline(steps=[('pca', pca), ('xgb', modelo3)])
# Parameters of pipelines can be set using '__' separated parameter names:
param_grid = {
    # 5, 10, ... up to (but excluding) num_cols, plus num_cols itself
    'pca__n_components': np.append(np.arange(5, num_cols, 5), num_cols),
    'xgb__max_depth': [3, 8],
    'xgb__n_estimators': [100, 500, 1000],
    'xgb__learning_rate': [0.01, 0.05, 0.1],
    # rounded so the float steps of arange show up as clean values in `params`
    'xgb__threshold': np.round(np.arange(0.4, 0.8, 0.05), 3)
}
clf3 = GridSearchCV(pipe, param_grid, cv=10, scoring=scoring, refit='specificity')
clf3.fit(x, y.APROBO)
# .assign builds a new frame, so no column is written into a slice of cv_results_.
tk = pd.DataFrame(clf3.cv_results_)[columnas_cv].assign(model='XGB+PCA')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the original row index, exactly like append did.
df_resultados = pd.concat([df_resultados, tk])

# LR Threshold
parameters = {'threshold': np.round(np.arange(0.4, 0.8, 0.05), 3),
              'C': np.logspace(-4, 4, 4)}
logit = LogisticRegression(max_iter=10000)
# NOTE(review): aplicar_rfe is defined elsewhere; only the columns of its
# result are used here, to select the features passed to the grid search.
xI = aplicar_rfe(logit, x, y.APROBO)
modelo4 = LogisticRegression_Threshold(max_iter=10000)
clf4 = GridSearchCV(modelo4, parameters, cv=10, scoring=scoring, refit='specificity')
clf4.fit(x[xI.columns], y.APROBO)
tk = pd.DataFrame(clf4.cv_results_)[columnas_cv].assign(model='LRT')
df_resultados = pd.concat([df_resultados, tk])