In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier


  from numpy.core.umath_tests import inner1d


In [3]:
diabetes = pd.read_csv('https://goo.gl/CTq1hK', header=None)


In [4]:
diabetes.columns


Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int64')

In [5]:
diabetes.columns = ["num_preg", "glucose_conc", "diastolic_bp", "thickness", "insulin", "bmi", "diab_pred","age", "diabetes"]

In [6]:
X = diabetes.drop("diabetes", 1)


In [7]:
y = diabetes["diabetes"]


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.30, random_state=1)


In [9]:
DTC = DecisionTreeClassifier(random_state=1)


In [13]:
print("Estimación del set de Validacion usando DTC")
print(cross_val_score(DTC, X_train, y_train, cv=5, scoring='roc_auc').mean())


Estimación del set de Validacion usando DTC
0.6891932534037798


In [14]:
DTC.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [15]:
y_test_pred = DTC.predict_proba(X_test)


In [17]:
print("Metrica AUC del set Test usando DTC")
print(roc_auc_score(y_test, y_test_pred[:, 1]))

Metrica AUC del set Test usando DTC
0.7062962962962963


###  Bagging


In [18]:
BAGG = BaggingClassifier(base_estimator=DTC, random_state=1) # Preparado para arboles de decisión

In [19]:
print("Estimación del set de Validacion usando BAGG")
print(cross_val_score(BAGG, X_train, y_train, cv=5, scoring='roc_auc').mean())


Estimación del set de Validacion usando BAGG
0.7957884576305629


In [20]:
BAGG.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=1, verbose=0, warm_start=False)

In [21]:
y_test_pred = BAGG.predict_proba(X_test)

In [22]:
print("Metrica AUC del set de Test usando BAGG") ## Random forest= mezcla bagging y boosting
print(roc_auc_score(y_test, y_test_pred[:, 1]))

Metrica AUC del set de Test usando BAGG
0.7853497942386831


### Boosting

In [23]:
from sklearn.ensemble import AdaBoostClassifier

In [24]:
BOOS = AdaBoostClassifier(random_state=1)

In [25]:
print("Estimación del set de Validacion usando BAGG")
print(cross_val_score(BOOS, X_train, y_train, cv=5, scoring='roc_auc').mean())


Estimación del set de Validacion usando BAGG
0.80578540946962


In [26]:
BOOS.fit(X_train, y_train)


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=1)

In [36]:
y_test_pred = BOOS.predict_proba(X_test)
y_test_pred

array([[0.51583892, 0.48416108],
       [0.30908633, 0.69091367],
       [0.51030145, 0.48969855],
       [0.53534856, 0.46465144],
       [0.51457404, 0.48542596],
       [0.52376292, 0.47623708],
       [0.48999639, 0.51000361],
       [0.50233647, 0.49766353],
       [0.49593504, 0.50406496],
       [0.52730225, 0.47269775],
       [0.50492683, 0.49507317],
       [0.48359366, 0.51640634],
       [0.5307984 , 0.4692016 ],
       [0.50240277, 0.49759723],
       [0.53524737, 0.46475263],
       [0.53365579, 0.46634421],
       [0.5189308 , 0.4810692 ],
       [0.51912418, 0.48087582],
       [0.51087934, 0.48912066],
       [0.52180918, 0.47819082],
       [0.50384004, 0.49615996],
       [0.5116353 , 0.4883647 ],
       [0.50094835, 0.49905165],
       [0.48028201, 0.51971799],
       [0.32451058, 0.67548942],
       [0.4842583 , 0.5157417 ],
       [0.50917958, 0.49082042],
       [0.44140635, 0.55859365],
       [0.50132361, 0.49867639],
       [0.48809309, 0.51190691],
       [0.

In [32]:
print("Metrica AUC del set de Test usando BOOS")
print(roc_auc_score(y_test, y_test_pred[:, 1]))


Metrica AUC del set de Test usando BOOS
0.8114814814814816


### Voting


In [37]:
from sklearn.ensemble import VotingClassifier

In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [39]:
KNN = KNeighborsClassifier(n_neighbors=2)


In [40]:
LR = LogisticRegression(random_state=1)

In [41]:
VOTI = VotingClassifier(estimators=[("DTC", DTC), ("LR", LR), ("KNN", KNN)], voting='soft')


In [42]:
VOTI.fit(X_train, y_train)


VotingClassifier(estimators=[('DTC', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_lea...owski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=2,
           weights='uniform'))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [48]:
y_test_pred = VOTI.predict_proba(X_test)
y_test_pred

array([[0.92268162, 0.07731838],
       [0.4457285 , 0.5542715 ],
       [0.72332237, 0.27667763],
       [0.96868244, 0.03131756],
       [0.75528871, 0.24471129],
       [0.89695324, 0.10304676],
       [0.20718982, 0.79281018],
       [0.68461796, 0.31538204],
       [0.2102767 , 0.7897233 ],
       [0.94261242, 0.05738758],
       [0.91782592, 0.08217408],
       [0.35910359, 0.64089641],
       [0.96705603, 0.03294397],
       [0.87586104, 0.12413896],
       [0.97458625, 0.02541375],
       [0.94955396, 0.05044604],
       [0.93565161, 0.06434839],
       [0.92319284, 0.07680716],
       [0.72427594, 0.27572406],
       [0.79183112, 0.20816888],
       [0.72740927, 0.27259073],
       [0.92160558, 0.07839442],
       [0.37364527, 0.62635473],
       [0.08427559, 0.91572441],
       [0.63237194, 0.36762806],
       [0.5995832 , 0.4004168 ],
       [0.37531186, 0.62468814],
       [0.55369188, 0.44630812],
       [0.48240328, 0.51759672],
       [0.25887657, 0.74112343],
       [0.

In [52]:
print("Metrica AUC del set de Test usando Voting") 
print(roc_auc_score(y_test, y_test_pred[:, 1]))

Metrica AUC del set de Test usando Voting
0.7870781893004114
