# Boosting - XGboost


## Adaboost

In [3]:
# Descarga la base de datos
import pandas as pd
import numpy as np

data = pd.read_csv('https://raw.githubusercontent.com/albahnsen/PracticalMachineLearningClass/master/datasets/churn.csv')

# Crear X and y

# Seleccionar solo las variables numéricas
X = data.iloc[:, [1,2,6,7,8,9,10]].astype(np.float)

# Convertir bools a floats
X = X.join((data.iloc[:, [4,5]] == 'no').astype(np.float))

y = (data.iloc[:, -1] == 'True.').astype(np.int)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=40)
n_samples = X_train.shape[0]

In [4]:
n_estimators = 10
weights = pd.DataFrame(index=X_train.index, columns=list(range(n_estimators)))

In [5]:
t = 0
weights[t] = 1 / n_samples

In [6]:
weights.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2953,0.000448,,,,,,,,,
617,0.000448,,,,,,,,,
26,0.000448,,,,,,,,,
853,0.000448,,,,,,,,,
2510,0.000448,,,,,,,,,


Entrenar el clasificador

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
trees = []
trees.append(DecisionTreeClassifier(max_depth=1))
trees[t].fit(X_train, y_train, sample_weight=weights[t].values)

DecisionTreeClassifier(max_depth=1)

Estimar el error

In [8]:
y_pred_ = trees[t].predict(X_train)
error = []
error.append(1 - metrics.balanced_accuracy_score(y_pred_, y_train, weights[t].values))
error[t]



0.24114832535884934

In [9]:
alpha = []
alpha.append(np.log((1 - error[t]) / error[t])/2)
alpha[t]

0.5731970670617188

Actualizar los pesos

In [10]:
weights[t + 1] = weights[t]
filter_ = y_pred_ != y_train

In [11]:
filter_

2953    False
617     False
26      False
853     False
2510    False
        ...  
1330    False
3064    False
2213     True
2055    False
2267    False
Name: Churn?, Length: 2233, dtype: bool

In [12]:
weights.loc[filter_, t + 1] = weights.loc[filter_, t] * np.exp(alpha[t])

Normalizar los pesos

In [13]:
weights[t + 1] = weights[t + 1] / weights[t + 1].sum()

In [14]:
weights.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2953,0.000448,0.000406,,,,,,,,
617,0.000448,0.000406,,,,,,,,
26,0.000448,0.000406,,,,,,,,
853,0.000448,0.000406,,,,,,,,
2510,0.000448,0.000406,,,,,,,,


**Iteracióm 2 - n_estimators**

In [15]:
for t in range(1, n_estimators):
    # Train
    trees.append(DecisionTreeClassifier(max_depth=1))
    trees[t].fit(X_train, y_train, sample_weight=weights[t].values)
    y_pred_ = trees[t].predict(X_train)
    # Balanced Error
    error.append(1 - metrics.balanced_accuracy_score(y_pred_, y_train, weights[t].values))
    # Alpha
    alpha.append(np.log((1 - error[t]) / error[t]) / 2)
    # Update Weights
    weights[t + 1] = weights[t]
    filter_ = y_pred_ != y_train
    weights.loc[filter_, t + 1] = weights.loc[filter_, t] * np.exp(alpha[t])
    weights[t + 1] = weights[t + 1] / weights[t + 1].sum()



In [16]:
error

[0.24114832535884934,
 0.31085674971292176,
 0.26303609328491306,
 0.3509920019261563,
 0.3778663388376744,
 0.3908839674420268,
 0.436104330420925,
 0.4291168064851212,
 0.229599990460706,
 0.38855949425356906]

In [17]:
weights.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
2953,0.000448,0.000406,0.000369,0.000313,0.000281,0.000254,0.000231,0.000248,0.000267,0.000194,0.000221
617,0.000448,0.000406,0.000369,0.000313,0.000281,0.000254,0.000231,0.000248,0.000267,0.000194,0.000221
26,0.000448,0.000406,0.000369,0.000313,0.000281,0.000254,0.000231,0.000218,0.000204,0.000148,0.000169
853,0.000448,0.000406,0.000369,0.000313,0.000281,0.000254,0.000231,0.000248,0.000267,0.000194,0.000221
2510,0.000448,0.000406,0.000369,0.000313,0.000281,0.000254,0.000231,0.000248,0.000267,0.000194,0.000221


### Construir la clasifica classificación

Solo clasificadores cuando el error es <0.5

In [18]:
new_n_estimators = np.sum([x<0.5 for x in error])

In [19]:
y_pred_all = np.zeros((X_test.shape[0], new_n_estimators))
for t in range(new_n_estimators):
    y_pred_all[:, t] = trees[t].predict(X_test)

In [20]:
y_pred = (np.sum(y_pred_all * alpha[:new_n_estimators], axis=1) >= 1).astype(np.int)

In [21]:
metrics.f1_score(y_pred, y_test.values), metrics.accuracy_score(y_pred, y_test.values)

(0.48184818481848185, 0.8572727272727273)

In [22]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metrics.f1_score(y_pred, y_test.values), metrics.accuracy_score(y_pred, y_test.values)

(0.35031847133757965, 0.8145454545454546)

### Usando sklearn

In [23]:
from sklearn.ensemble import AdaBoostClassifier

In [25]:
clf = AdaBoostClassifier()
clf

AdaBoostClassifier()

In [26]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metrics.f1_score(y_pred, y_test.values), metrics.accuracy_score(y_pred, y_test.values)

(0.36771300448430494, 0.8718181818181818)

### Gradient Boosting


In [27]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier()
clf

GradientBoostingClassifier()

In [28]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metrics.f1_score(y_pred, y_test.values), metrics.accuracy_score(y_pred, y_test.values)

(0.4666666666666666, 0.8981818181818182)

# Mejoras al Gradient Boosting básico : XGBoost

 Nota: Instalar la librería, pip install xgboost

In [29]:
from xgboost import XGBClassifier

clf = XGBClassifier()
clf

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

In [30]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metrics.f1_score(y_pred, y_test.values), metrics.accuracy_score(y_pred, y_test.values)





(0.4298245614035088, 0.8818181818181818)