# eXtreme Gradient Boosting (XGBoost)

In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import GridSearchCV

diabetes = pd.read_csv('diabetes.csv')
df = diabetes.copy()
df = df.dropna()
y = df['Outcome']
X = df.drop('Outcome', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.3,
                                                   random_state=238)

In [2]:
from xgboost import XGBClassifier

In [3]:
xgb_model = XGBClassifier().fit(X_train, y_train)

In [4]:
y_train_pred = xgb_model.predict(X_train)
acc_train = accuracy_score(y_train_pred, y_train)
acc_train

1.0

In [5]:
y_test_pred = xgb_model.predict(X_test)
acc_test = accuracy_score(y_test_pred, y_test)
acc_test

0.7445887445887446

## Model Tuning

In [6]:
xgb_params = {
    'n_estimators': [100, 500, 1000, 2000],
    'subsample': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.1, 0.01, 0.02, 0.05],
    'min_samples_split': [2, 5, 10]
}

In [7]:
xgb = XGBClassifier()
xgb_cv = GridSearchCV(xgb, xgb_params, cv=10, n_jobs=-1, verbose=2)
xgb_cv.fit(X_train, y_train)

Fitting 10 folds for each of 576 candidates, totalling 5760 fits


Parameters: { "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [8]:
xgb_cv.best_params_

{'learning_rate': 0.02,
 'max_depth': 4,
 'min_samples_split': 2,
 'n_estimators': 100,
 'subsample': 0.8}

In [20]:
xgb_tuned = XGBClassifier(learning_rate=0.02,
                          max_depth=4,
                          n_estimators=100,
                          subsample=0.8)

In [21]:
xgb_tuned = xgb.fit(X_train, y_train)

In [22]:
y_train_pred = xgb_tuned.predict(X_train)
acc_train = accuracy_score(y_train_pred, y_train)
acc_train

1.0

In [23]:
y_test_pred = xgb_tuned.predict(X_test)
acc_test = accuracy_score(y_test_pred, y_test)
acc_test

0.7445887445887446