# Libs:

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_diabetes

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Lasso, Ridge

from sklearn.metrics import mean_squared_error as MSE

from xgboost import XGBRegressor, XGBClassifier, XGBRFRegressor, XGBRFClassifier

In [2]:
X, y = load_diabetes(return_X_y=True)

In [10]:
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=2
)

# regression_model

In [4]:
def regression_model(model):
    # Obtain scores of cross-validation using 10 splits and mean squared error
    scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kfold)

    # Take square root of the scores
    rmse = (-scores)**0.5

    # Return mean score
    return rmse.mean()

In [5]:
regression_model(XGBRegressor(
    booster='gblinear',
    objective='reg:squarederror'
))

55.497534109140396

In [6]:
regression_model(LinearRegression())

55.50936875436023

In [7]:
regression_model(Lasso())

62.64904114426351

In [8]:
regression_model(Ridge())

58.835292374356676

In [9]:
regression_model(XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror'
))

65.9125519300286

# grid_search

In [11]:
def grid_search(
    params,
    reg=XGBRegressor(booster='gblinear',
                     objective='reg:squarederror')):

    # Instantiate GridSearchCV as grid_reg
    grid_reg = GridSearchCV(reg,
                            params,
                            scoring='neg_mean_squared_error',
                            cv=kfold)
    
    # Fit grid_reg on X_train and y_train
    grid_reg.fit(X, y)

    # Extract best params
    best_params = grid_reg.best_params_

    # Print best params
    print("Best params:", best_params)
    
    # Compute best score
    best_score = np.sqrt(-grid_reg.best_score_)

    # Print best score
    print("Best score:", best_score)

In [12]:
grid_search(
    params={
        'reg_alpha':[0.001, 0.01, 0.1, 0.5, 1, 5]
    })

Best params: {'reg_alpha': 0.01}
Best score: 55.485563465849076


In [13]:
grid_search(
    params={
        'reg_lambda':[0.001, 0.01, 0.1, 0.5, 1, 5]
    })

Best params: {'reg_lambda': 0.001}
Best score: 56.171699754180274


In [14]:
grid_search(
    params={
        'feature_selector':['shuffle']
    })

Best params: {'feature_selector': 'shuffle'}
Best score: 55.51900448347787


In [15]:
grid_search(
    params={
        'feature_selector':['random', 'greedy', 'thrifty'],
        'updater':['coord_descent']
    })

Best params: {'feature_selector': 'thrifty', 'updater': 'coord_descent'}
Best score: 55.488143951136536


In [17]:
grid_search(
    params={
        'feature_selector':['greedy', 'thrifty'],
        'updater':['coord_descent'], 
        'top_k':[3, 5, 7, 9]
    })

Best params: {'feature_selector': 'thrifty', 'top_k': 3, 'updater': 'coord_descent'}
Best score: 55.47871836076556


In [26]:
X = np.arange(1,100)
np.random.seed(2) 
y = []
for i in X:
    y.append(i*np.random.uniform(-0.2, 0.2))
y = np.array(y)
X = X.reshape(X.shape[0], 1)
y = y.reshape(y.shape[0], 1)

# regression_model

In [19]:
regression_model(
    XGBRegressor(booster='gblinear',
                 objective='reg:squarederror'
                ))

6.214946302686011

In [20]:
regression_model(
    XGBRegressor(
        booster='gbtree',
        objective='reg:squarederror'
    ))

9.372359516507444

In [21]:
regression_model(LinearRegression())

6.214962315808842

In [22]:
X, y = load_diabetes(return_X_y=True)

In [25]:
regression_model(
    XGBRegressor(
        booster='dart',
        objective='reg:squarederror'
    ))

65.91255196051148

# Classification model

In [27]:
df = pd.read_csv('/home/antonius/Projects/DS_Projects/learn_XGBoost/data/census_cleaned.csv')
df.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
X_census = df.iloc[:, :-1]
y_census = df.iloc[:, -1]

In [31]:
def classification_model(model):
    # Obtain scores of cross-validation using 10 splits and mean squared error
    scores = cross_val_score(
        model,
        X_census,
        y_census,
        scoring='accuracy',
        cv=kfold)

    # Return mean score
    return scores.mean()

In [32]:
classification_model(XGBClassifier(booster='gbtree'))

0.8701208195968675

In [33]:
classification_model(XGBClassifier(booster='dart'))

0.8701208195968675

In [34]:
classification_model(XGBClassifier(booster='gblinear'))

0.8503425628425628

In [35]:
classification_model(LogisticRegression(max_iter=1000))

0.7968120977851517

In [36]:
classification_model(XGBClassifier(booster='dart', one_drop=1))

0.8726699407837133

In [37]:
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', sample_type='weighted'))

9.372359645069041

In [38]:
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', normalize_type='forest'))

9.372359645069041

In [39]:
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=1))

9.189010699761651

In [40]:
grid_search(params={'rate_drop':[0.01, 0.1, 0.2, 0.4]}, 
            reg=XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=1))

Best params: {'rate_drop': 0.4}
Best score: 7.59319045349027


In [41]:
grid_search(params={'skip_drop': [0.01, 0.1, 0.2, 0.4]}, 
            reg=XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=1))

Best params: {'skip_drop': 0.01}
Best score: 9.371246409405156


In [42]:
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=25))

9.372927761679424

In [43]:
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=5))

9.372359092157916

In [44]:
regression_model(XGBRFRegressor(objective='reg:squarederror'))

7.618378917775955

In [45]:
regression_model(RandomForestRegressor())

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


8.036676464631038

In [46]:
classification_model(XGBRFClassifier())

0.8552564498672283

In [47]:
classification_model(RandomForestClassifier())

0.8555328202034789