# gboost.GradientBoostingClassifier and sklearn.ensemble.GradientBoostingClassifier comparison

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import gboost
import time

## Helper functions

In [2]:
def compare(y1, y2, tolerance=0):
    """
    Fraction of predictions that land within `tolerance` of the reference.

    params:
    y1: 1-D array-like of predictions; every value is rounded to the nearest
        integer (ties go to the even integer) before it is compared
    y2: 1-D array-like of reference values, same length as y1
    tolerance: largest absolute difference that still counts as a hit

    returns:
    float between 0 and 1: number of hits divided by the number of samples

    raises:
    ValueError: if the inputs are not 1-D, differ in length, or are empty
    """
    # Positional arrays: avoids label-based lookups when a pandas Series with
    # a shuffled index is passed, and lets plain lists be used as well.
    predicted = np.asarray(y1, dtype=float)
    expected = np.asarray(y2, dtype=float)

    if predicted.ndim != 1 or predicted.shape != expected.shape:
        raise ValueError(
            'y1 and y2 must be 1-D and of equal length, got shapes {} and {}'.format(
                predicted.shape, expected.shape))

    n_samples = predicted.shape[0]
    if n_samples == 0:
        # A score over zero samples is undefined; say so explicitly instead of
        # failing later with a ZeroDivisionError.
        raise ValueError('cannot compute a score for empty input')

    # np.round uses the same round-half-to-even rule as the builtin round().
    within_tolerance = np.abs(np.round(predicted) - expected) <= tolerance
    return int(np.count_nonzero(within_tolerance)) / n_samples

In [3]:
def load_data(test_size=0.3, wine_type='all', verbosity=0, random_state=None):
    """
    Load the wine-quality CSV file(s) and split them into train and test sets.

    params:
    test_size: forwarded to train_test_split as `test_size`
    wine_type: all / white / red. With 'all', the white rows are stacked on
        top of the red rows and a leading 'color' column is added
        (0 = white, 1 = red)
    verbosity: when > 0, print the feature matrix shape, the feature column
        names and the number of rows per quality value
    random_state: forwarded to train_test_split as `random_state`; pass an
        int to get the same split on every run (the default None keeps the
        split unseeded)

    returns:
    the result of train_test_split(X_all, y_all, ...), where X_all holds every
    column except 'quality' and y_all is the 'quality' column

    raises:
    ValueError: if wine_type is not one of all / white / red
    """
    if wine_type not in ('all', 'white', 'red'):
        # Validate before touching the disk. Raising a bare string is itself
        # a TypeError in Python 3, so a real exception class is required.
        raise ValueError('Wrong wine_type. Possible types: all, white, red.')

    frames = []

    if wine_type in ('all', 'white'):
        data_white = pd.read_csv("data/winequality-white.csv", delimiter=';')
        if wine_type == 'all':
            data_white.insert(0, 'color', 0)
        frames.append(data_white)

    if wine_type in ('all', 'red'):
        data_red = pd.read_csv("data/winequality-red.csv", delimiter=';')
        if wine_type == 'all':
            data_red.insert(0, 'color', 1)
        frames.append(data_red)

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames
    # in the same order and keeps each frame's own index labels.
    data = pd.concat(frames) if len(frames) > 1 else frames[0]

    y_all = data["quality"]
    # Non-mutating drop: the loaded frame itself is left intact.
    X_all = data.drop(columns="quality")

    if verbosity > 0:
        print('Wine data size:', X_all.shape)
        print('Wine data columns:', X_all.columns.values)
        print("Qualities occurencies:")
        print(y_all.value_counts())

    return train_test_split(X_all, y_all, test_size=test_size, random_state=random_state)

## Parameters

In [4]:
# Hyper-parameters handed, under the same keyword names, to both classifiers
# below so that the two implementations are compared on equal settings.
max_depth = 6
learning_rate = 0.05
n_estimators = 40

## Comparison

In [5]:
# Load both wine colours and split them; verbosity=1 prints a short summary
# of the loaded data before the split is returned.
X_train, X_test, y_train, y_test = load_data(
    wine_type='all',
    verbosity=1,
    test_size=0.3,
)

Wine data size: (6497, 12)
Wine data columns: ['color' 'fixed acidity' 'volatile acidity' 'citric acid' 'residual sugar'
 'chlorides' 'free sulfur dioxide' 'total sulfur dioxide' 'density' 'pH'
 'sulphates' 'alcohol']
Qualities occurencies:
6    2836
5    2138
7    1079
4     216
8     193
3      30
9       5
Name: quality, dtype: int64


In [6]:
# Time one full fit + predict cycle of scikit-learn's implementation.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments as time.time() can.
start_time = time.perf_counter()
gb_sklear = GradientBoostingClassifier(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
)
gb_sklear.fit(X_train, y_train)
y_sklearn = gb_sklear.predict(X_test)
print('Sklearn time: {:.2f} seconds'.format(time.perf_counter() - start_time))

Sklearn time: 6.45 seconds


In [7]:
# Time one full fit + predict cycle of the gboost implementation with the
# same hyper-parameters. perf_counter() is a monotonic, high-resolution clock,
# so the measured interval cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
gb_gboost = gboost.GradientBoostingClassifier(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    verbosity=0,
)
# The pandas objects are converted with np.array before being handed over
# (presumably gboost expects plain ndarrays -- confirm against its API).
gb_gboost.fit(np.array(X_train), np.array(y_train))
y_gboost = gb_gboost.predict(np.array(X_test))
print('gboost time: {:.2f} seconds'.format(time.perf_counter() - start_time))

gboost time: 302.28 seconds


In [8]:
# Exact-match score (default tolerance of 0): share of test rows whose
# rounded prediction equals the true quality, shown as a percentage.
print('Sklearn accuracy: {:.2f} %'.format(
    100 * compare(y_sklearn, np.array(y_test))))
print('gboost accuracy: {:.2f} %'.format(
    100 * compare(y_gboost, np.array(y_test))))

Sklearn accuracy: 61.85 %
gboost accuracy: 49.74 %


In [9]:
# Relaxed score: a prediction also counts as a hit when it is off by at most
# one quality grade (tolerance=1).
print('Sklearn accuracy with tolerance of 1: {:.2f} %'.format(
    100 * compare(y_sklearn, np.array(y_test), tolerance=1)))
print('gboost accuracy with tolerance of 1: {:.2f} %'.format(
    100 * compare(y_gboost, np.array(y_test), tolerance=1)))

Sklearn accuracy with tolerance of 1: 95.54 %
gboost accuracy with tolerance of 1: 93.03 %
