In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

import gboost

In [9]:
def compare(y1, y2):
    """Return the fraction of predictions that round to the matching label.

    Each value in ``y1`` is rounded to the nearest integer (ties go to the
    even neighbour, exactly like the built-in ``round`` on floats) and
    compared with the value at the same position in ``y2``.

    Parameters
    ----------
    y1 : array-like of float
        Continuous predictions.
    y2 : array-like
        Reference labels, compared positionally. Lists, numpy arrays and
        pandas Series are all accepted; a Series is compared by position,
        not by index label.

    Returns
    -------
    float
        Share of positions where the rounded prediction equals the label,
        in ``[0, 1]``. ``0.0`` is returned for empty input.

    Raises
    ------
    ValueError
        If ``y1`` and ``y2`` do not contain the same number of elements.
    """
    # np.asarray makes the comparison positional for any array-like input;
    # ravel lets (n, 1) column vectors line up with flat (n,) label arrays.
    predicted = np.asarray(y1).ravel()
    expected = np.asarray(y2).ravel()

    if predicted.shape != expected.shape:
        raise ValueError(
            "y1 and y2 must have the same number of elements, got {} and {}".format(
                predicted.shape[0], expected.shape[0]
            )
        )

    total = predicted.shape[0]
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    # np.rint rounds half to even, matching round() on a Python/numpy float.
    hits = np.count_nonzero(np.rint(predicted) == expected)
    return int(hits) / total

In [13]:
# Load both wine-quality CSVs (semicolon-delimited) and tag every row with a
# `color` feature inserted as the first column: 0 for white, 1 for red.
data_white = pd.read_csv("data/winequality-white.csv", delimiter=';')
data_white.insert(0, 'color', 0)
data_red = pd.read_csv("data/winequality-red.csv", delimiter=';')
data_red.insert(0, 'color', 1)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. Like append, it stacks white rows first, then red, and keeps
# each frame's original row labels.
wine_all = pd.concat([data_white, data_red])

# Split target from features without mutating the combined frame, so the
# cell can be re-run safely.
y_all = wine_all["quality"]
X_all = wine_all.drop(columns="quality")

print("Qualities occurencies")
print(y_all.value_counts())

# Hold out 30% of the rows for validation; the seed pins the split.
state = 2
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=test_size,
    random_state=state,
)

# Reference model: scikit-learn's gradient boosting classifier.
gb_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.47,
    max_depth=3,
    random_state=0,
)
gb_clf.fit(X_train, y_train)

sk_train_accuracy = gb_clf.score(X_train, y_train)
sk_test_accuracy = gb_clf.score(X_test, y_test)
print("Scikit-learn accuracy score (training): {0:.3f}".format(sk_train_accuracy))
print("Scikit-learn accuracy score (validation): {0:.3f}".format(sk_test_accuracy))

# Custom implementation with the same hyperparameters. It is fed plain numpy
# arrays; a fresh copy is made for every call so nothing is shared between them.
gb = gboost.GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.47,
    max_depth=3,
)
gb.fit(np.array(X_train), np.array(y_train))
yy_train = gb.predict(np.array(X_train))
yy_test = gb.predict(np.array(X_test))

# compare() rounds each prediction before matching it against the label.
our_train_accuracy = compare(yy_train, np.array(y_train))
our_test_accuracy = compare(yy_test, np.array(y_test))
print("Our accuracy score (training): {0:.3f}".format(our_train_accuracy))
print("Our accuracy score (validation): {0:.3f}".format(our_test_accuracy))

Qualities occurencies
6    2836
5    2138
7    1079
4     216
8     193
3      30
9       5
Name: quality, dtype: int64
Scikit-learn accuracy score (training): 0.877
Scikit-learn accuracy score (validation): 0.616
Our accuracy score (training): 0.751
Our accuracy score (validation): 0.579


In [7]:
# Inspect the custom model's raw training predictions. They are floats rather
# than class labels, which is why compare() rounds them before scoring.
yy_train

array([5.35851358, 5.75556905, 5.81052265, ..., 5.45903262, 6.50708069,
       6.00803387])

In [8]:
# Ground-truth training labels, shown for comparison with yy_train.
y_train

4182    6
1912    6
1993    6
2085    6
3399    6
       ..
2514    7
1545    6
3606    5
806     7
2575    6
Name: quality, Length: 4547, dtype: int64

In [None]:
# X_test

In [None]:
# Separate instance of the custom booster (30 estimators) for a toy example.
# NOTE(review): `verbosity=1` presumably enables progress output — confirm in gboost.
grad = gboost.GradientBoostingClassifier(n_estimators=30, verbosity=1)

In [None]:
# Toy 2-D dataset: four points carrying label 1 followed by three carrying label 2.
points_label_1 = [[0., 0.], [1., 0.], [0.5, 0.5], [-0.25, 0.25]]
points_label_2 = [[2., 2.], [2.25, 1.75], [2.5, 2.]]

X = np.array(points_label_1 + points_label_2)
y = np.array([1] * len(points_label_1) + [2] * len(points_label_2))
grad.fit(X, y)

In [None]:
# Predictions on the points the model was fitted on ...
print(grad.predict(X))
# ... and on a single point that is not part of X.
print(grad.predict(np.array([[0.75, 1.75]])))