In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, BaggingRegressor, BaggingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, r2_score, mean_absolute_error, mean_squared_error, classification_report, plot_confusion_matrix, accuracy_score, max_error
from scipy.stats import reciprocal, randint
from sklearn.linear_model import LinearRegression,  Ridge, Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('winequality-white.csv', sep=';')
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [3]:
X = df.drop(columns='quality')
y = df.quality

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Modelos Simples

## Regresión Lineal

In [5]:
lr = Pipeline([
                ('scale', StandardScaler()),
                ('linear', LinearRegression())
              ])
lr.fit(X_train, y_train)

r2_train = r2_score(y_train, lr.predict(X_train))
r2_val = r2_score(y_val, lr.predict(X_val))

mae_train = mean_absolute_error(y_train, lr.predict(X_train))
mae_val = mean_absolute_error(y_val, lr.predict(X_val))

mse_train = mean_squared_error(y_train, lr.predict(X_train))
mse_val = mean_squared_error(y_val, lr.predict(X_val))

max_error_train = max_error(y_train, lr.predict(X_train))
max_error_val = max_error(y_val, lr.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))

R2 train 0.2839219313754503
R2 val 0.2665242126890309
MAE train 0.5879082700331394
MAE val 0.5676484502513972
MSE train 0.5719423542728527
MSE val 0.5494317128322659
Max error train 3.625759124302122
Max error val 4.476680978256095


## Regresión Polinómica

In [6]:
%%time
poly_pipe = Pipeline([
                        ('poly', PolynomialFeatures()),
                        ('scale', StandardScaler()),
                        ('lr', LinearRegression())
                     ])
grid = {'poly__degree':range(2,10)}

poly_reg = GridSearchCV(poly_pipe, param_grid=grid)
poly_reg.fit(X_train, y_train)

r2_train = r2_score(y_train, poly_reg.predict(X_train))
r2_val = r2_score(y_val, poly_reg.predict(X_val))

mae_train = mean_absolute_error(y_train, poly_reg.predict(X_train))
mae_val = mean_absolute_error(y_val, poly_reg.predict(X_val))

mse_train = mean_squared_error(y_train, poly_reg.predict(X_train))
mse_val = mean_squared_error(y_val, poly_reg.predict(X_val))

max_error_train = max_error(y_train, poly_reg.predict(X_train))
max_error_val = max_error(y_val, poly_reg.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))

R2 train 0.368544412962975
R2 val 0.3464345646957122
MAE train 0.5532320725294384
MAE val 0.5410996429553447
MSE train 0.5043531018376458
MSE val 0.48957250229578253
Max error train 3.33063000599111
Max error val 3.9296411834906486
Wall time: 47min 5s


In [7]:
poly_reg.best_params_

{'poly__degree': 2}

## Árbol de Decisión

In [8]:
%%time
tree = DecisionTreeRegressor()
dists = {'max_depth':range(2, 100),'min_samples_leaf': range(10, 1000)}

tree_reg = RandomizedSearchCV(tree, param_distributions=dists, n_iter=100)
tree_reg.fit(X_train, y_train)

r2_train = r2_score(y_train, tree_reg.predict(X_train))
r2_val = r2_score(y_val, tree_reg.predict(X_val))

mae_train = mean_absolute_error(y_train, tree_reg.predict(X_train))
mae_val = mean_absolute_error(y_val, tree_reg.predict(X_val))

mse_train = mean_squared_error(y_train, tree_reg.predict(X_train))
mse_val = mean_squared_error(y_val, tree_reg.predict(X_val))

max_error_train = max_error(y_train, tree_reg.predict(X_train))
max_error_val = max_error(y_val, tree_reg.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))

R2 train 0.3807452022153829
R2 val 0.34542091079897586
MAE train 0.5471870968510665
MAE val 0.5498043903351268
MSE train 0.49460814743286596
MSE val 0.49033180970079554
Max error train 3.2921348314606744
Max error val 3.2295081967213113
Wall time: 4.39 s


## Máquina de Soporte Vectorial (SVM)

In [9]:
%%time
svm_pipe = Pipeline([
                        ('scale', StandardScaler()),
                        ('svr', SVR(kernel='rbf'))
                     ])
dists = {'svr__gamma':reciprocal(0.01, 100),'svr__C': reciprocal(1e-4, 1e4)}

svm_reg = RandomizedSearchCV(svm_pipe, param_distributions = dists, n_iter=100)
svm_reg.fit(X_train, y_train)

r2_train = r2_score(y_train, svm_reg.predict(X_train))
r2_val = r2_score(y_val, svm_reg.predict(X_val))

mae_train = mean_absolute_error(y_train, svm_reg.predict(X_train))
mae_val = mean_absolute_error(y_val, svm_reg.predict(X_val))

mse_train = mean_squared_error(y_train, svm_reg.predict(X_train))
mse_val = mean_squared_error(y_val, svm_reg.predict(X_val))

max_error_train = max_error(y_train, svm_reg.predict(X_train))
max_error_val = max_error(y_val, svm_reg.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))

R2 train 0.4274882733715929
R2 val 0.4020091111534779
MAE train 0.5027557536639248
MAE val 0.5093442163490883
MSE train 0.4572737514578883
MSE val 0.4479427460333294
Max error train 3.3579652236327284
Max error val 3.2834340515694027
Wall time: 1h 45min 48s


In [10]:
svm_reg.best_params_

{'svr__C': 1.5183114915037463, 'svr__gamma': 0.03912133133063292}

# Modelos de Ensamble

## Bagging de regresiones lineales

In [11]:
%%time
bagg_linear_pipe = Pipeline([
                                ('scale', StandardScaler()),
                                ('bagg', BaggingRegressor(base_estimator=LinearRegression(), max_samples=2/3))
                            ])
grid = {'bagg__n_estimators':range(20, 100)}

bagg_linear = GridSearchCV(bagg_linear_pipe, param_grid=grid)
bagg_linear.fit(X_train, y_train)

r2_train = r2_score(y_train, bagg_linear.predict(X_train))
r2_val = r2_score(y_val, bagg_linear.predict(X_val))

mae_train = mean_absolute_error(y_train, bagg_linear.predict(X_train))
mae_val = mean_absolute_error(y_val, bagg_linear.predict(X_val))

mse_train = mean_squared_error(y_train, bagg_linear.predict(X_train))
mse_val = mean_squared_error(y_val, bagg_linear.predict(X_val))

max_error_train = max_error(y_train, bagg_linear.predict(X_train))
max_error_val = max_error(y_val, bagg_linear.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))


R2 train 0.2838244655998806
R2 val 0.2682219229871431
MAE train 0.5878226900538344
MAE val 0.5670755066527102
MSE train 0.5720202016579115
MSE val 0.5481599927658075
Max error train 3.6262440534714004
Max error val 4.319635557576467
Wall time: 1min 42s


## Bagging de regresion polinomial de grado 2

In [12]:
%%time
bagg_poly_pipe = Pipeline([
                                ('scale', StandardScaler()),
                                ('poly', PolynomialFeatures(degree=2)),
                                ('bagg', BaggingRegressor(base_estimator=LinearRegression(), max_samples=2/3))
                            ])
grid = {'bagg__n_estimators':range(20, 100)}

bagg_poly = GridSearchCV(bagg_poly_pipe, param_grid=grid)
bagg_poly.fit(X_train, y_train)

r2_train = r2_score(y_train, bagg_poly.predict(X_train))
r2_val = r2_score(y_val, bagg_poly.predict(X_val))

mae_train = mean_absolute_error(y_train, bagg_poly.predict(X_train))
mae_val = mean_absolute_error(y_val, bagg_poly.predict(X_val))

mse_train = mean_squared_error(y_train, bagg_poly.predict(X_train))
mse_val = mean_squared_error(y_val, bagg_poly.predict(X_val))

max_error_train = max_error(y_train, bagg_poly.predict(X_train))
max_error_val = max_error(y_val, bagg_poly.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))


R2 train 0.3670417400954804
R2 val 0.3076143847412286
MAE train 0.5533754818683392
MAE val 0.5432010449739845
MSE train 0.5055533093222678
MSE val 0.5186519052342818
Max error train 3.3350948567657275
Max error val 6.655283978550139
Wall time: 8min 5s


## Random Forest Regressor

In [13]:
%%time
forest = RandomForestRegressor()
dists = {'n_estimators':range(20, 100), 'max_depth':(2, 50), 'min_samples_leaf':range(2, 50)}

forest_reg = RandomizedSearchCV(forest, param_distributions=dists, n_iter=100)
forest_reg.fit(X_train, y_train)

r2_train = r2_score(y_train, forest_reg.predict(X_train))
r2_val = r2_score(y_val, forest_reg.predict(X_val))

mae_train = mean_absolute_error(y_train, forest_reg.predict(X_train))
mae_val = mean_absolute_error(y_val, forest_reg.predict(X_val))

mse_train = mean_squared_error(y_train, forest_reg.predict(X_train))
mse_val = mean_squared_error(y_val, forest_reg.predict(X_val))

max_error_train = max_error(y_train, forest_reg.predict(X_train))
max_error_val = max_error(y_val, forest_reg.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))

R2 train 0.8757689977935806
R2 val 0.5192453335808086
MAE train 0.22542987816103469
MAE val 0.44354727149853823
MSE train 0.09922517528304525
MSE val 0.360123489271792
Max error train 1.922783251231527
Max error val 3.4637931034482756
Wall time: 3min 9s


## Bagging de SVM

In [14]:
%%time
bagg_svm_pipe = Pipeline([
                                ('scale', StandardScaler()),
                                ('bagg', BaggingRegressor(base_estimator=SVR(kernel='rbf', C=0.7, gamma=0.05), max_samples=2/3))
                            ])
grid = {'bagg__n_estimators':range(20, 100)}

bagg_svm = GridSearchCV(bagg_svm_pipe, param_grid=grid)
bagg_svm.fit(X_train, y_train)

r2_train = r2_score(y_train, bagg_svm.predict(X_train))
r2_val = r2_score(y_val, bagg_svm.predict(X_val))

mae_train = mean_absolute_error(y_train, bagg_svm.predict(X_train))
mae_val = mean_absolute_error(y_val, bagg_svm.predict(X_val))

mse_train = mean_squared_error(y_train, bagg_svm.predict(X_train))
mse_val = mean_squared_error(y_val, bagg_svm.predict(X_val))

max_error_train = max_error(y_train, bagg_svm.predict(X_train))
max_error_val = max_error(y_val, bagg_svm.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))


R2 train 0.40472932256784233
R2 val 0.39972148154206355
MAE train 0.5202903308619093
MAE val 0.5095550128078472
MSE train 0.47545166874626427
MSE val 0.4496563625936431
Max error train 3.221802398444485
Max error val 3.3039190707236905
Wall time: 42min 58s


## Gradient Boosting

In [16]:
%%time
boost_est = GradientBoostingRegressor()
dists = {'n_estimators':range(20, 100), 'max_depth':(2, 50), 'min_samples_leaf':range(2, 50)}

boost = RandomizedSearchCV(boost_est, param_distributions=dists, n_iter=100)
boost.fit(X_train, y_train)

r2_train = r2_score(y_train, boost.predict(X_train))
r2_val = r2_score(y_val, boost.predict(X_val))

mae_train = mean_absolute_error(y_train, boost.predict(X_train))
mae_val = mean_absolute_error(y_val, boost.predict(X_val))

mse_train = mean_squared_error(y_train, boost.predict(X_train))
mse_val = mean_squared_error(y_val, boost.predict(X_val))

max_error_train = max_error(y_train, boost.predict(X_train))
max_error_val = max_error(y_val, boost.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))

R2 train 0.9219422121863169
R2 val 0.5205215619906454
MAE train 0.1787870555851074
MAE val 0.43481535255513853
MSE train 0.062345932500408034
MSE val 0.35916749266861514
Max error train 1.62656336138454
Max error val 3.571265808073341
Wall time: 4min 51s
