In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, BaggingRegressor, BaggingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, r2_score, mean_absolute_error, mean_squared_error, classification_report, plot_confusion_matrix, accuracy_score, max_error
from scipy.stats import reciprocal, randint
from sklearn.linear_model import LinearRegression,  Ridge, Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('winequality-red.csv', sep=';')
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
X = df.drop(columns='quality')
y = df.quality

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Modelos Simples

## Regresión Lineal

In [6]:
lr = Pipeline([
                ('scale', StandardScaler()),
                ('linear', LinearRegression())
              ])
lr.fit(X_train, y_train)

r2_train = r2_score(y_train, lr.predict(X_train))
r2_val = r2_score(y_val, lr.predict(X_val))

mae_train = mean_absolute_error(y_train, lr.predict(X_train))
mae_val = mean_absolute_error(y_val, lr.predict(X_val))

mse_train = mean_squared_error(y_train, lr.predict(X_train))
mse_val = mean_squared_error(y_val, lr.predict(X_val))

max_error_train = max_error(y_train, lr.predict(X_train))
max_error_val = max_error(y_val, lr.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))

R2 train 0.3609234335849555
R2 val 0.28359035774129715
MAE train 0.5010333783031675
MAE val 0.500497880170689
MSE train 0.41774665284129203
MSE val 0.45468924863315785
Max error train 2.721971891101168
Max error val 2.6711328589836043


## Regresión Polinómica

In [7]:
%%time
poly_pipe = Pipeline([
                        ('poly', PolynomialFeatures()),
                        ('scale', StandardScaler()),
                        ('lr', LinearRegression())
                     ])
grid = {'poly__degree':range(2,10)}

poly_reg = GridSearchCV(poly_pipe, param_grid=grid)
poly_reg.fit(X_train, y_train)

r2_train = r2_score(y_train, poly_reg.predict(X_train))
r2_val = r2_score(y_val, poly_reg.predict(X_val))

mae_train = mean_absolute_error(y_train, poly_reg.predict(X_train))
mae_val = mean_absolute_error(y_val, poly_reg.predict(X_val))

mse_train = mean_squared_error(y_train, poly_reg.predict(X_train))
mse_val = mean_squared_error(y_val, poly_reg.predict(X_val))

max_error_train = max_error(y_train, poly_reg.predict(X_train))
max_error_val = max_error(y_val, poly_reg.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))

R2 train 0.44449627110204504
R2 val 0.30213480806067083
MAE train 0.46652546111995125
MAE val 0.4989964673435877
MSE train 0.36311740342747495
MSE val 0.442919498919228
Max error train 2.2656921213732266
Max error val 1.853955968743798
Wall time: 6min 16s


In [9]:
poly_reg.best_params_

{'poly__degree': 2}

## Árbol de Decisión

In [12]:
%%time
tree = DecisionTreeRegressor()
dists = {'max_depth':range(2, 100),'min_samples_leaf': range(10, 1000)}

tree_reg = RandomizedSearchCV(tree, param_distributions=dists, n_iter=100)
tree_reg.fit(X_train, y_train)

r2_train = r2_score(y_train, tree_reg.predict(X_train))
r2_val = r2_score(y_val, tree_reg.predict(X_val))

mae_train = mean_absolute_error(y_train, tree_reg.predict(X_train))
mae_val = mean_absolute_error(y_val, tree_reg.predict(X_val))

mse_train = mean_squared_error(y_train, tree_reg.predict(X_train))
mse_val = mean_squared_error(y_val, tree_reg.predict(X_val))

max_error_train = max_error(y_train, tree_reg.predict(X_train))
max_error_val = max_error(y_val, tree_reg.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))

R2 train 0.434407585899638
R2 val 0.2419654340451889
MAE train 0.46172046754692075
MAE val 0.5232387175845787
MSE train 0.36971209754764384
MSE val 0.48110766089813606
Max error train 2.325581395348837
Max error val 2.475409836065574
Wall time: 2.77 s


## Máquina de Soporte Vectorial (SVM)

In [18]:
%%time
svm_pipe = Pipeline([
                        ('scale', StandardScaler()),
                        ('svr', SVR(kernel='rbf'))
                     ])
dists = {'svr__gamma':reciprocal(0.01, 100),'svr__C': reciprocal(1e-4, 1e4)}

svm_reg = RandomizedSearchCV(svm_pipe, param_distributions = dists, n_iter=100)
svm_reg.fit(X_train, y_train)

r2_train = r2_score(y_train, svm_reg.predict(X_train))
r2_val = r2_score(y_val, svm_reg.predict(X_val))

mae_train = mean_absolute_error(y_train, svm_reg.predict(X_train))
mae_val = mean_absolute_error(y_val, svm_reg.predict(X_val))

mse_train = mean_squared_error(y_train, svm_reg.predict(X_train))
mse_val = mean_squared_error(y_val, svm_reg.predict(X_val))

max_error_train = max_error(y_train, svm_reg.predict(X_train))
max_error_val = max_error(y_val, svm_reg.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))

R2 train 0.4771848361116148
R2 val 0.32831151039559714
MAE train 0.4184256112018784
MAE val 0.4680616597513442
MSE train 0.34174979375977166
MSE val 0.42630572878788814
Max error train 2.6644215678234575
Max error val 2.571421443520342
Wall time: 5min 32s


In [25]:
svm_reg.best_params_

{'svr__C': 0.7736260260189296, 'svr__gamma': 0.059420843195289834}

# Modelos de Ensamble

## Bagging de regresiones lineales

In [22]:
%%time
bagg_linear_pipe = Pipeline([
                                ('scale', StandardScaler()),
                                ('bagg', BaggingRegressor(base_estimator=LinearRegression(), max_samples=2/3))
                            ])
grid = {'bagg__n_estimators':range(20, 100)}

bagg_linear = GridSearchCV(bagg_linear_pipe, param_grid=grid)
bagg_linear.fit(X_train, y_train)

r2_train = r2_score(y_train, bagg_linear.predict(X_train))
r2_val = r2_score(y_val, bagg_linear.predict(X_val))

mae_train = mean_absolute_error(y_train, bagg_linear.predict(X_train))
mae_val = mean_absolute_error(y_val, bagg_linear.predict(X_val))

mse_train = mean_squared_error(y_train, bagg_linear.predict(X_train))
mse_val = mean_squared_error(y_val, bagg_linear.predict(X_val))

max_error_train = max_error(y_train, bagg_linear.predict(X_train))
max_error_val = max_error(y_val, bagg_linear.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))


R2 train 0.3606915742284076
R2 val 0.2858326847696282
MAE train 0.5011581423741397
MAE val 0.5000372601532085
MSE train 0.41789821288154066
MSE val 0.45326609359508885
Max error train 2.7234534569444167
Max error val 2.6925711827852385
Wall time: 1min 9s


## Bagging de regresion polinomial de grado 2

In [24]:
%%time
bagg_poly_pipe = Pipeline([
                                ('scale', StandardScaler()),
                                ('poly', PolynomialFeatures(degree=2)),
                                ('bagg', BaggingRegressor(base_estimator=LinearRegression(), max_samples=2/3))
                            ])
grid = {'bagg__n_estimators':range(20, 100)}

bagg_poly = GridSearchCV(bagg_poly_pipe, param_grid=grid)
bagg_poly.fit(X_train, y_train)

r2_train = r2_score(y_train, bagg_poly.predict(X_train))
r2_val = r2_score(y_val, bagg_poly.predict(X_val))

mae_train = mean_absolute_error(y_train, bagg_poly.predict(X_train))
mae_val = mean_absolute_error(y_val, bagg_poly.predict(X_val))

mse_train = mean_squared_error(y_train, bagg_poly.predict(X_train))
mse_val = mean_squared_error(y_val, bagg_poly.predict(X_val))

max_error_train = max_error(y_train, bagg_poly.predict(X_train))
max_error_val = max_error(y_val, bagg_poly.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))


R2 train 0.41761651197838634
R2 val 0.3138858311149133
MAE train 0.47545058577212695
MAE val 0.4991204835147734
MSE train 0.3806879575569719
MSE val 0.43546138623057296
Max error train 2.641899458263471
Max error val 1.8416436246000458
Wall time: 2min 51s


## Random Forest Regressor

In [19]:
%%time
forest = RandomForestRegressor()
dists = {'n_estimators':range(20, 100), 'max_depth':(2, 50), 'min_samples_leaf':range(2, 50)}

forest_reg = RandomizedSearchCV(forest, param_distributions=dists, n_iter=100)
forest_reg.fit(X_train, y_train)

r2_train = r2_score(y_train, forest_reg.predict(X_train))
r2_val = r2_score(y_val, forest_reg.predict(X_val))

mae_train = mean_absolute_error(y_train, forest_reg.predict(X_train))
mae_val = mean_absolute_error(y_val, forest_reg.predict(X_val))

mse_train = mean_squared_error(y_train, forest_reg.predict(X_train))
mse_val = mean_squared_error(y_val, forest_reg.predict(X_val))

max_error_train = max_error(y_train, forest_reg.predict(X_train))
max_error_val = max_error(y_val, forest_reg.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))

R2 train 0.812975611399041
R2 val 0.4180879637985334
MAE train 0.2498321952487378
MAE val 0.4421995768368535
MSE train 0.12225266336396916
MSE val 0.3693266127418898
Max error train 1.7340668590668589
Max error val 2.1903061224489786
Wall time: 1min 32s


## Bagging de SVM

In [26]:
%%time
bagg_svm_pipe = Pipeline([
                                ('scale', StandardScaler()),
                                ('bagg', BaggingRegressor(base_estimator=SVR(kernel='rbf', C=0.7, gamma=0.05), max_samples=2/3))
                            ])
grid = {'bagg__n_estimators':range(20, 100)}

bagg_svm = GridSearchCV(bagg_svm_pipe, param_grid=grid)
bagg_svm.fit(X_train, y_train)

r2_train = r2_score(y_train, bagg_svm.predict(X_train))
r2_val = r2_score(y_val, bagg_svm.predict(X_val))

mae_train = mean_absolute_error(y_train, bagg_svm.predict(X_train))
mae_val = mean_absolute_error(y_val, bagg_svm.predict(X_val))

mse_train = mean_squared_error(y_train, bagg_svm.predict(X_train))
mse_val = mean_squared_error(y_val, bagg_svm.predict(X_val))

max_error_train = max_error(y_train, bagg_svm.predict(X_train))
max_error_val = max_error(y_val, bagg_svm.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))


R2 train 0.44468592456319855
R2 val 0.32882700992181846
MAE train 0.4476415506728212
MAE val 0.47807355702484317
MSE train 0.3629934322121931
MSE val 0.42597855271651464
Max error train 2.537696596092637
Max error val 2.4317599275303152
Wall time: 5min 58s


## Gradient Boosting

In [28]:
%%time
boost_est = GradientBoostingRegressor()
dists = {'n_estimators':range(20, 100), 'max_depth':(2, 50), 'min_samples_leaf':range(2, 50)}

boost = RandomizedSearchCV(boost_est, param_distributions=dists, n_iter=100)
boost.fit(X_train, y_train)

r2_train = r2_score(y_train, boost.predict(X_train))
r2_val = r2_score(y_val, boost.predict(X_val))

mae_train = mean_absolute_error(y_train, boost.predict(X_train))
mae_val = mean_absolute_error(y_val, boost.predict(X_val))

mse_train = mean_squared_error(y_train, boost.predict(X_train))
mse_val = mean_squared_error(y_val, boost.predict(X_val))

max_error_train = max_error(y_train, boost.predict(X_train))
max_error_val = max_error(y_val, boost.predict(X_val))

print('R2 train {}'.format(r2_train))
print('R2 val {}'.format(r2_val))

print('MAE train {}'.format(mae_train))
print('MAE val {}'.format(mae_val))

print('MSE train {}'.format(mse_train))
print('MSE val {}'.format(mse_val))

print('Max error train {}'.format(max_error_train))
print('Max error val {}'.format(max_error_val))

R2 train 0.6021198835018473
R2 val 0.3476683543849415
MAE train 0.38866206607638665
MAE val 0.48606012017992767
MSE train 0.2600832132393672
MSE val 0.4140203709000808
Max error train 2.064441684451955
Max error val 1.9618633555423521
Wall time: 1min 51s
