In [30]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

TEST_SIZE = 0.3
RANDOM_SEED = 42


In [14]:
data = pd.read_csv("./winequality-red.csv", sep=";")
data["good"] = data.quality >= 6
data.drop(["quality"], axis=1, inplace=True)

In [21]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1:]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

In [27]:
model = LogisticRegression()
model.fit(X_train, y_train.values.ravel())
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
f1_train = f1_score(y_train, y_pred_train)
f1_test = f1_score(y_test, y_pred_test)
print('[train] F1-score = {:.3f}'.format(f1_train))
print('[test] F1-score = {:.3f}'.format(f1_test))

[train] F1-score = 0.766
[test] F1-score = 0.754


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
model = DecisionTreeClassifier(max_depth=10, random_state=RANDOM_SEED)
model.fit(X_train, y_train.values.ravel())
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
f1_train = f1_score(y_train, y_pred_train)
f1_test = f1_score(y_test, y_pred_test)
print('[train] F1-score = {:.3f}'.format(f1_train))
print('[test] F1-score = {:.3f}'.format(f1_test))

[train] F1-score = 0.943
[test] F1-score = 0.793


In [31]:
bagg = BaggingClassifier(random_state=RANDOM_SEED, base_estimator=model, n_estimators=1500)

In [32]:
bagg.fit(X_train, y_train.values.ravel())

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=10,
                                                        random_state=42),
                  n_estimators=1500, random_state=42)

In [33]:
y_bagg_pred_train = bagg.predict(X_train)
y_bagg_pred_test = bagg.predict(X_test)

In [34]:
f1_train = f1_score(y_train, y_bagg_pred_train)
f1_test = f1_score(y_test, y_bagg_pred_test)
print('[train] F1-score = {:.3f}'.format(f1_train))
print('[test] F1-score = {:.3f}'.format(f1_test))

[train] F1-score = 0.990
[test] F1-score = 0.818
