1. Utwórz zbiór danych do klasyfikacji (lb_wierszy >=1000, lb_cech >= 10)

In [60]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split

# fetch dataset
wine_quality = fetch_ucirepo(id=186)

# data (as pandas dataframes)
X = wine_quality.data.features
Y = wine_quality.data.targets.values.ravel()

print('Liczba wierszy:',len(wine_quality.data.features))
print('Liczba cech:', len(wine_quality.data.features.columns))


Liczba wierszy: 6497
Liczba cech: 11


2. Podziel zbiór na zbiór testowy i treningowy (40%, 60%)

In [61]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

3. Trenuj algorytmy :
* GaussianNB
* SVC (możliwe wartości C [1e-02, 1e-01, 1e00, 1e01, 1e02], jądro RBF)
* RandomForestClassifier (możliwe n estymatorów wartości [10, 100, 1000]

4. Oceń działanie algorytmów:
* Dokładność (Accuracy)
* F1-score
* AUC ROC


In [63]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

results = []

# GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
y_proba = gnb.predict_proba(x_test)
results.append({
    "Model": "GaussianNB",
    "Params": "-",
    "Accuracy": accuracy_score(y_test, y_pred),
    "F1_macro": f1_score(y_test, y_pred, average="macro"),
    "ROC_AUC_macro": roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")
})

# SVC
for C_val in [1e-02, 1e-01, 1e00, 1e01, 1e02]:
    svc = SVC(C=C_val, kernel="rbf", probability=True, random_state=42)
    svc.fit(x_train, y_train)
    y_pred = svc.predict(x_test)
    y_proba = svc.predict_proba(x_test)
    results.append({
        "Model": "SVC",
        "Params": f"C={C_val}",
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1_macro": f1_score(y_test, y_pred, average="macro"),
        "ROC_AUC_macro": roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")
    })

# RandomForest
for n_est in [10, 100, 1000]:
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=14, random_state=42)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    y_proba = rf.predict_proba(x_test)
    results.append({
        "Model": "RandomForest",
        "Params": f"n={n_est}",
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1_macro": f1_score(y_test, y_pred, average="macro"),
        "ROC_AUC_macro": roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")
    })

results_df = pd.DataFrame(results).round(4)
results_df

Unnamed: 0,Model,Params,Accuracy,F1_macro,ROC_AUC_macro
0,GaussianNB,-,0.4559,0.2423,0.7501
1,SVC,C=0.01,0.4556,0.0894,0.6401
2,SVC,C=0.1,0.4525,0.1137,0.6858
3,SVC,C=1.0,0.4475,0.1237,0.6794
4,SVC,C=10.0,0.486,0.1562,0.7092
5,SVC,C=100.0,0.5144,0.1874,0.7299
6,RandomForest,n=10,0.6125,0.3235,0.7037
7,RandomForest,n=100,0.6426,0.3458,0.8382
8,RandomForest,n=1000,0.6449,0.3367,0.8646
