In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn import metrics 
from sklearn.svm import SVC
import time  

In [2]:
column_names = [
    "FixedAcidity", "VolatileAcidity", "CitricAcid", "ResidualSugar", "Chlorides",
    "FreeSulfurDioxide", "TotalSulfurDioxide", "Density", "PH", "Sulphates", "Alcohol", "Quality"
]

df_wine = pd.read_csv(
    "winequality-white.dat",
    comment='@',
    header=None,  
    names=column_names,  
    skip_blank_lines=True  
)

df_wine

Unnamed: 0,FixedAcidity,VolatileAcidity,CitricAcid,ResidualSugar,Chlorides,FreeSulfurDioxide,TotalSulfurDioxide,Density,PH,Sulphates,Alcohol,Quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [3]:
X = df_wine.drop(columns="Quality")
y = df_wine.Quality

Выполним стандартизацию X

In [4]:
scaler = StandardScaler()

X_std = scaler.fit_transform(X)
X_std = pd.DataFrame(X_std, columns=X.columns)
X_std

Unnamed: 0,FixedAcidity,VolatileAcidity,CitricAcid,ResidualSugar,Chlorides,FreeSulfurDioxide,TotalSulfurDioxide,Density,PH,Sulphates,Alcohol
0,0.172097,-0.081770,0.213280,2.821349,-0.035355,0.569932,0.744565,2.331512,-1.246921,-0.349184,-1.393152
1,-0.657501,0.215896,0.048001,-0.944765,0.147747,-1.253019,-0.149685,-0.009154,0.740029,0.001342,-0.824276
2,1.475751,0.017452,0.543838,0.100282,0.193523,-0.312141,-0.973336,0.358665,0.475102,-0.436816,-0.336667
3,0.409125,-0.478657,-0.117278,0.415768,0.559727,0.687541,1.121091,0.525855,0.011480,-0.787342,-0.499203
4,0.409125,-0.478657,-0.117278,0.415768,0.559727,0.687541,1.121091,0.525855,0.011480,-0.787342,-0.499203
...,...,...,...,...,...,...,...,...,...,...,...
4893,-0.776015,-0.677101,-0.365197,-0.944765,-0.310008,-0.664970,-1.091000,-0.965483,0.541334,0.088973,0.557282
4894,-0.301959,0.414339,0.213280,0.317179,0.056196,1.275590,0.697499,0.291789,-0.253446,-0.261553,-0.743008
4895,-0.420473,-0.379435,-1.191592,-1.023637,-0.218457,-0.312141,-0.643875,-0.497350,-1.313153,-0.261553,-0.905544
4896,-1.605613,0.116674,-0.282557,-1.043355,-1.088192,-0.900190,-0.667408,-1.784717,1.004955,-0.962605,1.857572


In [5]:
X_train, X_tmp, y_train, y_tmp = train_test_split(X_std, y, test_size=0.5,stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=0.6, stratify=y_tmp, random_state=42)

Построим дерево решений

In [6]:
criterions = ["entropy", "gini"]
alphies = [0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.2, 0.8]


max_accuracy = 0
max_precision = 0
max_recall = 0
max_auc = 0

param_accuracy = {"criterion" : None, "alpha" : None }
param_precision = {"criterion" : None, "alpha" : None }
param_recall = {"criterion" : None, "alpha" : None }
param_auc = {"criterion" : None, "alpha" : None }


for criterion in criterions:

    for alpha in alphies:
        
        clf = DecisionTreeClassifier(criterion=criterion, ccp_alpha=alpha) 
        clf.fit(X_train, y_train)
        
        y_pred = clf.predict(X_val)
        
        current_accuracy = metrics.accuracy_score(y_val, y_pred)
        if current_accuracy > max_accuracy:
            max_accuracy = current_accuracy
            param_accuracy["criterion"] = criterion
            param_accuracy["alpha"] = alpha
        
        current_precision = metrics.precision_score(y_val, y_pred, average="weighted", zero_division=0)
        if current_precision > max_precision:
            max_precision = current_precision
            param_precision["criterion"] = criterion
            param_precision["alpha"] = alpha
            
        current_recall = metrics.recall_score(y_val, y_pred, average="weighted")
        if current_recall > max_recall:
            max_recall = current_recall
            param_recall["criterion"] = criterion
            param_recall["alpha"] = alpha
            
        current_auc = metrics.roc_auc_score(y_val, clf.predict_proba(X_val), average="macro", multi_class="ovr")
        if current_auc > max_auc:
            max_auc = current_auc
            param_auc["criterion"] = criterion
            param_auc["alpha"] = alpha
            
df_scores = pd.DataFrame([param_accuracy, param_precision, param_recall, param_auc], \
                         index=["Accuracy", "Precision", "Recall", "ROC-AUC"])

df_scores["score_val"] = [max_accuracy, max_precision, max_recall, max_auc]
df_scores

Unnamed: 0,criterion,alpha,score_val
Accuracy,gini,0.005,0.533197
Precision,gini,0.005,0.501587
Recall,gini,0.005,0.533197
ROC-AUC,entropy,0.015,0.70047


Т.о. согласно метрикам Accuracy, Precision, Recall оптимальные параметры модели: criterion = entropy, alpha = 0.005

А согласно метрике ROC-AUC оптимальные параметры модели: criterion = gini, alpha = 0.800

Обучим SVM 

In [7]:
krls = ["linear", "poly", "rbf", "sigmoid"]
gammas = ["scale", "auto"]
coef0s = [0, 1, 2, 5]
degrees = [2, 3, 4, 5]
Cs = [ 0.1, 1, 10, 20]


svm_max_accuracy = 0
svm_max_precision = 0
svm_max_recall = 0
svm_max_auc = 0

svm_param_accuracy = {"krl" : None, "gamma" : None, "coef0" : None, "degree" : None, "C" : None}
svm_param_precision = {"krl" : None, "gamma" : None, "coef0" : None, "degree" : None, "C" : None}
svm_param_recall = {"krl" : None, "gamma" : None, "coef0" : None, "degree" : None, "C" : None}
svm_param_auc = {"krl" : None, "gamma" : None, "coef0" : None, "degree" : None, "C" : None}


start_time = time.time()
for krl in krls:
    
    for gamma in gammas:
        
        for coef0 in coef0s:
            
            for degree in degrees:
                
                for C in Cs:
                    
                    svm = SVC(kernel=krl, gamma=gamma, coef0=coef0, degree=degree, C=C, probability=True)
                    svm.fit(X_train, y_train)
        
                    y_pred = svm.predict(X_val)
        
                    svm_current_accuracy = metrics.accuracy_score(y_val, y_pred)
                    if svm_current_accuracy > svm_max_accuracy:
                        svm_max_accuracy = svm_current_accuracy
                        svm_param_accuracy["krl"] = krl
                        svm_param_accuracy["gamma"] = gamma
                        svm_param_accuracy["coef0"] = coef0
                        svm_param_accuracy["degree"] = degree
                        svm_param_accuracy["C"] = C
        
                    svm_current_precision = metrics.precision_score(y_val, y_pred, average="weighted", zero_division=0)
                    if svm_current_precision > svm_max_precision:
                        svm_max_precision = svm_current_precision
                        svm_param_precision["krl"] = krl
                        svm_param_precision["gamma"] = gamma
                        svm_param_precision["coef0"] = coef0
                        svm_param_precision["degree"] = degree
                        svm_param_precision["C"] = C
        
            
                    svm_current_recall = metrics.recall_score(y_val, y_pred, average="weighted")
                    if svm_current_recall > svm_max_recall:
                        svm_max_recall = svm_current_recall
                        svm_param_recall["krl"] = krl
                        svm_param_recall["gamma"] = gamma
                        svm_param_recall["coef0"] = coef0
                        svm_param_recall["degree"] = degree
                        svm_param_recall["C"] = C
        
            
                    svm_current_auc = metrics.roc_auc_score(y_val, svm.predict_proba(X_val), average="macro", multi_class="ovr")
                    if svm_current_auc > svm_max_auc:
                        svm_max_auc = svm_current_auc
                        svm_param_auc["krl"] = krl
                        svm_param_auc["gamma"] = gamma
                        svm_param_auc["coef0"] = coef0
                        svm_param_auc["degree"] = degree
                        svm_param_auc["C"] = C
        

end_time = time.time()


df_scores = pd.DataFrame([svm_param_accuracy, svm_param_precision, svm_param_recall, svm_param_auc], \
                         index=["Accuracy", "Precision", "Recall", "ROC-AUC"])
                    
print("Время обучения: ", end_time - start_time)

df_scores["value"] = [svm_max_accuracy, svm_max_precision, svm_max_recall, svm_max_auc]


Время обучения:  7911.348870277405


In [8]:
df_scores

Unnamed: 0,krl,gamma,coef0,degree,C,value
Accuracy,rbf,scale,0,2,1.0,0.556691
Precision,poly,scale,0,3,0.1,0.567818
Recall,rbf,scale,0,2,1.0,0.556691
ROC-AUC,poly,scale,5,2,1.0,0.842884


Проверим на тестовой выборке 4 модели: 

1) дерево решений с параметрами criterion = entropy, alpha = 0.005 

2. дерево решений с параметрами criterion = gini, alpha = 0.8

3. SVM c параметрами: krl = poly, gamma = scale, coef0 = 1, degree = 4, C = 0.1

4. SVM c параметрами: krl = sigmoid, gamma = auto, coef0 = 5, degree = 5, C = 20

In [9]:
model_1 = DecisionTreeClassifier(criterion="entropy", ccp_alpha=0.005)
model_1.fit(X_train, y_train)
y_pred_1 = model_1.predict(X_test)

model_2 = DecisionTreeClassifier(criterion="gini", ccp_alpha=0.8)
model_2.fit(X_train, y_train)
y_pred_2 = model_2.predict(X_test)

model_3 = SVC(kernel="poly", gamma="scale", coef0=1, degree=4, C=0.1, probability=True, )
model_3.fit(X_train, y_train)
y_pred_3 = model_3.predict(X_test)

model_4 = SVC(kernel="sigmoid", gamma="auto", coef0=5, degree=5, C=20, probability=True)
model_4.fit(X_train, y_train)
y_pred_4 = model_4.predict(X_test)

In [10]:
model_1_scores = {"Accuracy" : metrics.accuracy_score(y_test, y_pred_1), \
                  "Precision" : metrics.precision_score(y_test, y_pred_1, average="weighted"),\
                  "Recall" : metrics.recall_score(y_test, y_pred_1, average="weighted"), \
                  "ROC-AUC" : metrics.roc_auc_score(y_test, model_1.predict_proba(X_test), multi_class="ovr") }
model_1_scores

  _warn_prf(average, modifier, msg_start, len(result))


{'Accuracy': 0.5414965986394558,
 'Precision': 0.5177236540853712,
 'Recall': 0.5414965986394558,
 'ROC-AUC': 0.6735254734654114}

In [11]:
model_2_scores = {"Accuracy" : metrics.accuracy_score(y_test, y_pred_2), \
                  "Precision" : metrics.precision_score(y_test, y_pred_2, average="weighted"),\
                  "Recall" : metrics.recall_score(y_test, y_pred_2, average="weighted"), \
                  "ROC-AUC" : metrics.roc_auc_score(y_test, model_2.predict_proba(X_test), multi_class="ovr") }
model_2_scores

  _warn_prf(average, modifier, msg_start, len(result))


{'Accuracy': 0.4489795918367347,
 'Precision': 0.20158267388588086,
 'Recall': 0.4489795918367347,
 'ROC-AUC': 0.5}

In [12]:
model_3_scores = {"Accuracy" : metrics.accuracy_score(y_test, y_pred_3), \
                  "Precision" : metrics.precision_score(y_test, y_pred_3, average="weighted"),\
                  "Recall" : metrics.recall_score(y_test, y_pred_3, average="weighted"), \
                  "ROC-AUC" : metrics.roc_auc_score(y_test, model_3.predict_proba(X_test), multi_class="ovr") }
model_3_scores

  _warn_prf(average, modifier, msg_start, len(result))


{'Accuracy': 0.564625850340136,
 'Precision': 0.5581065781773015,
 'Recall': 0.564625850340136,
 'ROC-AUC': 0.7153422935162943}

In [13]:
model_4_scores = {"Accuracy" : metrics.accuracy_score(y_test, y_pred_4), \
                  "Precision" : metrics.precision_score(y_test, y_pred_4, average="weighted"),\
                  "Recall" : metrics.recall_score(y_test, y_pred_4, average="weighted"), \
                  "ROC-AUC" : metrics.roc_auc_score(y_test, model_4.predict_proba(X_test), multi_class="ovr") }
model_4_scores

  _warn_prf(average, modifier, msg_start, len(result))


{'Accuracy': 0.44761904761904764,
 'Precision': 0.34985336563001734,
 'Recall': 0.44761904761904764,
 'ROC-AUC': 0.5700192086989458}

Т.к. каждая модель не может предсказать один из классов, посмотрим какой каждая из моделей класс она не может предсказать: 

In [14]:
y.unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [15]:
y_test.unique()

array([8, 7, 6, 4, 5, 3, 9], dtype=int64)

Всего у нас 7 классов 

In [16]:
np.unique(y_pred_1)

array([4, 5, 6, 7, 8], dtype=int64)

In [17]:
np.unique(y_pred_2)

array([6], dtype=int64)

In [18]:
np.unique(y_pred_3)

array([3, 4, 5, 6, 7, 8], dtype=int64)

In [19]:
np.unique(y_pred_4)

array([5, 6, 7, 8], dtype=int64)

Т.о. 1ая модель не может предсказать 2 класса: 9 и 3; 2ая модель предсказывает только оди класс; 3ья модель не предсказвает только 9ый класc; 4ая модель предсказвает только 3 класса

Лучшая модель - 3ья (SVM c параметрами: krl = poly, gamma = scale, coef0 = 1, degree = 4, C = 0.1) 

In [20]:
y.value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: Quality, dtype: int64

Модель не предсказывает 9ый класс, т.к. в нем слишком мало объектов  

Воспользуемся насыщением данных увеличим количество объектов в 3 и 9 классах рандомным дублированием наблюдений

In [21]:
y_train.value_counts()

6    1099
5     728
7     440
8      88
4      82
3      10
9       2
Name: Quality, dtype: int64

In [22]:
from imblearn.over_sampling import RandomOverSampler


ros = RandomOverSampler(sampling_strategy={3 : 70, 9:70}, random_state=42)

X_train_over, y_train_over = ros.fit_resample(X_train, y_train)

In [23]:
y_train_over.value_counts()

6    1099
5     728
7     440
8      88
4      82
3      70
9      70
Name: Quality, dtype: int64

In [24]:
X_train = X_train_over
y_train = y_train_over

criterions = ["entropy", "gini"]
alphies = [0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.2, 0.8]


max_accuracy = 0
max_precision = 0
max_recall = 0
max_auc = 0

param_accuracy = {"criterion" : None, "alpha" : None }
param_precision = {"criterion" : None, "alpha" : None }
param_recall = {"criterion" : None, "alpha" : None }
param_auc = {"criterion" : None, "alpha" : None }


for criterion in criterions:

    for alpha in alphies:
        
        clf = DecisionTreeClassifier(criterion=criterion, ccp_alpha=alpha) 
        clf.fit(X_train, y_train)
        
        y_pred = clf.predict(X_val)
        
        current_accuracy = metrics.accuracy_score(y_val, y_pred)
        if current_accuracy > max_accuracy:
            max_accuracy = current_accuracy
            param_accuracy["criterion"] = criterion
            param_accuracy["alpha"] = alpha
        
        current_precision = metrics.precision_score(y_val, y_pred, average="weighted", zero_division=0)
        if current_precision > max_precision:
            max_precision = current_precision
            param_precision["criterion"] = criterion
            param_precision["alpha"] = alpha
            
        current_recall = metrics.recall_score(y_val, y_pred, average="weighted")
        if current_recall > max_recall:
            max_recall = current_recall
            param_recall["criterion"] = criterion
            param_recall["alpha"] = alpha
            
        current_auc = metrics.roc_auc_score(y_val, clf.predict_proba(X_val), average="macro", multi_class="ovr")
        if current_auc > max_auc:
            max_auc = current_recall
            param_auc["criterion"] = criterion
            param_auc["alpha"] = alpha
            
df_scores = pd.DataFrame([param_accuracy, param_precision, param_recall, param_auc], \
                         index=["Accuracy", "Precision", "Recall", "ROC-AUC"])

df_scores["score_val"] = [max_accuracy, max_precision, max_recall, max_auc]
df_scores

Unnamed: 0,criterion,alpha,score_val
Accuracy,entropy,0.03,0.536261
Precision,entropy,0.005,0.512605
Recall,entropy,0.03,0.536261
ROC-AUC,gini,0.8,0.448417


In [25]:
X_train = X_train_over
y_train = y_train_over


krls = ["linear", "poly", "rbf", "sigmoid"]
gammas = ["scale", "auto"]
coef0s = [0, 1, 2, 5]
degrees = [2, 3, 4, 5]
Cs = [ 0.1, 1, 10, 20]


svm_max_accuracy = 0
svm_max_precision = 0
svm_max_recall = 0
svm_max_auc = 0

svm_param_accuracy = {"krl" : None, "gamma" : None, "coef0" : None, "degree" : None, "C" : None}
svm_param_precision = {"krl" : None, "gamma" : None, "coef0" : None, "degree" : None, "C" : None}
svm_param_recall = {"krl" : None, "gamma" : None, "coef0" : None, "degree" : None, "C" : None}
svm_param_auc = {"krl" : None, "gamma" : None, "coef0" : None, "degree" : None, "C" : None}


start_time = time.time()
for krl in krls:
    
    for gamma in gammas:
        
        for coef0 in coef0s:
            
            for degree in degrees:
                
                for C in Cs:
                    
                    svm = SVC(kernel=krl, gamma=gamma, coef0=coef0, degree=degree, C=C, probability=True)
                    svm.fit(X_train, y_train)
        
                    y_pred = svm.predict(X_val)
        
                    svm_current_accuracy = metrics.accuracy_score(y_val, y_pred)
                    if svm_current_accuracy > svm_max_accuracy:
                        svm_max_accuracy = svm_current_accuracy
                        svm_param_accuracy["krl"] = krl
                        svm_param_accuracy["gamma"] = gamma
                        svm_param_accuracy["coef0"] = coef0
                        svm_param_accuracy["degree"] = degree
                        svm_param_accuracy["C"] = C
        
                    svm_current_precision = metrics.precision_score(y_val, y_pred, average="weighted", zero_division=0)
                    if svm_current_precision > svm_max_precision:
                        svm_max_precision = svm_current_precision
                        svm_param_precision["krl"] = krl
                        svm_param_precision["gamma"] = gamma
                        svm_param_precision["coef0"] = coef0
                        svm_param_precision["degree"] = degree
                        svm_param_precision["C"] = C
        
            
                    svm_current_recall = metrics.recall_score(y_val, y_pred, average="weighted")
                    if svm_current_recall > svm_max_recall:
                        svm_max_recall = svm_current_recall
                        svm_param_recall["krl"] = krl
                        svm_param_recall["gamma"] = gamma
                        svm_param_recall["coef0"] = coef0
                        svm_param_recall["degree"] = degree
                        svm_param_recall["C"] = C
        
            
                    svm_current_auc = metrics.roc_auc_score(y_val, svm.predict_proba(X_val), average="macro", multi_class="ovr")
                    if svm_current_auc > svm_max_auc:
                        svm_max_auc = svm_current_auc
                        svm_param_auc["krl"] = krl
                        svm_param_auc["gamma"] = gamma
                        svm_param_auc["coef0"] = coef0
                        svm_param_auc["degree"] = degree
                        svm_param_auc["C"] = C
        

end_time = time.time()


df_scores = pd.DataFrame([svm_param_accuracy, svm_param_precision, svm_param_recall, svm_param_auc], \
                         index=["Accuracy", "Precision", "Recall", "ROC-AUC"])
                    
print("Время обучения: ", end_time - start_time)


df_scores["value"] = [svm_max_accuracy, svm_max_precision, svm_max_recall, svm_max_auc]

df_scores

Время обучения:  7331.938737630844


Unnamed: 0,krl,gamma,coef0,degree,C,value
Accuracy,poly,scale,1,5,10.0,0.556691
Precision,poly,auto,0,3,0.1,0.565564
Recall,poly,scale,1,5,10.0,0.556691
ROC-AUC,rbf,scale,0,2,1.0,0.838435
