In [1]:
import pandas as pd
import numpy as np

In [2]:
# Remote location of the wines.csv dataset (raw file served from GitHub).
# The literal is split across two adjacent string pieces purely for readability;
# Python concatenates them into the same single URL.
url = (
    'https://raw.githubusercontent.com/lenhattung/SP26-AI1904-DAP391m/refs/heads/main/'
    'lab_part_2/01%20-%20Linear%20and%20logistic%20regression/wines.csv'
)

# Download and parse the CSV into the working DataFrame used by the later cells.
df = pd.read_csv(url)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded DataFrame
# to check for missing values and spot non-numeric columns before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         6497 non-null   float64
 1   volatile_acidity      6497 non-null   float64
 2   citric_acid           6497 non-null   float64
 3   residual_sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free_sulfur_dioxide   6497 non-null   float64
 6   total_sulfur_dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  type                  6497 non-null   object 
 12  quality               6497 non-null   int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB


In [4]:
# Display the first rows of the DataFrame as a quick sanity check of the parsed values.
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,type,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,red,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,red,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,red,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,red,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,red,5


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [6]:
# Target vector: the 'quality' column of the DataFrame.
y = df['quality']

# Feature matrix: every column except the target and the 'type' column.
X = df.drop(columns=['quality', 'type'])

# Report the feature-matrix shape and the distinct class labels present in the target.
print("Kích thước dữ liệu:", X.shape)
print("Các nhãn lớp (quality):", np.unique(y))

Kích thước dữ liệu: (6497, 11)
Các nhãn lớp (quality): [3 4 5 6 7 8 9]


In [7]:
# Candidate classifiers, keyed by the display name used in the result tables.
# Insertion order is preserved, so the models are evaluated in the order listed here.
models = {}

# Tree ensemble; the fixed seed keeps its results repeatable between runs.
models["Random Forest"] = RandomForestClassifier(random_state=42, n_estimators=100)

# SVM and logistic regression are each chained after a StandardScaler in a pipeline,
# so the scaler is fitted together with the estimator on whatever data is passed to fit().
models["SVM"] = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1.0))
models["Logistic Regression"] = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
)

In [8]:
# Settings shared by both repeated splitters: 5 folds, repeated 10 times,
# with a fixed seed so the splits (and therefore the scores) are reproducible.
_repeated_cv_params = dict(n_splits=5, n_repeats=10, random_state=42)

# 1. Standard 5-fold CV, passed to cross_validate as a plain integer.
# NOTE(review): per the scikit-learn docs an integer cv with a classifier resolves to an
# unshuffled StratifiedKFold; if the rows are ordered (e.g. grouped by wine type) an
# explicitly shuffled splitter may give a fairer estimate — confirm before relying on it.
cv_5fold = 5

# 2. Repeated K-Fold without stratification (stability check).
cv_repeated = RepeatedKFold(**_repeated_cv_params)

# 3. Repeated Stratified K-Fold (to compare the effect of stratification).
cv_stratified = RepeatedStratifiedKFold(**_repeated_cv_params)

In [9]:
# Evaluate every model under the three CV strategies and collect the per-fold
# accuracy arrays in `results_data` (one dict per model) for the analysis cells below.
results_data = []

for name, model in models.items():
    print(f"Đang chạy model: {name}...")

    # --- A. Plain 5-fold CV ---
    # Only the held-out scores of this run are stored, so training-set scoring is
    # left at its default (off) instead of being computed and thrown away.
    scores_5 = cross_validate(model, X, y, cv=cv_5fold, scoring='accuracy')

    # --- B. Repeated K-Fold (stability) ---
    # Same reasoning: 50 fits, and only their test scores are consumed.
    scores_rep = cross_validate(model, X, y, cv=cv_repeated, scoring='accuracy')

    # --- C. Repeated Stratified K-Fold (stratification) ---
    # This is the only run whose train scores are used (train/test gap for the
    # overfitting check), so it is the only one that pays for computing them.
    scores_strat = cross_validate(
        model, X, y,
        cv=cv_stratified,
        return_train_score=True,
        scoring='accuracy',
    )

    # Store the per-fold score arrays for analysis in the following cells.
    results_data.append({
        'model': name,
        'cv_5_test': scores_5['test_score'],
        'cv_rep_test': scores_rep['test_score'],
        'cv_strat_test': scores_strat['test_score'],
        'cv_strat_train': scores_strat['train_score'],  # train scores of the stratified run, for the overfit check
    })

print("Hoàn tất!")

Đang chạy model: Random Forest...
Đang chạy model: SVM...
Đang chạy model: Logistic Regression...
Hoàn tất!


In [10]:
# Stability table: mean and standard deviation of the per-fold test accuracy,
# comparing the non-stratified and stratified repeated CV runs for each model.
header_cells = ['Model'.ljust(20), 'Strategy'.ljust(20), 'Mean Acc'.ljust(10), 'Std Dev'.ljust(10)]
print(" | ".join(header_cells))
separator = "-" * 70
print(separator)

# (row label, key into the per-model result dict) for each strategy shown in the table.
strategy_rows = (
    ("Repeated (No Strat)", 'cv_rep_test'),
    ("Stratified", 'cv_strat_test'),
)

for entry in results_data:
    model_name = entry['model']

    for label, score_key in strategy_rows:
        fold_scores = entry[score_key]
        # np.std uses the population formula (ddof=0), the same as ndarray.std().
        print(f"{model_name:<20} | {label:<20} | {np.mean(fold_scores):.4f}     | {np.std(fold_scores):.4f}")

    # Visual divider between models.
    print(separator)

Model                | Strategy             | Mean Acc   | Std Dev   
----------------------------------------------------------------------
Random Forest        | Repeated (No Strat)  | 0.6871     | 0.0112
Random Forest        | Stratified           | 0.6894     | 0.0102
----------------------------------------------------------------------
SVM                  | Repeated (No Strat)  | 0.5725     | 0.0120
SVM                  | Stratified           | 0.5727     | 0.0104
----------------------------------------------------------------------
Logistic Regression  | Repeated (No Strat)  | 0.5428     | 0.0094
Logistic Regression  | Stratified           | 0.5439     | 0.0104
----------------------------------------------------------------------


In [11]:
# Overfitting table: mean train vs. mean test accuracy from the stratified repeated CV,
# with the gap between them and a coarse verdict per model.
header_cells = [
    'Model'.ljust(20),
    'Train Acc'.ljust(10),
    'Test Acc'.ljust(10),
    'Gap (Train-Test)'.ljust(15),
]
print(" | ".join(header_cells))
print("-" * 65)

for entry in results_data:
    model_name = entry['model']

    # Average the per-fold scores of the stratified run.
    train_acc = np.mean(entry['cv_strat_train'])
    test_acc = np.mean(entry['cv_strat_test'])
    gap = train_acc - test_acc

    # A gap above 0.1 (10 accuracy points) is flagged as heavy overfitting; anything else as stable.
    if gap > 0.1:
        status = "Overfit nặng"
    else:
        status = "Ổn định"

    print(f"{model_name:<20} | {train_acc:.4f}     | {test_acc:.4f}     | {gap:.4f} ({status})")

Model                | Train Acc  | Test Acc   | Gap (Train-Test)
-----------------------------------------------------------------
Random Forest        | 1.0000     | 0.6894     | 0.3106 (Overfit nặng)
SVM                  | 0.6068     | 0.5727     | 0.0341 (Ổn định)
Logistic Regression  | 0.5466     | 0.5439     | 0.0027 (Ổn định)
