# 03f: Model Evaluation และ Comparison

## วัตถุประสงค์การเรียนรู้
- เปรียบเทียบประสิทธิภาพของ ML models
- ใช้ Cross-validation อย่างถูกต้อง
- ทำ Hyperparameter tuning
- สร้าง Model evaluation pipeline

---

## 1. Model Comparison Framework

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Build the synthetic datasets shared by every comparison in this notebook
from sklearn.datasets import make_classification, make_regression

# Classification problem: 10 features, 5 of them informative
X_class, y_class = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    random_state=42,
)

# Regression problem: 10 features with a small amount of target noise
X_reg, y_reg = make_regression(
    n_samples=1000,
    n_features=10,
    noise=0.1,
    random_state=42,
)

# Quick sanity report on what was generated
print("ข้อมูลตัวอย่างสำหรับ Model Comparison:")
print(f"Classification: {X_class.shape}, {len(np.unique(y_class))} classes")
print(f"Regression: {X_reg.shape}, target range: {y_reg.min():.2f} to {y_reg.max():.2f}")

## 2. Cross-Validation Comparison

In [None]:
# Compare classification models with stratified 5-fold cross-validation
from sklearn.pipeline import make_pipeline

classification_models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'SVM': SVC(random_state=42),
    'k-NN': KNeighborsClassifier()
}

# Models that are evaluated on standardized features; the tree-based
# models are evaluated on the raw features.
scaled_classifiers = {'SVM', 'k-NN', 'Logistic Regression'}

# Cross-validation splitter shared by every model so they see the same folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
classification_results = {}

print("Classification Model Comparison:")
print("=" * 50)

for name, model in classification_models.items():
    if name in scaled_classifiers:
        # Put the scaler inside a pipeline so it is re-fitted on the training
        # part of each fold. Fitting it on the full dataset beforehand would
        # leak validation-fold statistics into the preprocessing step.
        estimator = make_pipeline(StandardScaler(), model)
    else:
        estimator = model

    scores = cross_val_score(estimator, X_class, y_class, cv=cv, scoring='accuracy')
    classification_results[name] = scores
    print(f"{name:20s}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Plot the per-fold score distribution of each model
model_names = list(classification_models.keys())
plt.figure(figsize=(12, 6))
plt.boxplot([classification_results[name] for name in model_names])
# Boxes are drawn at positions 1..N; label them through xticks rather than
# the boxplot `labels` keyword, which newer Matplotlib releases deprecate.
plt.xticks(range(1, len(model_names) + 1), model_names, rotation=45)
plt.title('Classification Models - Cross-Validation Scores')
plt.ylabel('Accuracy')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Compare regression models with shuffled 5-fold cross-validation
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline

regression_models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42, n_estimators=100),
    'SVR': SVR(),
    'k-NN': KNeighborsRegressor()
}

# Models that are evaluated on standardized features; plain linear
# regression and the tree-based models are evaluated on the raw features.
scaled_regressors = {'SVR', 'k-NN', 'Ridge', 'Lasso'}

# Cross-validation splitter shared by every model so they see the same folds
cv_reg = KFold(n_splits=5, shuffle=True, random_state=42)
regression_results = {}

print("\nRegression Model Comparison:")
print("=" * 50)

for name, model in regression_models.items():
    if name in scaled_regressors:
        # Put the scaler inside a pipeline so it is re-fitted on the training
        # part of each fold. Fitting it on the full dataset beforehand would
        # leak validation-fold statistics into the preprocessing step.
        estimator = make_pipeline(StandardScaler(), model)
    else:
        estimator = model

    scores = cross_val_score(estimator, X_reg, y_reg, cv=cv_reg, scoring='r2')
    regression_results[name] = scores
    print(f"{name:20s}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Plot the per-fold score distribution of each model
model_names = list(regression_models.keys())
plt.figure(figsize=(12, 6))
plt.boxplot([regression_results[name] for name in model_names])
# Boxes are drawn at positions 1..N; label them through xticks rather than
# the boxplot `labels` keyword, which newer Matplotlib releases deprecate.
plt.xticks(range(1, len(model_names) + 1), model_names, rotation=45)
plt.title('Regression Models - Cross-Validation Scores')
plt.ylabel('R² Score')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 3. Hyperparameter Tuning

In [None]:
# Tune a Random Forest classifier with an exhaustive grid search
from sklearn.model_selection import train_test_split

# Hold out 20% of the classification data as a stratified test set;
# the grid search only ever sees the remaining training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X_class,
    y_class,
    test_size=0.2,
    stratify=y_class,
    random_state=42,
)

# Search space: 3 * 4 * 3 * 3 = 108 parameter combinations
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=rf_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

print("กำลังทำ Hyperparameter Tuning สำหรับ Random Forest...")
grid_search.fit(X_train, y_train)

print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_:.3f}")

# Score the best estimator found by the search on the held-out test set
best_rf = grid_search.best_estimator_
test_score = accuracy_score(y_test, best_rf.predict(X_test))
print(f"Test Score: {test_score:.3f}")

## 4. Model Performance Summary

In [None]:
# Summarize and rank the cross-validation results of both model families


def _rank_by_mean_score(cv_results):
    """Return a DataFrame of per-model mean/std CV scores, best model first."""
    rows = [
        {
            'Model': model_name,
            'Mean_CV_Score': fold_scores.mean(),
            'Std_CV_Score': fold_scores.std(),
        }
        for model_name, fold_scores in cv_results.items()
    ]
    table = pd.DataFrame(rows, columns=['Model', 'Mean_CV_Score', 'Std_CV_Score'])
    return table.sort_values('Mean_CV_Score', ascending=False)


# Classification ranking
class_summary = _rank_by_mean_score(classification_results)
print("Classification Models Ranking:")
print(class_summary.round(3))

# Regression ranking
reg_summary = _rank_by_mean_score(regression_results)
print("\nRegression Models Ranking:")
print(reg_summary.round(3))

# Side-by-side horizontal bar charts of the mean CV score per model
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

panels = [
    (class_summary, 'Classification Models Performance', 'Accuracy'),
    (reg_summary, 'Regression Models Performance', 'R² Score'),
]
for ax, (summary, title, xlabel) in zip(axes, panels):
    ax.barh(summary['Model'], summary['Mean_CV_Score'])
    ax.set_title(title)
    ax.set_xlabel(xlabel)

plt.tight_layout()
plt.show()

## สรุป

### Classification Models
- Random Forest และ SVM มักให้ผลดี
- Logistic Regression เร็วและตีความได้
- Decision Tree อาจ overfit

### Regression Models  
- Random Forest ให้ผลดีกับข้อมูลส่วนใหญ่
- Linear models เร็วและเสถียร
- Regularization ช่วยลด overfitting

### Best Practices
- ใช้ Cross-validation เสมอ
- Scale data สำหรับ distance-based algorithms (เช่น SVM, k-NN) และ regularized linear models (เช่น Logistic Regression, Ridge, Lasso)
- Tune hyperparameters อย่างระมัดระวัง

### Next: Best Practices และ Conclusion