## Tinh ch·ªânh (hyperparameter tuning)

### Import th∆∞ vi·ªán

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint, uniform
import matplotlib.pyplot as plt
import joblib
import warnings
warnings.filterwarnings("ignore")

### Load v√† x·ª≠ l√Ω d·ªØ li·ªáu 

In [2]:
train = pd.read_excel("../exps/data/train_feat.xlsx")

# Chia features & target
X = train.drop("Survived", axis=1)
y = train["Survived"]

# Chuy·ªÉn c√°c c·ªôt object -> s·ªë b·∫±ng LabelEncoder
for col in X.select_dtypes(include="object").columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

print("D·ªØ li·ªáu ƒë√£ encode xong. Dtypes:")
print(X.dtypes.value_counts())

# T√°ch train / validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {X_train.shape}, Validation: {X_val.shape}")

D·ªØ li·ªáu ƒë√£ encode xong. Dtypes:
int64      13
float64     2
Name: count, dtype: int64
Train: (569, 15), Validation: (143, 15)


### Thi·∫øt l·∫≠p RandomizedSearchCV (D√≤ th√¥)

In [3]:
param_dist = {
    'n_estimators': randint(100, 800),        # s·ªë l∆∞·ª£ng c√¢y
    'learning_rate': uniform(0.01, 0.2),      # t·ªëc ƒë·ªô h·ªçc
    'max_depth': randint(3, 10),              # ƒë·ªô s√¢u c·ªßa c√¢y
    'min_samples_split': randint(2, 10),      # min m·∫´u ƒë·ªÉ t√°ch n√∫t
    'min_samples_leaf': randint(1, 10),       # min m·∫´u t·∫°i l√°
    'subsample': uniform(0.6, 0.4),           # t·ªâ l·ªá m·∫´u ƒë∆∞·ª£c l·∫•y ng·∫´u nhi√™n cho m·ªói c√¢y
    'max_features': ['sqrt', 'log2', None]    # s·ªë l∆∞·ª£ng feature m·ªói c√¢y ƒë∆∞·ª£c x√©t
}

# Kh·ªüi t·∫°o m√¥ h√¨nh Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=42)

# Thi·∫øt l·∫≠p RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=100,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Hu·∫•n luy·ªán
random_search.fit(X_train, y_train)

print("Best parameters (Random Search):")
print(random_search.best_params_)
print(f"Best CV Accuracy: {random_search.best_score_:.4f}")

best_random_params = random_search.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters (Random Search):
{'learning_rate': np.float64(0.03441759094013467), 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 476, 'subsample': np.float64(0.8590760482165449)}
Best CV Accuracy: 0.8137


### Fine-tuning quanh tham s·ªë t·ªët nh·∫•t

In [4]:
# T·∫°o l∆∞·ªõi tham s·ªë tinh ch·ªânh quanh k·∫øt qu·∫£ RandomizedSearchCV
param_grid_fine = {
    "max_depth": [
        max(1, best_random_params["max_depth"] - 1),
        best_random_params["max_depth"],
        best_random_params["max_depth"] + 1
    ],
    "learning_rate": [
        best_random_params["learning_rate"] * 0.8,
        best_random_params["learning_rate"],
        best_random_params["learning_rate"] * 1.2
    ],
    "subsample": [
        max(0.6, best_random_params["subsample"] - 0.1),
        best_random_params["subsample"],
        min(1.0, best_random_params["subsample"] + 0.1)
    ],
    "min_samples_split": [
        max(2, best_random_params["min_samples_split"] - 1),
        best_random_params["min_samples_split"],
        best_random_params["min_samples_split"] + 1
    ],
    "min_samples_leaf": [
        max(1, best_random_params["min_samples_leaf"] - 1),
        best_random_params["min_samples_leaf"],
        best_random_params["min_samples_leaf"] + 1
    ]
}

# Kh·ªüi t·∫°o m√¥ h√¨nh v·ªõi tham s·ªë c∆° b·∫£n t·ª´ best_random_params
fine_model = GradientBoostingClassifier(
    random_state=42,
    n_estimators=best_random_params["n_estimators"],
    max_features=best_random_params["max_features"]
)

# Tinh ch·ªânh b·∫±ng GridSearchCV
grid_search = GridSearchCV(
    estimator=fine_model,
    param_grid=param_grid_fine,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Hu·∫•n luy·ªán
grid_search.fit(X_train, y_train)

print("Best parameters (Fine-tuning):")
print(grid_search.best_params_)
print(f"Best CV Accuracy: {grid_search.best_score_:.4f}")

best_params = grid_search.best_params_

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters (Fine-tuning):
{'learning_rate': np.float64(0.03441759094013467), 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 3, 'subsample': np.float64(0.8590760482165449)}
Best CV Accuracy: 0.8155


### Th·ª≠ nghi·ªám tham s·ªë

In [5]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import itertools

# C√°c gi√° tr·ªã mu·ªën th·ª≠ (em c√≥ th·ªÉ tinh ch·ªânh l·∫°i d·∫£i sau)
param_grid = {
    "learning_rate": [0.05, 0.03],
    "max_depth": [3, 4, 5],
    "n_estimators": [300, 500, 700],
    "subsample": [0.8, 1.0],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}

# Duy·ªát m·ªçi t·ªï h·ª£p
best_acc = 0
best_params_combo = None

for combo in itertools.product(*param_grid.values()):
    params = dict(zip(param_grid.keys(), combo))
    
    model = GradientBoostingClassifier(
        **params,
        random_state=42
    )
    
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, val_pred)
    
    if acc > best_acc:
        best_acc = acc
        best_params_combo = params
    
    print(f"{params} ‚Üí {acc:.4f}")

print("\nBest Validation Accuracy:", round(best_acc, 4))
print("Best Params:", best_params_combo)

{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8, 'min_samples_split': 2, 'min_samples_leaf': 1} ‚Üí 0.8671
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8, 'min_samples_split': 2, 'min_samples_leaf': 2} ‚Üí 0.8462
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8, 'min_samples_split': 5, 'min_samples_leaf': 1} ‚Üí 0.8601
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8, 'min_samples_split': 5, 'min_samples_leaf': 2} ‚Üí 0.8462
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1.0, 'min_samples_split': 2, 'min_samples_leaf': 1} ‚Üí 0.8392
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1.0, 'min_samples_split': 2, 'min_samples_leaf': 2} ‚Üí 0.8531
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1.0, 'min_samples_split': 5, 'min_samples_leaf': 1} ‚Üí 0.8601
{'learning_rate': 0.05, 'max_depth': 3, '

### Hu·∫•n luy·ªán m√¥ h√¨nh cu·ªëi c√πng & ƒë√°nh gi√°

In [6]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# M√¥ h√¨nh Gradient Boosting t·ªët nh·∫•t t·ª´ qu√° tr√¨nh tuning
final_model = GradientBoostingClassifier(
    learning_rate=0.03,
    max_depth=4,
    n_estimators=300,
    subsample=1.0,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)

# Hu·∫•n luy·ªán m√¥ h√¨nh
final_model.fit(X_train, y_train)

# D·ª± ƒëo√°n & ƒë√°nh gi√° tr√™n t·∫≠p validation
y_pred = final_model.predict(X_val)
val_acc = accuracy_score(y_val, y_pred)

print(f"üèÜ Final Validation Accuracy: {val_acc:.4f}")

üèÜ Final Validation Accuracy: 0.8881


In [7]:
import joblib
joblib.dump(final_model, "Gradient_Boosting_best_final.pkl")
print("Model saved as Gradient_Boosting_best_final.pkl")

Model saved as Gradient_Boosting_best_final.pkl


In [8]:
import pandas as pd

test = pd.read_excel("../exps/data/test_feat.xlsx")
test_org = pd.read_csv("../../data/test.csv")

# X·ª≠ l√Ω test gi·ªëng train
full = pd.concat([train.drop("Survived", axis=1), test])
full = pd.get_dummies(full, drop_first=True)

X = full.iloc[:len(train), :]
X_test = full.iloc[len(train):, :]
y = train["Survived"]

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

final_model = GradientBoostingClassifier(
    learning_rate=0.03,
    max_depth=4,
    n_estimators=300,
    subsample=1.0,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)

# Hu·∫•n luy·ªán m√¥ h√¨nh
final_model.fit(X, y)

0,1,2
,loss,'log_loss'
,learning_rate,0.03
,n_estimators,300
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_depth,4
,min_impurity_decrease,0.0


In [10]:
test_pred = final_model.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": test_org["PassengerId"],
    "Survived": test_pred
})

submission.to_csv("submission.csv", index=False)
print("File submission.csv ƒë√£ ƒë∆∞·ª£c t·∫°o!")

File submission.csv ƒë√£ ƒë∆∞·ª£c t·∫°o!
