In [1]:
import pandas as pd
from pathlib import Path
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

data_directory = Path('../data/processed')
try:
    df_final = pd.read_csv(data_directory / 'final_master_dataset_for_prediction.csv')
    print("‚úÖ T·∫£i d·ªØ li·ªáu ƒë√£ x·ª≠ l√Ω th√†nh c√¥ng!")
    print(f"K√≠ch th∆∞·ªõc d·ªØ li·ªáu: {df_final.shape}")
except FileNotFoundError as e:
    print(f"L·ªói: Kh√¥ng t√¨m th·∫•y file {e.filename}. Vui l√≤ng ch·∫°y notebook data-exploration tr∆∞·ªõc.")
    exit()

‚úÖ T·∫£i d·ªØ li·ªáu ƒë√£ x·ª≠ l√Ω th√†nh c√¥ng!
K√≠ch th∆∞·ªõc d·ªØ li·ªáu: (3335, 31)


In [2]:
df_final = pd.get_dummies(df_final, columns=['Team_Tier'], prefix='TeamTier', drop_first=False)

In [3]:
# --- 2. T√°ch Features (X) v√† Target (y) ---
# X√°c ƒë·ªãnh c·ªôt m·ª•c ti√™u (target)
TARGET = 'FinalPosition'

COLS_TO_DROP = [
    TARGET,
    'Year',
    'RaceName',
    'DriverNumber',
    'EventDate',
    'Points',
    'RoundNumber',
    'Status',
    'TeamName'
]

# T·∫°o X b·∫±ng c√°ch lo·∫°i b·ªè c√°c c·ªôt kh√¥ng c·∫ßn thi·∫øt
# D√πng errors='ignore' ƒë·ªÉ code kh√¥ng b·ªã l·ªói n·∫øu m·ªôt c·ªôt trong list kh√¥ng t·ªìn t·∫°i
X = df_final.drop(columns=COLS_TO_DROP, errors='ignore')

# T·∫°o y t·ª´ c·ªôt target
y = df_final[TARGET]

# --- 3. Ph√¢n chia d·ªØ li·ªáu theo th·ªùi gian ---
# X√°c ƒë·ªãnh nƒÉm cu·ªëi c√πng l√†m d·ªØ li·ªáu ki·ªÉm tra (test)
test_year = df_final['Year'].max()
print(f"NƒÉm ƒë∆∞·ª£c ch·ªçn l√†m b·ªô d·ªØ li·ªáu test: {int(test_year)}")

# T·∫°o ƒëi·ªÅu ki·ªán l·ªçc (boolean mask) cho t·∫≠p train v√† test
train_mask = (df_final['Year'] < test_year)
test_mask = (df_final['Year'] == test_year)

# √Åp d·ª•ng ƒëi·ªÅu ki·ªán l·ªçc ƒë·ªÉ t·∫°o c√°c t·∫≠p d·ªØ li·ªáu
X_train, X_test = X[train_mask], X[test_mask]
y_train, y_test = y[train_mask], y[test_mask]


print(f"Hu·∫•n luy·ªán tr√™n {X_train.shape[0]} m·∫´u, bao g·ªìm {X_train.shape[1]} features.")
print(f"Ki·ªÉm tra tr√™n {X_test.shape[0]} m·∫´u, bao g·ªìm {X_test.shape[1]} features.")

NƒÉm ƒë∆∞·ª£c ch·ªçn l√†m b·ªô d·ªØ li·ªáu test: 2025
Hu·∫•n luy·ªán tr√™n 2976 m·∫´u, bao g·ªìm 24 features.
Ki·ªÉm tra tr√™n 359 m·∫´u, bao g·ªìm 24 features.


In [4]:
# =============================================================================
# SECTION 3: HU·∫§N LUY·ªÜN V√Ä ƒê√ÅNH GI√Å M√î H√åNH
# =============================================================================
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    early_stopping_rounds=50,
    random_state=42
)

# Hu·∫•n luy·ªán m√¥ h√¨nh
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

# ƒê√°nh gi√° m√¥ h√¨nh
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print(f"‚úÖ Hu·∫•n luy·ªán ho√†n t·∫•t! Sai s·ªë d·ª± ƒëo√°n trung b√¨nh (MAE): {mae:.2f} v·ªã tr√≠")

‚úÖ Hu·∫•n luy·ªán ho√†n t·∫•t! Sai s·ªë d·ª± ƒëo√°n trung b√¨nh (MAE): 3.23 v·ªã tr√≠


In [5]:
import optuna

def objective(trial):
    """
    H√†m m·ª•c ti√™u: th·ª≠ c√°c hyperparameters kh√°c nhau v√† return MAE
    """
    # ƒê·ªãnh nghƒ©a search space
    params = {
        'objective': 'reg:squarederror',
        'random_state': 42,
        'early_stopping_rounds': 50,  # ‚úÖ ƒê∆∞a v√†o constructor thay v√¨ fit()
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 3),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 10),
    }

    # Train model - early_stopping_rounds ƒë√£ c√≥ trong params
    model = XGBRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )

    # T√≠nh MAE
    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)

    return mae

print("‚úÖ Objective function ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a!")

‚úÖ Objective function ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a!


In [6]:
print("üîç B·∫Øt ƒë·∫ßu t√¨m ki·∫øm hyperparameters t·ªët nh·∫•t...")
print("‚è±Ô∏è  Qu√° tr√¨nh n√†y c√≥ th·ªÉ m·∫•t 10-30 ph√∫t t√πy thu·ªôc v√†o n_trials")
print("=" * 70)

# T·∫°o study
study = optuna.create_study(
    direction='minimize',
    study_name='xgboost_f1_optimization',
    sampler=optuna.samplers.TPESampler(seed=42)
)

# Ch·∫°y optimization
study.optimize(
    objective,
    n_trials=500,  # Th·ª≠ 100 l·∫ßn (c√≥ th·ªÉ tƒÉng l√™n 200-500)
    timeout=1800,   # 30 ph√∫t timeout
    show_progress_bar=True
)

print("\n‚úÖ Ho√†n th√†nh!")
print("=" * 70)
print(f"\nüéØ Best MAE: {study.best_value:.4f}")
print(f"\nüìã Best Hyperparameters:")
for param, value in study.best_params.items():
    print(f"   ‚Ä¢ {param:20s}: {value}")

[I 2025-10-15 22:37:34,508] A new study created in memory with name: xgboost_f1_optimization


üîç B·∫Øt ƒë·∫ßu t√¨m ki·∫øm hyperparameters t·ªët nh·∫•t...
‚è±Ô∏è  Qu√° tr√¨nh n√†y c√≥ th·ªÉ m·∫•t 10-30 ph√∫t t√πy thu·ªôc v√†o n_trials


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2025-10-15 22:37:34,654] Trial 0 finished with value: 3.383356586140178 and parameters: {'n_estimators': 812, 'learning_rate': 0.2536999076681772, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.6624074561769746, 'colsample_bytree': 0.662397808134481, 'gamma': 0.17425083650459838, 'reg_alpha': 4.330880728874676, 'reg_lambda': 6.41003510568888}. Best is trial 0 with value: 3.383356586140178.
[I 2025-10-15 22:37:35,389] Trial 1 finished with value: 3.2898211704987337 and parameters: {'n_estimators': 1446, 'learning_rate': 0.010725209743171996, 'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.6849356442713105, 'colsample_bytree': 0.6727299868828402, 'gamma': 0.5502135295603015, 'reg_alpha': 1.5212112147976886, 'reg_lambda': 5.72280788469014}. Best is trial 1 with value: 3.2898211704987337.
[I 2025-10-15 22:37:35,701] Trial 2 finished with value: 3.2463031180389743 and parameters: {'n_estimators': 921, 'learning_rate': 0.02692655251486473, 'max_depth': 7, 'min_child_weight':

In [15]:
print("üöÄ Training model v·ªõi hyperparameters t·ªët nh·∫•t...\n")

# T·∫°o params dict
best_params = study.best_params.copy()
best_params['objective'] = 'reg:squarederror'
best_params['random_state'] = 42
best_params['early_stopping_rounds'] = 50  # ‚úÖ Th√™m v√†o params

# Train - early_stopping_rounds ƒë√£ c√≥ trong best_params
final_model = XGBRegressor(**best_params)
final_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

# Evaluate
final_predictions = final_model.predict(X_test)
final_mae = mean_absolute_error(y_test, final_predictions)

print("‚úÖ Model ƒë√£ ƒë∆∞·ª£c train xong!")
print(final_mae)

üöÄ Training model v·ªõi hyperparameters t·ªët nh·∫•t...

‚úÖ Model ƒë√£ ƒë∆∞·ª£c train xong!
3.0630652263636042


In [18]:
print("B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán m√¥ h√¨nh v·ªõi b·ªô feature ƒë·∫ßy ƒë·ªß...")

# 1. Kh·ªüi t·∫°o m√¥ h√¨nh v·ªõi c√°c tham s·ªë t·ªët nh·∫•t ƒë√£ t√¨m ƒë∆∞·ª£c
# ƒê·∫£m b·∫£o c√°c tham s·ªë n√†y ƒë∆∞·ª£c t·ªëi ∆∞u tr√™n b·ªô feature ƒë·∫ßy ƒë·ªß

final_model = XGBRegressor(**best_params)

# 2. Hu·∫•n luy·ªán m√¥ h√¨nh tr√™n b·ªô d·ªØ li·ªáu ƒë·∫ßy ƒë·ªß (X_train, kh√¥ng ph·∫£i X_train_selected)
final_model.fit(X_train, y_train,
                eval_set=[(X_test, y_test)],
                verbose=False)

# 3. ƒê√°nh gi√° k·∫øt qu·∫£
final_predictions = final_model.predict(X_test)
final_mae = mean_absolute_error(y_test, final_predictions)

print("\n----------------------------------------------------")
print(f"‚úÖ HO√ÄN T·∫§T!")
print(f"   => MAE v·ªõi b·ªô feature ƒë·∫ßy ƒë·ªß: {final_mae:.4f} v·ªã tr√≠")
print("----------------------------------------------------")

B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán m√¥ h√¨nh v·ªõi b·ªô feature ƒë·∫ßy ƒë·ªß...

----------------------------------------------------
‚úÖ HO√ÄN T·∫§T!
   => MAE v·ªõi b·ªô feature ƒë·∫ßy ƒë·ªß: 3.0631 v·ªã tr√≠
----------------------------------------------------


In [19]:
import joblib
# --- L∆ØU MODEL V√Ä DANH S√ÅCH FEATURES ---

# 1. ƒê·ªãnh nghƒ©a t√™n file ƒë·ªÉ l∆∞u
model_filename = '../models/f1_prediction_model.joblib'
features_filename = '../models/f1_model_features.joblib'

# 2. L∆∞u ƒë·ªëi t∆∞·ª£ng model ƒë√£ hu·∫•n luy·ªán
joblib.dump(final_model, model_filename)

# 3. L·∫•y ra danh s√°ch t√™n c√°c feature v√† l∆∞u l·∫°i
feature_list = X_train.columns.tolist()
joblib.dump(feature_list, features_filename)

print(f"‚úÖ Model ƒë√£ ƒë∆∞·ª£c l∆∞u th√†nh c√¥ng v√†o file: '{model_filename}'")
print(f"‚úÖ Danh s√°ch {len(feature_list)} features ƒë√£ ƒë∆∞·ª£c l∆∞u v√†o file: '{features_filename}'")

‚úÖ Model ƒë√£ ƒë∆∞·ª£c l∆∞u th√†nh c√¥ng v√†o file: 'f1_prediction_model.joblib'
‚úÖ Danh s√°ch 24 features ƒë√£ ƒë∆∞·ª£c l∆∞u v√†o file: 'f1_model_features.joblib'
