In [26]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from scipy.optimize import minimize

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore")


In [15]:
df = pd.read_csv("df.csv")


In [16]:
numerical_features = [
    'x_llcpwt','x_age80','x_bmi5','weight2','wtkg3','height3',
    'children','hhadult','alcday5','menthlth','physhlth'
]

categorical_features = [
    'smoke100','x_bmi5cat','genhlth','x_age_g','x_asthms1','x_ltasth1','x_casthm1',
    'x_totinda','exerany2','x_ment14d','x_phys14d','sex1','x_imprace','x_prace1','x_racegr3',
    'x_educag','educa','marital','x_incomg','x_metstat','x_urbstat','employ1','x_state',
    'persdoc2','checkup1','cvdinfr4','cvdcrhd4','cvdstrk3','asthma3','chccopd1','havarth3','addepev2',
    'chckdny1','diabete3','diffwalk','pneuvac4','x_rfhlth','x_rfsmok3','x_ageg5yr'
]
categorical_features.remove("smoke100")
categorical_features.remove("x_rfsmok3")
numerical_features.remove("x_llcpwt")

features = numerical_features + categorical_features


In [17]:
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_features),
                  ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])


## روش آنسامبل زمانی (Temporal Ensemble)

در این روش، ما برای هر سال یک مدل جداگانه آموزش می‌دهیم و سپس پیش‌بینی‌هایشان را با هم ترکیب می‌کنیم.

In [18]:
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42, stratify=df['x_smoker3'])
X_test = test_df[features]
y_test_raw = test_df['x_smoker3']
le = LabelEncoder().fit(train_df['x_smoker3']) 
y_test = le.transform(y_test_raw)


In [19]:
preprocessor_fitted = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)
preprocessor_fitted.fit(train_df[features])


trained_temporal_models = {}
available_years = sorted(train_df['year'].unique())

for year in available_years:
    print(f"model train for {year}")
    
    df_year_train = train_df.loc[train_df['year'] == year].copy()
    if df_year_train.empty:
        continue
    
    X_year_train = df_year_train[features]
    y_year_train = le.transform(df_year_train['x_smoker3'])
    
    model = lgb.LGBMClassifier(random_state=42, force_col_wise=True)
    
    X_year_train_transformed = preprocessor_fitted.transform(X_year_train)
    model.fit(X_year_train_transformed, y_year_train)
    trained_temporal_models[year] = model
print("\n models train completed!")


model train for 2018
[LightGBM] [Info] Number of positive: 88887, number of negative: 45761
[LightGBM] [Info] Total Bins 1496
[LightGBM] [Info] Number of data points in the train set: 134648, number of used features: 231
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.660143 -> initscore=0.663934
[LightGBM] [Info] Start training from score 0.663934
model train for 2019
[LightGBM] [Info] Number of positive: 83790, number of negative: 42383
[LightGBM] [Info] Total Bins 1483
[LightGBM] [Info] Number of data points in the train set: 126173, number of used features: 228
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.664088 -> initscore=0.681566
[LightGBM] [Info] Start training from score 0.681566
model train for 2020
[LightGBM] [Info] Number of positive: 78050, number of negative: 39341
[LightGBM] [Info] Total Bins 1491
[LightGBM] [Info] Number of data points in the train set: 117391, number of used features: 229
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.664872 -> initscore=0.6

In [20]:
# evaluate
print("combine and predict Temporal Ensemble")
X_test_transformed = preprocessor_fitted.transform(X_test)

predictions_per_model = {}
for year, model in trained_temporal_models.items():
    predictions_per_model[year] = model.predict_proba(X_test_transformed)[:, 1]

predictions_df = pd.DataFrame(predictions_per_model)


# method 1 : Temporal Ensemble with sample mean
print("\nresult of Temporal Ensemble with mean:")
y_pred_proba_simple = predictions_df.mean(axis=1)

y_pred_class_simple = (y_pred_proba_simple > 0.5).astype(int)

auc_simple = roc_auc_score(y_test, y_pred_proba_simple)
accuracy_simple = accuracy_score(y_test, y_pred_class_simple)
f1_simple = f1_score(y_test, y_pred_class_simple, average='weighted')

print(f"AUC: {auc_simple:.4f}")
print(f"Accuracy: {accuracy_simple:.4f}")
print(f"F1: {f1_simple:.4f}")


# method 2 : Temporal Ensemble with weights mean
print("\nresult of Temporal Ensemble with weighted")

# weight : 2018,2019,2020,2021,2022,2023
weights_temporal = np.array([0.05, 0.1, 0.1, 0.2, 0.25, 0.3])
y_pred_proba_weighted = np.average(predictions_df, axis=1, weights=weights_temporal)

y_pred_class_weighted = (y_pred_proba_weighted > 0.5).astype(int)

auc_weighted = roc_auc_score(y_test, y_pred_proba_weighted)
accuracy_weighted = accuracy_score(y_test, y_pred_class_weighted)
f1_weighted = f1_score(y_test, y_pred_class_weighted, average='weighted')

print(f"امتیاز AUC: {auc_weighted:.4f}")
print(f"دقت (Accuracy): {accuracy_weighted:.4f}")
print(f"امتیاز F1 (Weighted): {f1_weighted:.4f}")


# compare result of Temporal Ensemble
comparison_data = {
    'Method': ['Simple Average Ensemble', 'Weighted Average Ensemble'],
    'AUC': [auc_simple, auc_weighted],
    'Accuracy': [accuracy_simple, accuracy_weighted],
    'F1-Score': [f1_simple, f1_weighted]
}
comparison_df = pd.DataFrame(comparison_data).set_index('Method')
print("\ncompare Temporal Ensemble with sample/weighted mean:")
print(comparison_df)


combine and predict Temporal Ensemble

result of Temporal Ensemble with mean:
AUC: 0.7852
Accuracy: 0.7435
F1: 0.7316

result of Temporal Ensemble with weighted
امتیاز AUC: 0.7855
دقت (Accuracy): 0.7435
امتیاز F1 (Weighted): 0.7300

compare Temporal Ensemble with sample/weighted mean:
                                AUC  Accuracy  F1-Score
Method                                                 
Simple Average Ensemble    0.785189  0.743469  0.731618
Weighted Average Ensemble  0.785549  0.743481  0.729952


In [29]:
train_sub_df, validation_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['x_smoker3'])

X_val = validation_df[features]
y_val_raw = validation_df['x_smoker3']
y_val = le.transform(y_val_raw)

# --- گام ۱: دریافت پیش‌بینی‌ها روی داده‌های اعتبارسنجی ---
X_val_transformed = preprocessor_fitted.transform(X_val)
predictions_per_model_val = {}
for year, model in trained_temporal_models.items():
    predictions_per_model_val[year] = model.predict_proba(X_val_transformed)[:, 1]

predictions_df_val = pd.DataFrame(predictions_per_model_val)

# --- گام ۲: تعریف تابع هدف، محدودیت‌ها و اجرای بهینه‌ساز ---

def objective_function(weights, predictions, true_labels):
    """تابع هدف که 1 منهای AUC را برمی‌گرداند تا کمینه شود."""
    y_pred_ensemble = np.average(predictions, axis=1, weights=weights)
    auc = roc_auc_score(true_labels, y_pred_ensemble)
    return 1 - auc

num_models = len(available_years)
bounds = tuple([(0, 1) for _ in range(num_models)]) # هر وزن بین 0 و 1
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1}) # جمع وزن‌ها برابر با 1
initial_guess = np.array([1/num_models] * num_models) # حدس اولیه با وزن‌های برابر

optimization_result = minimize(
    fun=objective_function,
    x0=initial_guess,
    args=(predictions_df_val, y_val), # ارسال داده‌ها به تابع هدف
    method='SLSQP',
    bounds=bounds,
    constraints=constraints
)

# --- گام ۳: استخراج و نمایش نتایج بهینه‌سازی ---
if optimization_result.success:
    optimized_weights = optimization_result.x
    best_auc_on_val = 1 - optimization_result.fun
    print("بهینه‌سازی موفق بود.")
    print(f"بهترین وزن‌های پیدا شده: {np.round(optimized_weights, 3)}")
    print(f"بهترین امتیاز AUC روی مجموعه اعتبارسنجی: {best_auc_on_val:.4f}")
else:
    print("بهینه‌سازی با شکست مواجه شد. از وزن‌های پیش‌فرض استفاده می‌شود.")
    print(optimization_result.message)
    optimized_weights = initial_guess # در صورت شکست، از وزن‌های برابر استفاده کن


# --- گام ۴: ارزیابی نهایی با وزن‌های بهینه شده روی مجموعه تست (Test Set) ---
y_pred_proba_optimized = np.average(predictions_df, axis=1, weights=optimized_weights)
y_pred_class_optimized = (y_pred_proba_optimized > 0.5).astype(int)

auc_optimized = roc_auc_score(y_test, y_pred_proba_optimized)
accuracy_optimized = accuracy_score(y_test, y_pred_class_optimized)
f1_optimized = f1_score(y_test, y_pred_class_optimized, average='weighted')

print(f"امتیاز AUC: {auc_optimized:.4f}")
print(f"دقت (Accuracy): {accuracy_optimized:.4f}")
print(f"امتیاز F1 (Weighted): {f1_optimized:.4f}")



بهینه‌سازی موفق بود.
بهترین وزن‌های پیدا شده: [0.167 0.167 0.167 0.167 0.167 0.167]
بهترین امتیاز AUC روی مجموعه اعتبارسنجی: 0.7882
امتیاز AUC: 0.7852
دقت (Accuracy): 0.7435
امتیاز F1 (Weighted): 0.7316


In [30]:
comparison_data_full = {
    'Method': ['Simple Average Ensemble', 'Hardcoded Weighted Ensemble', 'Optimized Weighted Ensemble'],
    'AUC': [auc_simple, auc_weighted, auc_optimized],
    'Accuracy': [accuracy_simple, accuracy_weighted, accuracy_optimized],
    'F1-Score': [f1_simple, f1_weighted, f1_optimized]
}
comparison_df_full = pd.DataFrame(comparison_data_full).set_index('Method')
print("compare Temporal Ensemble with sample/weighted mean & optimized weights:")
print(comparison_df_full.sort_values(by='AUC', ascending=False))


compare Temporal Ensemble with sample/weighted mean & optimized weights:
                                  AUC  Accuracy  F1-Score
Method                                                   
Hardcoded Weighted Ensemble  0.785549  0.743481  0.729952
Simple Average Ensemble      0.785189  0.743469  0.731618
Optimized Weighted Ensemble  0.785189  0.743469  0.731618


# روش انباشت Stacking (Stacking Ensemble) 

In [21]:
X_train_for_stacking = train_df[features]
y_train_for_stacking = le.transform(train_df['x_smoker3'])

base_models = [('lgbm', lgb.LGBMClassifier(random_state=42, force_col_wise=True)),
               ('xgb', xgb.XGBClassifier(random_state=42, eval_metric='logloss'))]

stacked_pipelines = []
for name, model in base_models:
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    stacked_pipelines.append((name, pipeline))

meta_model = LogisticRegression()
stacking_model = StackingClassifier(estimators=stacked_pipelines, final_estimator=meta_model, cv=5)

stacking_model.fit(X_train_for_stacking, y_train_for_stacking)
print("Stacking model trained!")


[LightGBM] [Info] Number of positive: 505894, number of negative: 239326
[LightGBM] [Info] Total Bins 1523
[LightGBM] [Info] Number of data points in the train set: 745220, number of used features: 237
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.678852 -> initscore=0.748501
[LightGBM] [Info] Start training from score 0.748501
[LightGBM] [Info] Number of positive: 404715, number of negative: 191461
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 596176, number of used features: 237
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.678852 -> initscore=0.748499
[LightGBM] [Info] Start training from score 0.748499
[LightGBM] [Info] Number of positive: 404715, number of negative: 191461
[LightGBM] [Info] Total Bins 1528
[LightGBM] [Info] Number of data points in the train set: 596176, number of used features: 237
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.678852 -> initscore=0.748499
[LightGBM] [Info] Start training from score 0.74849

In [22]:
y_pred_stacking_proba = stacking_model.predict_proba(X_test)[:, 1]
y_pred_stacking_class = stacking_model.predict(X_test)

print("Staking evaluate results:")
print(f"AUC: {roc_auc_score(y_test, y_pred_stacking_proba):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_stacking_class):.4f}")
print(f"F1: {f1_score(y_test, y_pred_stacking_class, average='weighted'):.4f}")


final_meta_model = stacking_model.final_estimator_
# ضرایب متا-مدل
print("meta-model coeff (lgbm, xgb)", final_meta_model.coef_)


Staking evaluate results:
AUC: 0.7881
Accuracy: 0.7446
F1: 0.7330
meta-model coeff (lgbm, xgb) [[2.49886214 2.66476161]]


این ضرایب به شما می‌گویند که متا-مدل چقدر به پیش‌بینی‌های هر یک از مدل‌های پایه "وزن" یا "اهمیت" داده است.