<a href="https://colab.research.google.com/github/aymenchibouti/newversion/blob/master/best_tuning_model_XAI_res.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install shap
!pip install lime
!pip install xgboost
!pip install imblearn
!pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=4665bccab2764e9999a225db3e3372f20a269644bc5b79a74689b89ed0f86ee2
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import shap
import lime
import lime.lime_tabular
import xgboost as xgb
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight

# Load the dataset
file_path = 'model1_210_features.csv'
data = pd.read_csv(file_path)

# Drop non-numeric columns that are not useful for prediction
data = data.drop(columns=['username', 'course_id', 'enrollment_id'])

# Handle missing values (fill with 0 or use mean/median imputation as necessary)
data.fillna(0, inplace=True)

# Separate features and target variable
X = data.drop(columns=['dropout'])  # Features
y = data['dropout']  # Target variable

# Standardize the features (important for models like Logistic Regression and XGBoost)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Handle class imbalance using SMOTE (oversampling the minority class)
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# RandomizedSearchCV for hyperparameter tuning (for Random Forest)
param_dist_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42)
rf_random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist_rf, n_iter=100, cv=3, random_state=42)
rf_random_search.fit(X_train, y_train)

# XGBoost RandomizedSearchCV (hyperparameter tuning for XGBoost)
param_dist_xgb = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [50, 100, 150],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

xgb_model = xgb.XGBClassifier(random_state=42)
xgb_random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist_xgb, n_iter=100, cv=3, random_state=42)
xgb_random_search.fit(X_train, y_train)

# Logistic Regression with class weight to handle imbalance
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train, y_train)

# Best models from RandomizedSearchCV
best_rf = rf_random_search.best_estimator_
best_xgb = xgb_random_search.best_estimator_

# Predictions and evaluation for each model
models = [lr, best_rf, best_xgb]
model_names = ['Logistic Regression', 'Random Forest (Tuned)', 'XGBoost (Tuned)']

for model, name in zip(models, model_names):
    y_pred = model.predict(X_test)
    print(f"Evaluation for {name}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
    print("-" * 50)

# Explainability using SHAP (SHAP can work with tree-based models like RandomForest and XGBoost)
explainer_rf = shap.TreeExplainer(best_rf)
shap_values_rf = explainer_rf.shap_values(X_test)

# SHAP summary plot for Random Forest
shap.summary_plot(shap_values_rf, X_test, feature_names=X.columns)

# LIME - Local Interpretable Model-Agnostic Explanations
explainer_lime = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train,
    training_labels=y_train,
    mode="classification",
    feature_names=X.columns,
    class_names=["No Dropout", "Dropout"],
    verbose=True,
    random_state=42
)

# Pick a single instance for LIME explanation
instance = X_test[0]  # First instance in the test set
explanation_lime = explainer_lime.explain_instance(instance, best_xgb.predict_proba)

# Visualize LIME explanation
explanation_lime.show_in_notebook()

# Feature Importance (using Random Forest)
feature_importance = best_rf.feature_importances_
# Visualize the feature importance
plt.figure(figsize=(10, 8))
plt.barh(X.columns, feature_importance)
plt.title("Feature Importance (Random Forest - Tuned)")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()


In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
import warnings
warnings.filterwarnings('ignore')

# Deep Learning imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l1_l2

# Load the dataset
file_path = 'model1_210_features.csv'  # Update with your file path
data = pd.read_csv(file_path)

print(f"Dataset shape: {data.shape}")
print(f"Target distribution:\n{data['dropout'].value_counts()}")
print(f"Class distribution: {data['dropout'].value_counts(normalize=True)}")

# Drop non-numeric columns that are not useful for prediction
data = data.drop(columns=['username', 'course_id', 'enrollment_id'])

# Handle missing values
data.fillna(0, inplace=True)

# Separate features and target variable
X = data.drop(columns=['dropout'])
y = data['dropout']

print(f"Features shape: {X.shape}")
print(f"Feature names: {list(X.columns[:10])}...")  # Show first 10 features

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Handle class imbalance - try different resampling strategies
print("\n=== Testing Different Resampling Strategies ===")

# 1. SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_smote, y_smote = smote.fit_resample(X_scaled, y)

# 2. ADASYN
adasyn = ADASYN(sampling_strategy='auto', random_state=42)
X_adasyn, y_adasyn = adasyn.fit_resample(X_scaled, y)

# 3. SMOTETomek (combination of over and under sampling)
smotetomek = SMOTETomek(sampling_strategy='auto', random_state=42)
X_smotetomek, y_smotetomek = smotetomek.fit_resample(X_scaled, y)

# Choose the best resampling strategy (we'll use SMOTE for now)
X_resampled, y_resampled = X_smote, y_smote
print(f"After resampling: {X_resampled.shape}, Target distribution: {np.bincount(y_resampled)}")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# Cross-validation strategy
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("\n=== Model Training and Hyperparameter Tuning ===")

# Dictionary to store results
results = {}

# 1. XGBoost with extensive hyperparameter tuning
print("1. Training XGBoost...")
xgb_params = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'n_estimators': [100, 200, 300, 500],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
xgb_random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_params,
    n_iter=200,
    cv=cv_strategy,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
xgb_random_search.fit(X_train, y_train)
results['XGBoost'] = xgb_random_search.best_estimator_

print(f"Best XGBoost params: {xgb_random_search.best_params_}")
print(f"Best XGBoost CV score: {xgb_random_search.best_score_:.4f}")

# 2. LightGBM
print("2. Training LightGBM...")
lgb_params = {
    'num_leaves': [31, 50, 70, 100],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'feature_fraction': [0.8, 0.9, 1.0],
    'bagging_fraction': [0.8, 0.9, 1.0],
    'bagging_freq': [1, 5, 10],
    'min_child_samples': [5, 10, 20],
    'n_estimators': [100, 200, 300],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

lgb_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
lgb_random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=lgb_params,
    n_iter=150,
    cv=cv_strategy,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
lgb_random_search.fit(X_train, y_train)
results['LightGBM'] = lgb_random_search.best_estimator_

print(f"Best LightGBM params: {lgb_random_search.best_params_}")
print(f"Best LightGBM CV score: {lgb_random_search.best_score_:.4f}")

# 3. CatBoost
print("3. Training CatBoost...")
catboost_params = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'iterations': [100, 200, 300, 500],
    'l2_leaf_reg': [1, 3, 5, 7],
    'border_count': [32, 64, 128],
    'random_strength': [1, 2, 3]
}

catboost_model = cb.CatBoostClassifier(random_state=42, verbose=False)
catboost_random_search = RandomizedSearchCV(
    estimator=catboost_model,
    param_distributions=catboost_params,
    n_iter=100,
    cv=cv_strategy,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
catboost_random_search.fit(X_train, y_train)
results['CatBoost'] = catboost_random_search.best_estimator_

print(f"Best CatBoost params: {catboost_random_search.best_params_}")
print(f"Best CatBoost CV score: {catboost_random_search.best_score_:.4f}")

# 4. Random Forest with extensive tuning
print("4. Training Random Forest...")
rf_params = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

rf_model = RandomForestClassifier(random_state=42)
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=rf_params,
    n_iter=150,
    cv=cv_strategy,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
rf_random_search.fit(X_train, y_train)
results['Random Forest'] = rf_random_search.best_estimator_

print(f"Best Random Forest params: {rf_random_search.best_params_}")
print(f"Best Random Forest CV score: {rf_random_search.best_score_:.4f}")

# 5. Extra Trees
print("5. Training Extra Trees...")
et_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

et_model = ExtraTreesClassifier(random_state=42)
et_random_search = RandomizedSearchCV(
    estimator=et_model,
    param_distributions=et_params,
    n_iter=100,
    cv=cv_strategy,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
et_random_search.fit(X_train, y_train)
results['Extra Trees'] = et_random_search.best_estimator_

# 6. Gradient Boosting
print("6. Training Gradient Boosting...")
gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 0.9, 1.0]
}

gb_model = GradientBoostingClassifier(random_state=42)
gb_random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=gb_params,
    n_iter=100,
    cv=cv_strategy,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
gb_random_search.fit(X_train, y_train)
results['Gradient Boosting'] = gb_random_search.best_estimator_

# 7. SVM
print("7. Training SVM...")
svm_params = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

svm_model = SVC(random_state=42, probability=True)
svm_random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=svm_params,
    n_iter=50,
    cv=cv_strategy,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
svm_random_search.fit(X_train, y_train)
results['SVM'] = svm_random_search.best_estimator_

# 8. Neural Network (sklearn)
print("8. Training MLP Classifier...")
mlp_params = {
    'hidden_layer_sizes': [(100,), (200,), (100, 50), (200, 100), (300, 200, 100)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam', 'lbfgs'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [1000]
}

mlp_model = MLPClassifier(random_state=42)
mlp_random_search = RandomizedSearchCV(
    estimator=mlp_model,
    param_distributions=mlp_params,
    n_iter=50,
    cv=cv_strategy,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
mlp_random_search.fit(X_train, y_train)
results['MLP'] = mlp_random_search.best_estimator_

# 9. Deep Learning with Keras/TensorFlow
print("9. Training Deep Neural Network...")

def create_dnn_model(input_dim, dropout_rate=0.3, l1_reg=0.01, l2_reg=0.01):
    model = Sequential([
        Dense(512, activation='relu', input_dim=input_dim,
              kernel_regularizer=l1_l2(l1=l1_reg, l2=l2_reg)),
        BatchNormalization(),
        Dropout(dropout_rate),

        Dense(256, activation='relu',
              kernel_regularizer=l1_l2(l1=l1_reg, l2=l2_reg)),
        BatchNormalization(),
        Dropout(dropout_rate),

        Dense(128, activation='relu',
              kernel_regularizer=l1_l2(l1=l1_reg, l2=l2_reg)),
        BatchNormalization(),
        Dropout(dropout_rate),

        Dense(64, activation='relu',
              kernel_regularizer=l1_l2(l1=l1_reg, l2=l2_reg)),
        Dropout(dropout_rate),

        Dense(1, activation='sigmoid')
    ])
    return model

# Best DNN model after some manual tuning
dnn_model = create_dnn_model(X_train.shape[1], dropout_rate=0.4, l1_reg=0.001, l2_reg=0.01)
dnn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=8, min_lr=0.0001)

# Train DNN
history = dnn_model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=128,
    callbacks=[early_stopping, reduce_lr],
    verbose=0
)

results['Deep Neural Network'] = dnn_model

print("\n=== Model Evaluation ===")

# Evaluate all models
evaluation_results = []

for name, model in results.items():
    if name == 'Deep Neural Network':
        # For Keras model
        y_pred_proba = model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).flatten()
    else:
        # For sklearn models
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    evaluation_results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC-ROC': auc
    })

    print(f"\n{name}:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")
    print(f"  AUC-ROC:   {auc:.4f}")

# Create results DataFrame
results_df = pd.DataFrame(evaluation_results)
results_df = results_df.sort_values('Accuracy', ascending=False)

print("\n=== Final Rankings ===")
print(results_df.to_string(index=False))

# Visualization
plt.figure(figsize=(15, 10))

# Accuracy comparison
plt.subplot(2, 2, 1)
plt.barh(results_df['Model'], results_df['Accuracy'])
plt.title('Model Accuracy Comparison')
plt.xlabel('Accuracy')
plt.xlim(0, 1)
for i, v in enumerate(results_df['Accuracy']):
    plt.text(v + 0.01, i, f'{v:.4f}', va='center')

# F1-Score comparison
plt.subplot(2, 2, 2)
plt.barh(results_df['Model'], results_df['F1-Score'])
plt.title('Model F1-Score Comparison')
plt.xlabel('F1-Score')
plt.xlim(0, 1)
for i, v in enumerate(results_df['F1-Score']):
    plt.text(v + 0.01, i, f'{v:.4f}', va='center')

# AUC-ROC comparison
plt.subplot(2, 2, 3)
plt.barh(results_df['Model'], results_df['AUC-ROC'])
plt.title('Model AUC-ROC Comparison')
plt.xlabel('AUC-ROC')
plt.xlim(0, 1)
for i, v in enumerate(results_df['AUC-ROC']):
    plt.text(v + 0.01, i, f'{v:.4f}', va='center')

# All metrics heatmap
plt.subplot(2, 2, 4)
metrics_for_heatmap = results_df.set_index('Model')[['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC']]
sns.heatmap(metrics_for_heatmap, annot=True, cmap='viridis', fmt='.4f')
plt.title('All Metrics Heatmap')

plt.tight_layout()
plt.show()

# Feature importance for the best tree-based model
best_model_name = results_df.iloc[0]['Model']
if best_model_name in ['XGBoost', 'LightGBM', 'CatBoost', 'Random Forest', 'Extra Trees', 'Gradient Boosting']:
    best_model = results[best_model_name]

    if hasattr(best_model, 'feature_importances_'):
        feature_importance = best_model.feature_importances_
    elif hasattr(best_model, 'get_feature_importance'):  # CatBoost
        feature_importance = best_model.get_feature_importance()
    else:
        feature_importance = None

    if feature_importance is not None:
        # Get top 20 most important features
        feature_names = X.columns
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': feature_importance
        }).sort_values('importance', ascending=False).head(20)

        plt.figure(figsize=(12, 8))
        plt.barh(importance_df['feature'], importance_df['importance'])
        plt.title(f'Top 20 Feature Importances - {best_model_name}')
        plt.xlabel('Importance')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()

print(f"\n=== Best Model: {results_df.iloc[0]['Model']} ===")
print(f"Best Accuracy: {results_df.iloc[0]['Accuracy']:.4f}")
print(f"Best F1-Score: {results_df.iloc[0]['F1-Score']:.4f}")
print(f"Best AUC-ROC: {results_df.iloc[0]['AUC-ROC']:.4f}")

# Save the best model
best_model = results[results_df.iloc[0]['Model']]
if results_df.iloc[0]['Model'] != 'Deep Neural Network':
    import joblib
    joblib.dump(best_model, f'best_model_{results_df.iloc[0]["Model"].replace(" ", "_").lower()}.pkl')
    print(f"\nBest model saved as: best_model_{results_df.iloc[0]['Model'].replace(' ', '_').lower()}.pkl")
else:
    best_model.save('best_model_deep_neural_network.h5')
    print(f"\nBest model saved as: best_model_deep_neural_network.h5")

Dataset shape: (120542, 214)
Target distribution:
dropout
1    95581
0    24961
Name: count, dtype: int64
Class distribution: dropout
1    0.792927
0    0.207073
Name: proportion, dtype: float64
Features shape: (120542, 210)
Feature names: ['day_1_access', 'day_1_problem', 'day_1_wiki', 'day_1_discussion', 'day_1_navigate', 'day_1_page_close', 'day_1_video', 'day_2_access', 'day_2_problem', 'day_2_wiki']...

=== Testing Different Resampling Strategies ===
After resampling: (191162, 210), Target distribution: [95581 95581]

=== Model Training and Hyperparameter Tuning ===
1. Training XGBoost...
Best XGBoost params: {'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0.5, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.15, 'gamma': 0.1, 'colsample_bytree': 1.0}
Best XGBoost CV score: 0.8805
2. Training LightGBM...
Best LightGBM params: {'reg_lambda': 0, 'reg_alpha': 0.5, 'num_leaves': 100, 'n_estimators': 300, 'min_child_samples': 20, 'learning_rate': 0.1, 'fe