In [3]:
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [4]:
# Read the preprocessed dataset from the working directory.
data = pd.read_csv('Model1.csv')

# 'DIABETE4' is the label to predict. Every other column except 'SEQNO'
# is used as a model input.
y = data['DIABETE4']
X = data.drop(columns=['DIABETE4', 'SEQNO'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Histogram-based gradient boosting classifier; the seed is pinned so that
# repeated runs build the same model.
hist_gradient_boosting = HistGradientBoostingClassifier(random_state=42)

# Train on the training split. `fit` is left as the cell's final expression,
# so the notebook still echoes the fitted estimator.
hist_gradient_boosting.fit(X=X_train, y=y_train)

In [6]:
# Predict a label for every row of the held-out split.
y_pred = hist_gradient_boosting.predict(X_test)

# Compare the predictions against the true held-out labels and report the
# resulting accuracy as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy using HistGradientBoostingClassifier: {accuracy * 100:.2f}%")

Accuracy using HistGradientBoostingClassifier: 85.14%


In [10]:
# Permutation feature importance, measured on the held-out split.
#
# `permutation_importance` is imported in the notebook's top import cell.
#
# The classifier is NOT re-fitted here: it was already trained on
# (X_train, y_train) in the fitting cell, and because it is constructed with
# random_state=42 a second fit on the same data would only repeat that
# training run without changing the model.
result = permutation_importance(
    hist_gradient_boosting,
    X_test,
    y_test,
    n_repeats=10,      # every feature column is shuffled 10 times
    random_state=42,   # fixed seed so the shuffles are repeatable
)

# One row per feature. `importances_mean` is the average, over the repeats,
# of the decrease in the model's score after shuffling that feature
# (see the scikit-learn permutation_importance documentation).
importance = result.importances_mean
# Names come from X_test so they line up with the columns that were permuted.
feature_names = X_test.columns
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importance}
).sort_values(by='Importance', ascending=False)

# Show the complete ranking exactly once. `option_context` lifts the row limit
# only inside this `with` block, so the display settings used by the rest of
# the notebook are left unchanged (unlike a global `pd.set_option`).
print("Feature Importance Ranking:")
with pd.option_context('display.max_rows', None):
    print(feature_importance_df)

# Persist the ranking. The DataFrame index (each feature's original column
# position) is written as the first, unnamed CSV column.
feature_importance_df.to_csv("importance_score.csv")


Feature Importance Ranking:
       Feature  Importance
28       _BMI5    0.003668
25      _AGE_G    0.002109
5     CHECKUP1    0.001559
23    _RACEPR1    0.001421
32    _INCOMG1    0.001421
7     CVDINFR4    0.000917
9      MARITAL    0.000596
24        _SEX    0.000459
19    _HLTHPLN    0.000321
11     EMPLOY1    0.000321
13    DIFFWALK    0.000092
36    ALCDAY30    0.000046
33    _SMOKER3    0.000046
4     MEDCOST1    0.000046
16    _METSTAT    0.000046
1       _STATE    0.000000
38  ALCCALCCAT    0.000000
15     QSTLANG    0.000000
29    _BMI5CAT   -0.000275
27       WTKG3   -0.000321
8     ADDEPEV3   -0.000459
14    SDHSTRE1   -0.000459
17    _PHYS14D   -0.000504
30    _CHLDCNT   -0.000642
6     SLEPTIM1   -0.000688
18    _MENT14D   -0.000688
31     _EDUCAG   -0.000688
35    AVEDRNK3   -0.000779
21      _MICHD   -0.000871
2       FMONTH   -0.001146
10    RENTHOM1   -0.001238
12    PREGNANT   -0.001467
20    _TOTINDA   -0.001513
22    _ASTHMS1   -0.001696
26        HTM4   -0.001696
