In [4]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.2-py3-none-macosx_12_0_arm64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.2-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.2
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Load your preprocessed dataset
data = pd.read_csv('Model1.csv')  # Replace 'your_preprocessed_dataset.csv' with your dataset filename

# Separate features and target variable
X = data.drop(['DIABETE4', 'SEQNO'], axis=1)  # Features (dropping 'SEQNO')
y = data['DIABETE4']  # Target variable

# Encode target variable using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Instantiate XGBoost classifier
xgb = XGBClassifier(random_state=42)

# Fit the model
xgb.fit(X_train, y_train_encoded)

# Get feature importance
feature_importance = xgb.feature_importances_

# Create a DataFrame to store feature importance
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Make predictions on the test set
y_pred_encoded = xgb.predict(X_test)

# Decode predictions
y_pred_decoded = label_encoder.inverse_transform(y_pred_encoded)

# Decode test target variable for evaluation
y_test_decoded = label_encoder.inverse_transform(y_test_encoded)

# Evaluate the model
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print key predictors with their importance
print("Key Predictors and their Importance:")
print(feature_importance_df)

# Export feature importance to a CSV file
feature_importance_df.to_csv("XGBoost importance.csv")


Accuracy: 84.87%
Key Predictors and their Importance:
       Feature  Importance
3      GENHLTH    0.064079
25      _AGE_G    0.061278
24        _SEX    0.058734
12    PREGNANT    0.048081
7     CVDINFR4    0.043786
21      _MICHD    0.037310
5     CHECKUP1    0.030611
23    _RACEPR1    0.027776
20    _TOTINDA    0.026138
37     ALCCALC    0.025427
28       _BMI5    0.024580
36    ALCDAY30    0.023270
11     EMPLOY1    0.022580
31     _EDUCAG    0.022302
18    _MENT14D    0.022127
35    AVEDRNK3    0.021877
0           ID    0.021838
4     MEDCOST1    0.021688
34     ALCDAY4    0.021628
26        HTM4    0.021603
29    _BMI5CAT    0.021582
16    _METSTAT    0.021466
2       FMONTH    0.021295
22    _ASTHMS1    0.021287
32    _INCOMG1    0.021251
13    DIFFWALK    0.020711
15     QSTLANG    0.020327
27       WTKG3    0.020316
17    _PHYS14D    0.020281
9      MARITAL    0.020147
14    SDHSTRE1    0.019901
10    RENTHOM1    0.019343
30    _CHLDCNT    0.019249
8     ADDEPEV3    0.019207
3