In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
 
# Read the records from the CSV file, discard the 'SEQNO' column, and keep
# only complete rows (imputation would be the alternative to dropping them).
df = pd.read_csv("Model1.csv").drop(columns=['SEQNO']).dropna()

# One-hot encode categorical columns; drop_first=True omits the first level
# of every encoded column.
df = pd.get_dummies(df, drop_first=True)

# NOTE(review): a column named 'ID' survives into the feature matrix (it shows
# up in the importance listing printed by this cell). If it is a row
# identifier like 'SEQNO', it presumably should be dropped as well -- confirm.

# Separate the target ('DIABETE4') from the predictors, then hold out 20% of
# the rows for testing with a fixed seed so the split is reproducible.
y = df['DIABETE4']
X = df.drop(columns='DIABETE4')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features. The scaler is fitted on the training rows only and
# then applied to both partitions, so no test-set statistics reach the model.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline model: 5-nearest-neighbours on the full, scaled feature set.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Test accuracy of the baseline; the feature-importance loop compares against it.
original_accuracy = accuracy_score(y_test, knn.predict(X_test_scaled))
 
# Drop-column feature importance: for every feature, retrain the same
# scale -> 5-NN pipeline without that feature and record how far test accuracy
# moves relative to the full-feature baseline (`original_accuracy`).
#   importance > 0  -> accuracy fell when the feature was removed
#   importance < 0  -> accuracy rose when the feature was removed
# Accuracy is a fraction of test rows, so every importance is a multiple of
# 1 / len(y_test); very small values correspond to only a few test rows
# changing prediction and should be read with caution.
feature_importances = {}

for feature in X_train.columns:
    # Build train/test matrices that omit the current feature.
    X_train_reduced = X_train.drop(columns=[feature])
    X_test_reduced = X_test.drop(columns=[feature])

    # Use a scaler private to this iteration. Refitting the shared `scaler`
    # here would leave it fitted on a reduced feature set after the loop,
    # out of sync with `knn` and the full-width scaled matrices.
    reduced_scaler = StandardScaler()
    X_train_reduced_scaled = reduced_scaler.fit_transform(X_train_reduced)
    X_test_reduced_scaled = reduced_scaler.transform(X_test_reduced)

    # Train a fresh model with the same hyperparameters as the baseline.
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train_reduced_scaled, y_train)

    # Score the reduced model on the held-out rows.
    reduced_accuracy = accuracy_score(y_test, model.predict(X_test_reduced_scaled))

    # Importance is the accuracy lost by removing the feature.
    importance = original_accuracy - reduced_accuracy
    feature_importances[feature] = importance
 
# Rank the features from the largest accuracy drop to the largest accuracy
# gain. list.sort is stable, so features with equal importance keep the order
# in which they were evaluated.
sorted_importances = list(feature_importances.items())
sorted_importances.sort(key=lambda pair: pair[1], reverse=True)

# One "name: delta" line per feature, most important first.
print("Feature Importances:")
for feature, importance in sorted_importances:
    print(f"{feature}: {importance}")

# Report the baseline (all-features) test accuracy computed earlier.
accuracy = original_accuracy
print("Accuracy of the KNN model:", accuracy)


Feature Importances:
_SEX: 0.004105090311986803
FMONTH: 0.0024630541871920597
WTKG3: 0.0024630541871920597
_BMI5: 0.0024630541871920597
_SMOKER3: 0.0024630541871920597
SLEPTIM1: 0.0016420361247947435
RENTHOM1: 0.0016420361247947435
AVEDRNK3: 0.0016420361247947435
ID: 0.0008210180623973162
MEDCOST1: 0.0008210180623973162
MARITAL: 0.0008210180623973162
PREGNANT: 0.0008210180623973162
SDHSTRE1: 0.0008210180623973162
_HLTHPLN: 0.0008210180623973162
_RACEPR1: 0.0008210180623973162
_CHLDCNT: 0.0008210180623973162
_STATE: 0.0
GENHLTH: 0.0
CHECKUP1: 0.0
DIFFWALK: 0.0
QSTLANG: 0.0
_METSTAT: 0.0
_BMI5CAT: 0.0
ALCDAY30: 0.0
ALCCALC: 0.0
CVDINFR4: -0.0008210180623974273
_PHYS14D: -0.0008210180623974273
_TOTINDA: -0.0008210180623974273
_ASTHMS1: -0.0008210180623974273
_EDUCAG: -0.0008210180623974273
ALCDAY4: -0.0008210180623974273
ALCCALCCAT: -0.0008210180623974273
EMPLOY1: -0.0016420361247947435
_MENT14D: -0.0016420361247947435
_MICHD: -0.0024630541871921707
_AGE_G: -0.0024630541871921707
HTM4: -0