In [15]:
#Decision Tree
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load your preprocessed dataset
data = pd.read_csv('Model1.csv')  # Replace 'your_preprocessed_dataset.csv' with your dataset filename

# Separate features and target variable
X = data.drop(['DIABETE4', 'SEQNO'], axis=1)  # Features (dropping 'SEQNO')
y = data['DIABETE4']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate DecisionTreeClassifier and fit the model
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Get feature importance
feature_importance = dt.feature_importances_

# Create a DataFrame to store feature importance
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print key predictors with their importance
print("Key Predictors and their Importance:")
print(feature_importance_df)

# Save feature importance to a CSV file
feature_importance_df.to_csv("DecisionTrees_Importance_scores.csv", index=False)

# Make predictions
y_pred = dt.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy: {accuracy * 100:.2f}%")

Key Predictors and their Importance:
       Feature  Importance
21      _MICHD    0.004565
7     CVDINFR4    0.003614
13    DIFFWALK    0.003003
4     MEDCOST1    0.000760
1       _STATE    0.000000
16    _METSTAT    0.000000
0           ID         NaN
2       FMONTH         NaN
3      GENHLTH         NaN
5     CHECKUP1         NaN
6     SLEPTIM1         NaN
8     ADDEPEV3         NaN
9      MARITAL         NaN
10    RENTHOM1         NaN
11     EMPLOY1         NaN
12    PREGNANT         NaN
14    SDHSTRE1         NaN
15     QSTLANG         NaN
17    _PHYS14D         NaN
18    _MENT14D         NaN
19    _HLTHPLN         NaN
20    _TOTINDA         NaN
22    _ASTHMS1         NaN
23    _RACEPR1         NaN
24        _SEX         NaN
25      _AGE_G         NaN
26        HTM4         NaN
27       WTKG3         NaN
28       _BMI5         NaN
29    _BMI5CAT         NaN
30    _CHLDCNT         NaN
31     _EDUCAG         NaN
32    _INCOMG1         NaN
33    _SMOKER3         NaN
34     ALCDAY4    

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load your preprocessed dataset
data = pd.read_csv('Model1.csv')  # Replace 'Model1.csv' with your dataset filename

# Specify columns of interest
columns_of_interest = [
    'GENHLTH', '_AGE_G', '_SEX', 'PREGNANT', 'CVDINFR4', '_MICHD', 'CHECKUP1', '_RACEPR1',
    '_TOTINDA', 'ALCCALC', '_BMI5', 'ALCDAY30', 'EMPLOY1', '_EDUCAG', '_MENT14D', 'AVEDRNK3',
    'ID', 'MEDCOST1', 'ALCDAY4', 'HTM4', '_BMI5CAT', '_METSTAT', 'FMONTH', '_ASTHMS1',
    '_INCOMG1', 'DIFFWALK', 'QSTLANG', 'WTKG3', '_PHYS14D', 'MARITAL', 'SDHSTRE1', 'RENTHOM1',
    '_CHLDCNT', 'ADDEPEV3', '_SMOKER3', 'SLEPTIM1', 'ALCCALCCAT', '_HLTHPLN', '_STATE'
]

# Separate features and target variable
X = data[columns_of_interest]  # Features
y = data['DIABETE4']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')  # You can choose other strategies like 'median' or 'most_frequent'
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Instantiate KNeighborsClassifier and fit the model
knn = KNeighborsClassifier(n_neighbors=5)  # You can change the value of 'n_neighbors'
knn.fit(X_train_imputed, y_train)

# Make predictions
y_pred = knn.predict(X_test_imputed)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Accuracy: {accuracy * 100:.2f}%")


KNN Accuracy: 84.87%


In [None]:
#KNN
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load your preprocessed dataset
data = pd.read_csv('Model1.csv')  # Replace 'Model1.csv' with your dataset filename

# Specify columns of interest
columns_of_interest = [
    'GENHLTH', '_AGE_G', '_SEX', 'PREGNANT', 'CVDINFR4', '_MICHD', 'CHECKUP1', '_RACEPR1',
    '_TOTINDA', 'ALCCALC', '_BMI5', 'ALCDAY30', 'EMPLOY1', '_EDUCAG', '_MENT14D', 'AVEDRNK3',
    'ID', 'MEDCOST1', 'ALCDAY4', 'HTM4', '_BMI5CAT', '_METSTAT', 'FMONTH', '_ASTHMS1',
    '_INCOMG1', 'DIFFWALK', 'QSTLANG', 'WTKG3', '_PHYS14D', 'MARITAL', 'SDHSTRE1', 'RENTHOM1',
    '_CHLDCNT', 'ADDEPEV3', '_SMOKER3', 'SLEPTIM1', 'ALCCALCCAT', '_HLTHPLN', '_STATE'
]

# Separate features and target variable
X = data[columns_of_interest]  # Features
y = data['DIABETE4']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Instantiate KNeighborsClassifier and fit the model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_imputed, y_train)

# Make predictions
y_pred = knn.predict(X_test_imputed)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Accuracy: {accuracy * 100:.2f}%")

# Feature importance estimation based on accuracy change
feature_importance = []
for col in columns_of_interest:
    cols_to_drop = [c for c in columns_of_interest if c != col]
    X_train_subset = X_train_imputed[:, [columns_of_interest.index(c) for c in cols_to_drop]]
    X_test_subset = X_test_imputed[:, [columns_of_interest.index(c) for c in cols_to_drop]]

    knn.fit(X_train_subset, y_train)
    y_pred_subset = knn.predict(X_test_subset)
    accuracy_subset = accuracy_score(y_test, y_pred_subset)

    feature_importance.append({'Feature': col, 'Accuracy': accuracy_subset})

# Save feature importance to a CSV file
feature_importance_df = pd.DataFrame(feature_importance)
feature_importance_df = feature_importance_df.sort_values(by='Accuracy', ascending=False)
feature_importance_df.to_csv("KNN_Feature_Importance.csv", index=False)
print("Feature importance saved to 'KNN_Feature_Importance.csv'")


KNN Accuracy: 84.87%
Feature importance saved to 'KNN_Feature_Importance.csv'
