In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import zscore
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files

uploaded = files.upload()
file_name = list(uploaded.keys())[0]
data = pd.read_csv(file_name)

print("\nDataset Overview:")
print(pd.DataFrame({'Columns': data.columns, 'Missing Values': data.isnull().sum(), 'Data Type': data.dtypes}))

data['HbA1c'] = data['HbA1c'].replace(['???', 'Unknown'], np.nan).astype(float)

data['HbA1c'] = data.groupby('Diabetes_Status')['HbA1c'].transform(lambda x: x.fillna(x.median()))

if 'Diet_Quality' in data.columns and 'Physical_Activity_Level' in data.columns:
    ordinal_encoder = OrdinalEncoder()
    data[['Diet_Quality', 'Physical_Activity_Level']] = ordinal_encoder.fit_transform(data[['Diet_Quality', 'Physical_Activity_Level']])

if 'Smoking_Status' in data.columns and 'Family_History' in data.columns:
    one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
    one_hot_encoded = one_hot_encoder.fit_transform(data[['Smoking_Status', 'Family_History']])
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=one_hot_encoder.get_feature_names_out(['Smoking_Status', 'Family_History']))
    data = pd.concat([data.drop(columns=['Smoking_Status', 'Family_History']), one_hot_df], axis=1)

scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()
if 'Glucose_Level' in data.columns and 'Insulin_Level' in data.columns:
    data[['Glucose_Level', 'Insulin_Level']] = scaler_standard.fit_transform(data[['Glucose_Level', 'Insulin_Level']])
if 'BMI' in data.columns and 'Age' in data.columns:
    data[['BMI', 'Age']] = scaler_minmax.fit_transform(data[['BMI', 'Age']])

def remove_outliers(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

for col in ['Glucose_Level', 'Insulin_Level', 'BMI', 'Age']:
    if col in data.columns:
        data = remove_outliers(data, col)

X = data.drop(columns=['Diabetes_Status'])
y = data['Diabetes_Status']

X = X.apply(pd.to_numeric, errors='coerce')

model = RandomForestClassifier()
model.fit(X, y)
importances = pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(importances.to_string(index=False))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

duplicates = X_train_resampled.duplicated().sum()
print(f"\nDuplicate Records in Training Set: {duplicates}")

print("\nPreprocessing Complete. Ready for Modeling.")


Saving diabetic_prediction_dataset_v2.csv to diabetic_prediction_dataset_v2 (6).csv

Dataset Overview:
                                         Columns  Missing Values Data Type
Diabetes_Status                  Diabetes_Status               0     int64
Diet_Quality                        Diet_Quality               0    object
Physical_Activity_Level  Physical_Activity_Level               0    object
Smoking_Status                    Smoking_Status               0    object
Family_History                    Family_History               0    object
Glucose_Level                      Glucose_Level               0   float64
Insulin_Level                      Insulin_Level               0   float64
BMI                                          BMI               0   float64
Age                                          Age               0     int64
HbA1c                                      HbA1c               0    object

Feature Importances:
                     Feature  Importance
         