In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import accuracy_score
from scipy.stats import skew
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [2]:
# Loading the data
data = pd.read_csv('diabetes.csv')

In [3]:
# Replace unwanted values with NaN (null)
data.loc[data['Glucose'] < 20, 'Glucose'] = np.nan
data.loc[data['SkinThickness'] == 0 , 'SkinThickness'] = np.nan
data.loc[data['BMI'] < 9, 'BMI'] = np.nan
data.loc[data['BloodPressure'] == 0 , 'BloodPressure'] = np.nan

# Impute missing values with the median
cols_to_impute = ['Glucose', 'SkinThickness', 'BMI', 'BloodPressure']
for col in cols_to_impute:
    median_val = data[col].median()
    data[col].fillna(median_val, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(median_val, inplace=True)


In [4]:
# Drop unnecessary features
data = data.drop(columns=['Insulin', 'SkinThickness'])

In [5]:
X = data.drop(columns=["Outcome"])
y = data["Outcome"]

In [6]:
# Random Forest Classifier (raw data)
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.3, random_state=849)

# Building the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=849)
rf_model.fit(X_train_rf, y_train_rf)

# Making predictions
y_pred_rf = rf_model.predict(X_test_rf)
    
# Calculating accuracy of the model
accuracy_rf = accuracy_score(y_test_rf, y_pred_rf)

In [7]:
# Logistic Regression (raw data)
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X, y, test_size=0.3, random_state=142)

# Building the Logistic Regression model
lr_model = LogisticRegression(max_iter=200, random_state=142)
lr_model.fit(X_train_lr, y_train_lr)

# Making predictions
y_pred_lr = lr_model.predict(X_test_lr)

# Calculating accuracy of the model
accuracy_lr = accuracy_score(y_test_lr, y_pred_lr)

In [8]:
# Handeling Skeweness
data_transformed = data.copy()
numeric_cols = data_transformed.select_dtypes(include=[np.number]).columns
numeric_cols = numeric_cols[numeric_cols != 'Outcome']

# Calculate initial skewness
initial_skew = data_transformed[numeric_cols].apply(skew)
skewed_initial = initial_skew[abs(initial_skew) > 0.5]

# Apply Yeo-Johnson transformation to highly skewed features
pt = PowerTransformer(method='yeo-johnson')
data_transformed[skewed_initial.index] = pt.fit_transform(data_transformed[skewed_initial.index])

# Recalculate skewness after Yeo-Johnson
skew_after_pt = data_transformed[skewed_initial.index].apply(skew)
still_skewed = skew_after_pt[abs(skew_after_pt) > 0.5]

# Ensure 'Outcome' column is not affected
data_transformed['Outcome'] = data['Outcome']

In [9]:
X = data_transformed.drop(columns=['Outcome'])
y = data_transformed['Outcome']

In [10]:
# Multi-Layer Perceptron (de-skewed)
X_train_mlpd, X_test_mlpd, y_train_mlpd, y_test_mlpd = train_test_split(X, y, test_size=0.3, random_state=585)

# Building MLP model
mlpd_model = MLPClassifier(random_state=585, max_iter=500, solver='adam', hidden_layer_sizes=(100,))
mlpd_model.fit(X_train_mlpd, y_train_mlpd)

# Making predictions
y_pred_mlpd = mlpd_model.predict(X_test_mlpd)

# Calculating accuracy of the model
accuracy_mlpd = accuracy_score(y_test_mlpd, y_pred_mlpd)

In [11]:
# Support Vector Machine with Radial Basis Function Kernel (scaled)
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(X, y, test_size=0.3, random_state=645)

# Standard Scaler
scaler_svm = StandardScaler()
X_train_svm_std = scaler_svm.fit_transform(X_train_svm)
X_test_svm_std = scaler_svm.transform(X_test_svm)

# Building SVM model
svm_scaled_model = SVC(kernel='rbf', probability=True, random_state=645)
svm_scaled_model.fit(X_train_svm_std, y_train_svm)

# Making predictions
y_pred_svm = svm_scaled_model.predict(X_test_svm_std)

# Calculating accuracy of the model                                   
accuracy_svm = accuracy_score(y_test_svm, y_pred_svm)

In [12]:
# Multi-Layer Perceptron (scaled)
X_train_mlp, X_test_mlp, y_train_mlp, y_test_mlp = train_test_split(X, y, test_size=0.3, random_state=742)

# Standard Scaler
scaler_mlp = StandardScaler()
X_train_mlp_std = scaler_mlp.fit_transform(X_train_mlp)
X_test_mlp_std = scaler_mlp.transform(X_test_mlp)

# Building MLP model
mlp_scaled_model = MLPClassifier(random_state=742, max_iter=3000, solver='adam', hidden_layer_sizes=(100,))
mlp_scaled_model.fit(X_train_mlp_std, y_train_mlp)

# Making predictions
y_pred_mlp = mlp_scaled_model.predict(X_test_mlp_std)

# Calculating accuracy of the model                                   
accuracy_mlp = accuracy_score(y_test_mlp, y_pred_mlp)

In [13]:
preg = float(input("🔹 Number of Pregnancies: "))
glu = float(input("🔹 Glucose Level: "))
bp = float(input("🔹 Blood Pressure: "))
bmi = float(input("🔹 Body Mass Index (BMI): "))
ped = float(input("🔹 Family History of Diabetes: "))
age = float(input("🔹 Age: "))

🔹 Number of Pregnancies:  0
🔹 Glucose Level:  120
🔹 Blood Pressure:  128
🔹 Body Mass Index (BMI):  34
🔹 Family History of Diabetes:  0
🔹 Age:  23


In [14]:
# Creating Dataframe
user_data = pd.DataFrame([{
    'Pregnancies': preg,
    'Glucose': glu,
    'BloodPressure': bp,
    'BMI': bmi,
    'DiabetesPedigreeFunction': ped,
    'Age': age
}])

In [15]:
# Random Forest (raw)
rf_prob = rf_model.predict_proba(user_data)[0][1]
print(f"🌲 Random Forest (raw): {rf_prob*100:.2f}%")

# Logistic Regression (raw)
logreg_prob = lr_model.predict_proba(user_data)[0][1]
print(f"📈 Logistic Regression (raw): {logreg_prob*100:.2f}%")

🌲 Random Forest (raw): 51.00%
📈 Logistic Regression (raw): 17.24%


In [16]:
# Same Skeweness Handeling
user_transformed = user_data.copy()

# Columns to Transform
yeo_columns = ['Pregnancies', 'Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Applying Yeo-Johnson
pt = PowerTransformer(method='yeo-johnson')
user_transformed[yeo_columns] = pt.fit_transform(user_transformed[yeo_columns])

In [17]:
# MLP (un-skewed)
mlp_deskewed_prob = mlpd_model.predict_proba(user_transformed)[0][1]
print(f"🧠 MLP (de-skewed): {mlp_deskewed_prob*100:.2f}%")

🧠 MLP (de-skewed): 20.69%


In [18]:
# Scaling
user_scaled = user_transformed.copy()

user_scaled = scaler_mlp.transform(user_scaled) # برای MLP (scaled)
user_svm = scaler_svm.transform(user_transformed) # برای SVM

In [19]:
# MLP (scaled)
mlp_scaled_prob = mlp_scaled_model.predict_proba(user_scaled)[0][1]
print(f"⚙️ MLP (scaled): {mlp_scaled_prob*100:.2f}%")

# SVM (scaled)
svm_prob = svm_scaled_model.predict_proba(user_svm)[0][1]
print(f"🔻 SVM (scaled): {svm_prob*100:.2f}%")

⚙️ MLP (scaled): 0.00%
🔻 SVM (scaled): 38.09%


In [20]:
probs = [rf_prob, logreg_prob, mlp_deskewed_prob, mlp_scaled_prob, svm_prob]

In [21]:
avg_prob = sum(probs) / len(probs)
print(f"\n📌 Final average probability of diabetes risk: {avg_prob*100:.2f}%")


📌 Final average probability of diabetes risk: 25.40%


In [22]:
print(f"🌲 Random Forest (raw): {rf_prob*100:.2f}%")
print(f"📈 Logistic Regression (raw): {logreg_prob*100:.2f}%")
print(f"🧠 MLP (deskewed): {mlp_deskewed_prob*100:.2f}%")
print(f"⚙️ MLP (scaled): {mlp_scaled_prob*100:.2f}%")
print(f"🔻 SVM (scaled): {svm_prob*100:.2f}%")
print(f"\n📌 Final average probability of diabetes risk: {avg_prob*100:.2f}%")

🌲 Random Forest (raw): 51.00%
📈 Logistic Regression (raw): 17.24%
🧠 MLP (deskewed): 20.69%
⚙️ MLP (scaled): 0.00%
🔻 SVM (scaled): 38.09%

📌 Final average probability of diabetes risk: 25.40%
