Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Loading the Dataset

In [None]:
# Read the 'diabetes' worksheet of the Excel workbook into a DataFrame,
# then preview its first five rows.
df = pd.read_excel(io="/diabetes.xlsx", sheet_name="diabetes")
print(df.head(n=5))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


Exploring the Data

In [None]:
# Structural overview: column dtypes, non-null counts and memory usage.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
# Summary statistics for the numeric columns.
print(df.describe())
# Number of missing (NaN) entries per column.
print(df.isnull().sum())  # Check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std    

Replace Invalid Zeros with NaN

In [None]:
# A value of 0 is biologically invalid for these measurements, so zeros in
# these columns are treated as missing data and converted to NaN.
cols_with_zero_invalid = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
measurements = df[cols_with_zero_invalid]
df[cols_with_zero_invalid] = measurements.mask(measurements == 0, np.nan)
# Show how many entries per column are now flagged as missing.
print(df.isnull().sum())

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


Filling Missing Values

In [None]:
# Impute every NaN with the median of its own column.
# NOTE(review): the medians are computed over the whole frame, before the
# train/test split performed later in the notebook, so held-out rows
# contribute to the imputed values.
column_medians = df.median()
df = df.fillna(value=column_medians)

In [None]:
# Confirm that no column has missing values left after imputation.
missing_per_column = df.isna().sum()
print(missing_per_column)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


Train-Test Split and Scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 'Outcome' is the target label; every other column is a predictor.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions. X_train and X_test are rebound to
# the scaled results.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Training the model

In [None]:
from sklearn.linear_model import LogisticRegression

# Create a logistic-regression classifier with default settings and fit it
# on the scaled training features and their labels.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

Evaluating the model

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

# Score the predictions against the true labels: overall accuracy,
# confusion matrix, and the per-class precision/recall/F1 report.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(test_confusion)
print(test_report)

Accuracy: 0.7532467532467533
[[82 17]
 [21 34]]
              precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154



Prediction System

In [None]:
def _read_measurement(label):
    """Prompt for `label` until the user enters a finite, non-negative number.

    Args:
        label: Text shown before the colon in the input prompt.

    Returns:
        The entered value as a float.
    """
    while True:
        try:
            value = float(input(f"{label}: "))
        except ValueError:
            print(" Please enter a valid number.")
            continue
        # float() also parses "nan", "inf" and negative numbers. Non-finite
        # values would only fail later, inside the scaler or the model, and
        # none of these patient measurements can be negative.
        if np.isfinite(value) and value >= 0:
            return value
        print(" Please enter a finite, non-negative number.")


print("\n Enter Patient Details to Predict Diabetes:")

# Prompt labels, listed in the same order as the feature columns in X.
features = [
    "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
    "Insulin", "BMI", "Diabetes Pedigree Function", "Age"
]

# Collect one validated value per feature.
input_data = [_read_measurement(feature) for feature in features]

# Convert to DataFrame with column names to avoid warnings
input_df = pd.DataFrame([input_data], columns=X.columns)
# Apply the scaling that was fitted on the training data.
input_scaled = scaler.transform(input_df)

# Make prediction
prediction = model.predict(input_scaled)

# Display result
if prediction[0] == 1:
    print("\n  The person is diabetic.")
else:
    print("\n The person is not diabetic.")


 Enter Patient Details to Predict Diabetes:
Pregnancies: 1
Glucose: 82
BloodPressure: 64
SkinThickness: 13
Insulin: 95
BMI: 21.2
Diabetes Pedigree Function: 0.415
Age: 23

 The person is not diabetic.
