# 1. Predicting Diabetes Onset Using Logistic Regression

Importing the libraries

In [12]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

Loading the dataset

In [2]:
# Read the diabetes CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='./datasets/diabetes.csv')

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Data Preprocessing

Replace zeros with NaN in specified columns

In [7]:
# Columns in which a stored 0 is treated as a missing reading.
columns_with_zeros = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]
# Swap every 0 in those columns for NaN so it can be imputed afterwards.
data[columns_with_zeros] = data[columns_with_zeros].replace(to_replace=0, value=np.nan)

Impute missing values with the median

In [8]:
# Fill the NaNs in the flagged columns with each column's median.
# NOTE(review): the imputer is fitted on every row before the train/test
# split further down, so held-out rows contribute to the medians; consider
# fitting on the training split only.
imputer = SimpleImputer(strategy='median')
imputer.fit(data[columns_with_zeros])
data[columns_with_zeros] = imputer.transform(data[columns_with_zeros])

Standardize features

In [9]:
# Standardize every feature column (everything except the 'Outcome' target).
# The column labels and index are taken from the frame that is actually
# scaled, rather than from `data.columns[:-1]`, so the result stays correctly
# labelled even if 'Outcome' is not the last column, and the index-aligned
# assignment of the target in the next cell keeps matching rows.
# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, so held-out rows contribute to the mean/std; consider fitting
# on the training split only.
feature_frame = data.drop(columns='Outcome')
scaler = StandardScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(feature_frame),
    columns=feature_frame.columns,
    index=feature_frame.index,
)

Add the target variable back to the scaled data

In [10]:
# Re-attach the (unscaled) target column; values are aligned on the index.
data_scaled = data_scaled.assign(Outcome=data['Outcome'])

Preprocessed Data

In [11]:
# Preview the first five rows of the standardized frame.
data_scaled.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.639947,0.866045,-0.03199,0.670643,-0.181541,0.166619,0.468492,1.425995,1
1,-0.844885,-1.205066,-0.528319,-0.012301,-0.181541,-0.8522,-0.365061,-0.190672,0
2,1.23388,2.016662,-0.693761,-0.012301,-0.181541,-1.3325,0.604397,-0.105584,1
3,-0.844885,-1.073567,-0.528319,-0.695245,-0.540642,-0.633881,-0.920763,-1.041549,0
4,-1.141852,0.504422,-2.679076,0.670643,0.316566,1.549303,5.484909,-0.020496,1


# Implement Logistic Regression

Split the dataset into training and testing sets

In [13]:
# Separate the target from the feature matrix.
y = data_scaled['Outcome']
X = data_scaled.drop(columns=['Outcome'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Train the logistic regression model

In [14]:
# Logistic regression with scikit-learn's default settings, fitted on the training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

# Evaluate the model

Predict on the test set

In [15]:
# Predicted class labels for the held-out rows.
y_pred = model.predict(X=X_test)

Calculate evaluation metrics

In [17]:
# Score the test-set predictions against the true labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)

# Report each metric rounded to two decimal places.
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')

Accuracy: 0.75
Precision: 0.67
Recall: 0.62
