In [None]:
# Step 1: Import Libraries
# We import the necessary libraries for data processing and building the logistic regression model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 2: Load Dataset
# We're using the Pima Indians Diabetes dataset, a well-known dataset for predicting diabetes diagnoses.
# Each row represents a patient's medical information, and the target column 'Outcome' indicates
# whether the patient was diagnosed with diabetes (1) or not (0).
# The CSV is read with header=None, so the column names below are supplied explicitly.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
data = pd.read_csv(url, header=None, names=columns)

# Display the first few rows of the dataset to understand its structure.
# A bare `data.head()` expression is only rendered by a notebook when it is the
# last statement of the cell; more code follows here, so print it explicitly.
print(data.head())

# Step 3: Data Preprocessing
# Split the data into features (X) and target labels (y).
# - 'Outcome' is the target variable (1 = Diabetic, 0 = Non-Diabetic).
# NOTE: X is kept as the raw (unscaled) feature matrix; only the train/test
# splits below are standardized.
X = data.drop('Outcome', axis=1).values
y = data['Outcome'].values

# Split the data into training and testing sets BEFORE scaling, so the held-out
# rows cannot influence the mean/variance the scaler learns (no data leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features
# Standardizing the features helps improve model performance by giving each feature the same scale.
# The scaler is fitted on the training split only; the test split is transformed
# with those training statistics, exactly as unseen data would be at inference time.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 4: Build and Train the Logistic Regression Model
# Binary classifier for the diabetic / non-diabetic target.
# The 'liblinear' solver is chosen here as a fit for a small, two-class dataset.
# `fit` returns the estimator itself, so construction and training are chained.
model = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Step 5: Evaluate the Model
# Predict on the held-out test split, compute every metric up front, then report them.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)          # overall accuracy score
report = classification_report(y_test, y_pred)     # per-class precision / recall / f1 text table
conf_matrix = confusion_matrix(y_test, y_pred)     # counts of correct vs. incorrect predictions per class

# Report the results in the same order as they were computed
print("Test Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", conf_matrix)

# Step 6: Make Sample Prediction
# Pick one row of the test set, predict its class, and show it next to the true label.
def _diagnosis_label(outcome):
    """Return the human-readable diagnosis for an outcome value (1 means diabetic)."""
    if outcome == 1:
        return "Diabetic"
    return "Non-Diabetic"


sample_index = 0  # Position of the test-set row to inspect
sample = X_test[[sample_index]]  # List index keeps the 2-D (1, n_features) shape predict() expects
prediction = model.predict(sample)
print("\nSample Prediction (Diagnosis):", _diagnosis_label(prediction[0]))

# Ground-truth label of the same row, for comparison
print("Actual Diagnosis:", _diagnosis_label(y_test[sample_index]))


Test Accuracy: 0.7532467532467533

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154


Confusion Matrix:
 [[79 20]
 [18 37]]

Sample Prediction (Diagnosis): Non-Diabetic
Actual Diagnosis: Non-Diabetic
