In [1]:
# Step 1: Import Libraries
# Import necessary libraries for data processing, model building, and evaluation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 2: Load Dataset
# This dataset is available in sklearn's datasets, making it easy to import
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

# Create a DataFrame for better visualization
X = data.data
y = data.target
df = pd.DataFrame(X, columns=data.feature_names)
df['diagnosis'] = y
df.head()

# Display the first few rows of the dataset to understand its structure
print("Dataset Shape:", df.shape)
print("Target Labels (0 = Malignant, 1 = Benign):", np.unique(y))

# Step 3: Data Preprocessing
# Split the data into features (X) and target labels (y).
# The target variable 'diagnosis' has values 0 (Malignant) and 1 (Benign).
# We normalize the features to ensure efficient model training.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Build and Train the Logistic Regression Model
# Logistic regression is a common method for binary classification tasks
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Step 5: Evaluate the Model
# After training, we use the model to predict outcomes on the test set and evaluate its performance
y_pred = model.predict(X_test)

# Calculate accuracy and display metrics
accuracy = accuracy_score(y_test, y_pred)
print("\nTest Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Step 6: Make Sample Prediction
# We'll select a single sample from the test set, make a prediction, and compare it to the actual label
sample_index = 0  # Choosing a sample index from the test set
sample = X_test[sample_index].reshape(1, -1)  # Reshape for a single prediction
prediction_prob = model.predict_proba(sample)[:, 1]  # Probability for class 1 (Benign)
prediction = model.predict(sample)
print("\nSample Prediction Probability (Benign):", prediction_prob[0])
print("Sample Prediction (Diagnosis):", "Benign" if prediction[0] == 1 else "Malignant")
print("Actual Diagnosis:", "Benign" if y_test[sample_index] == 1 else "Malignant")


Dataset Shape: (569, 31)
Target Labels (0 = Malignant, 1 = Benign): [0 1]

Test Accuracy: 0.9736842105263158

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


Confusion Matrix:
 [[41  2]
 [ 1 70]]

Sample Prediction Probability (Benign): 0.8853456939975518
Sample Prediction (Diagnosis): Benign
Actual Diagnosis: Benign
