In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
# Load the breast cancer dataset
cancer_data = load_breast_cancer()

# Convert the dataset into a DataFrame for easy handling
df = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names)

# The target column. In scikit-learn's encoding for this dataset,
# target_names is ['malignant', 'benign'], i.e. 0 = malignant, 1 = benign.
target = cancer_data.target

# View dataset summary. DataFrame.info() prints the summary itself and returns
# None, so it is called directly; wrapping it in print() would append a stray
# "None" line to the cell output.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [3]:
# Learn per-feature scaling statistics from the full feature table `df`
scaler = StandardScaler()
scaler.fit(df)

# Apply the learned scaling to the same rows
scaled_data = scaler.transform(df)

# Sanity check: scaling must not change the number of rows or columns
print(scaled_data.shape)


(569, 30)


In [4]:
# Project the standardized features onto their first two principal components
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled_data)

# Wrap the projected coordinates in a labelled DataFrame (one column per component)
pca_column_names = ['PCA1', 'PCA2']
pca_df = pd.DataFrame(data=pca_components, columns=pca_column_names)

# Report the fraction of total variance captured by each retained component
print("Explained variance ratio by each component:", pca.explained_variance_ratio_)

# Peek at the first five projected rows
print(pca_df.head(5))


Explained variance ratio by each component: [0.44272026 0.18971182]
       PCA1       PCA2
0  9.192837   1.948583
1  2.387802  -3.768172
2  5.733896  -1.075174
3  7.122953  10.275589
4  3.935302  -1.948072


In [5]:
# Split the raw features into training and testing sets (80% train, 20% test)
# BEFORE any preprocessing. `pca_df` was produced by a scaler and a PCA that
# were fit on every row of `df`, so splitting `pca_df` would let the held-out
# rows influence the features the model is evaluated on (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df, target, test_size=0.2, random_state=42
)

# Fit the preprocessing on the training rows only. Separate objects are used so
# the full-data `scaler` / `pca` from the exploratory cells are left untouched.
train_scaler = StandardScaler().fit(X_train_raw)
train_pca = PCA(n_components=2).fit(train_scaler.transform(X_train_raw))

# Apply the train-fitted transforms to both splits; keep the same two-column
# PCA1/PCA2 layout (and the original row index) that `pca_df` uses.
pca_columns = ['PCA1', 'PCA2']
X_train = pd.DataFrame(
    train_pca.transform(train_scaler.transform(X_train_raw)),
    columns=pca_columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    train_pca.transform(train_scaler.transform(X_test_raw)),
    columns=pca_columns,
    index=X_test_raw.index,
)

# Logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Make predictions
y_pred = log_reg.predict(X_test)

# Calculate accuracy of the model on the held-out rows
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")


Logistic Regression Accuracy: 99.12%
