In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.decomposition import PCA

In [2]:
# Load dataset
data = load_breast_cancer()

# Convert to DataFrame
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


# BASIC FEATURE ENGINEERING

In [3]:
# Example: creating new feature (mean of selected columns)
df["mean_radius_mean_texture"] = df["mean radius"] * df["mean texture"]

# Example: log-transform a skewed column
df["log_mean_area"] = np.log(df["mean area"] + 1)

# Example: Binning
#df["radius_bin"] = pd.qcut(df["mean radius"], q=3, labels=["small", "medium", "large"])

df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target,mean_radius_mean_texture,log_mean_area
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0,186.7362,6.909753
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0,365.5289,7.190676
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0,418.4125,7.093405
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0,232.7396,5.958683
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0,290.9586,7.16858


In [4]:
X = df.drop("target", axis=1)
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# FEATURE SELECTION

In [8]:
# Compute correlation with target
corr_matrix = df.corr(numeric_only=True)["target"].abs()

# Select features with correlation above threshold
corr_threshold = 0.5
selected_corr_features = corr_matrix[corr_matrix > corr_threshold].index.tolist()
selected_corr_features.remove("target")

print(selected_corr_features)
print(f"\n{len(selected_corr_features)} features selected")

['mean radius', 'mean perimeter', 'mean area', 'mean compactness', 'mean concavity', 'mean concave points', 'radius error', 'perimeter error', 'area error', 'worst radius', 'worst perimeter', 'worst area', 'worst compactness', 'worst concavity', 'worst concave points']

15 features selected


In [6]:
df.corr(numeric_only=True)["target"]

mean radius               -0.730029
mean texture              -0.415185
mean perimeter            -0.742636
mean area                 -0.708984
mean smoothness           -0.358560
mean compactness          -0.596534
mean concavity            -0.696360
mean concave points       -0.776614
mean symmetry             -0.330499
mean fractal dimension     0.012838
radius error              -0.567134
texture error              0.008303
perimeter error           -0.556141
area error                -0.548236
smoothness error           0.067016
compactness error         -0.292999
concavity error           -0.253730
concave points error      -0.408042
symmetry error             0.006522
fractal dimension error   -0.077972
worst radius              -0.776454
worst texture             -0.456903
worst perimeter           -0.782914
worst area                -0.733825
worst smoothness          -0.421465
worst compactness         -0.590998
worst concavity           -0.659610
worst concave points      -0

In [6]:
X.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [11]:
selector_chi = SelectKBest(score_func=chi2, k=20)
selector_chi.fit(X_scaled, y)

selected_chi_features = X.columns[selector_chi.get_support()].tolist()

print("Selected features based on Chi-Square:")
print(selected_chi_features)

Selected features based on Chi-Square:
['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean compactness', 'mean concavity', 'mean concave points', 'radius error', 'perimeter error', 'area error', 'concave points error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry']


In [12]:
selector_chi.get_support()

array([ True,  True,  True,  True, False,  True,  True,  True, False,
       False,  True, False,  True,  True, False, False, False,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False])

# PCA – Dimensionality Reduction

In [13]:
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

print("PCA output shape:", X_pca.shape)

PCA output shape: (569, 5)


In [19]:
df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
df_pca

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,1.387021,0.426895,-0.541703,0.048483,0.072198
1,0.462308,-0.556947,-0.205175,-0.042830,-0.016111
2,0.954621,-0.109701,-0.147848,-0.001068,0.033798
3,1.000816,1.525089,-0.053271,-0.207916,0.219381
4,0.626828,-0.302471,-0.409336,0.238811,0.002192
...,...,...,...,...,...
564,1.002840,-0.474785,-0.100041,0.272995,0.083888
565,0.620757,-0.517200,0.400360,0.182443,0.092984
566,0.226311,-0.287946,0.315224,-0.011747,-0.218517
567,1.677834,0.335946,0.296116,-0.156305,-0.070204


In [16]:
print("Explained variance ratio:", pca.explained_variance_ratio_)     #Shows % variance captured by each component
print("Total variance captured:", np.sum(pca.explained_variance_ratio_))

Explained variance ratio: [0.53097689 0.1728349  0.07114442 0.06411259 0.04086072]
Total variance captured: 0.8799295188277029


In [17]:
print("\nOriginal feature count:", X.shape[1])
print("Correlation selected:", len(selected_corr_features))
print("Chi-Square selected:", len(selected_chi_features))
print("PCA components:", X_pca.shape[1])


Original feature count: 30
Correlation selected: 25
Chi-Square selected: 20
PCA components: 5
