In [50]:
# Import the required libraries: pandas, matplotlib, and scikit-learn's StandardScaler and PCA
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler as ss
from sklearn.decomposition import PCA

# Load the dataset from a CSV path relative to the current working directory.
data = pd.read_csv(filepath_or_buffer='breast-cancer-data.csv')

# Report (rows, columns), then preview the first ten records.
print(data.shape)
data.head(n=10)

(569, 32)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
7,84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
8,844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
9,84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


In [51]:
# Remove the 'id' column, which is not used by the training/test model.
# `drop` returns a new frame, so `data` itself is left untouched.
df = data.drop(columns=['id'])
df.shape

(569, 31)

In [52]:
# Convert the categorical target to numeric codes: 'M' -> 1, 'B' -> 0.
#
# The result is assigned back to the column rather than calling an inplace
# method on the `df['diagnosis']` selection: with pandas Copy-on-Write that
# selection is a temporary object, so an inplace `replace` on it never
# reaches `df` (pandas 2.2 already warns about this pattern).
#
# The mapping reads from the untouched `data` frame, so re-running this cell
# always produces the same column. Any label other than 'M' or 'B' becomes NaN.
df['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

In [53]:
# Split the frame into target and predictors.
# Target: the 'diagnosis' column.
# Predictors (features): the remaining 30 columns, from 'radius_mean' through
# 'fractal_dimension_worst' (label slices with .loc include both endpoints).
y = df['diagnosis']
X = df.loc[:, 'radius_mean':'fractal_dimension_worst']

In [54]:
# Standardize every numerical feature in X.
# The scaler is fitted on X first and then applied to it; X is rebound to the
# scaled output, which the PCA step consumes.
scale = ss()
scale.fit(X)
X = scale.transform(X)

In [55]:
# Fit PCA on the scaled feature matrix, keeping every component
# (n_components=None is the default), and project the samples onto them.
pca = PCA(n_components=None)
out = pca.fit_transform(X)
out.shape

(569, 30)

In [56]:
# Cumulative explained-variance ratio across the principal components:
# entry k is the fraction of total variance captured by the first k+1 components.
# Used to decide how many PCs (principal components) to keep for a desired variance.
pca.explained_variance_ratio_.cumsum()

array([0.44272026, 0.63243208, 0.72636371, 0.79238506, 0.84734274,
       0.88758796, 0.9100953 , 0.92598254, 0.93987903, 0.95156881,
       0.961366  , 0.97007138, 0.97811663, 0.98335029, 0.98648812,
       0.98915022, 0.99113018, 0.99288414, 0.9945334 , 0.99557204,
       0.99657114, 0.99748579, 0.99829715, 0.99889898, 0.99941502,
       0.99968761, 0.99991763, 0.99997061, 0.99999557, 1.        ])

In [57]:
# Keep only the leading ten principal components of `out`; per the cumulative
# explained-variance ratios above, they account for about 95% of the variance.
final_data = out[:, 0:10]
final_data.shape

(569, 10)

In [58]:
# Build a DataFrame holding the retained principal components as predictors,
# plus the target column.
#
# Column names (pc1, pc2, ...) are generated from the width of `final_data`
# instead of being listed by hand, so this cell stays valid if the number of
# retained components changes.
pc_columns = ['pc{}'.format(i) for i in range(1, final_data.shape[1] + 1)]

# Reuse `data`'s index so the target assignment below aligns row for row.
pcdf = pd.DataFrame(data=final_data, columns=pc_columns, index=data.index)

# Numeric target taken from the original diagnosis labels: 'M' -> 1, 'B' -> 0.
pcdf['target'] = data['diagnosis'].map({'M': 1, 'B': 0})

pcdf.head(10)

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,target
0,9.192837,1.948583,-1.123166,3.633731,-1.19511,1.411424,2.15937,-0.398407,-0.157118,-0.877402,1
1,2.387802,-3.768172,-0.529293,1.118264,0.621775,0.028656,0.013358,0.240988,-0.711905,1.106995,1
2,5.733896,-1.075174,-0.551748,0.912083,-0.177086,0.541452,-0.668166,0.097374,0.024066,0.454275,1
3,7.122953,10.275589,-3.23279,0.152547,-2.960878,3.053422,1.429911,1.059565,-1.40544,-1.116975,1
4,3.935302,-1.948072,1.389767,2.940639,0.546747,-1.226495,-0.936213,0.636376,-0.263805,0.377704,1
5,2.380247,3.949929,-2.934877,0.941037,-1.056042,-0.451039,0.490445,-0.165444,-0.133473,-0.530431,1
6,2.238883,-2.690031,-1.639913,0.14934,0.04036,-0.128948,-0.301567,0.083698,-0.080025,0.219143,1
7,2.143299,2.340244,-0.871947,-0.127043,-1.427437,-1.257039,0.9741,-0.653338,0.248184,1.000586,1
8,3.174924,3.391813,-3.119986,-0.601297,-1.52229,0.559545,-0.215104,-0.687341,0.511924,0.029187,1
9,6.351747,7.727174,-4.341916,-3.375202,1.710263,-0.723909,2.51984,0.365149,-0.717397,-1.165631,1
