# PCA Example

## Imports and Data Load

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


### Checking for NAN values

In [32]:
df.isna().any()

Age                  False
RestingBP            False
Cholesterol          False
FastingBS            False
MaxHR                False
Oldpeak              False
HeartDisease         False
Sex_M                False
ChestPainType_ATA    False
ChestPainType_NAP    False
ChestPainType_TA     False
RestingECG_Normal    False
RestingECG_ST        False
ExerciseAngina_Y     False
ST_Slope_Flat        False
ST_Slope_Up          False
dtype: bool

## Preparing Data for training

In [6]:
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,True,True,False,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,True,True,False,False,False,True,False,False,True
3,48,138,214,0,108,1.5,1,False,False,False,False,True,False,True,True,False
4,54,150,195,0,122,0.0,0,True,False,True,False,True,False,False,False,True


In [7]:
df.shape

(918, 16)

In [11]:
x = df.drop('HeartDisease', axis='columns')
y = df.HeartDisease

### Scaling

In [13]:
scaler = StandardScaler()
x = scaler.fit_transform(x)

### Train Test Split

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y , train_size=0.8)

## Selecting Models to Train

In [26]:
models =[SVC(), LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(), \
         GaussianNB(), LinearRegression()]

Note: Multinomial NB does not accept negative values, and Standard scaler has scaled our data
from -1 to 1. So if you want to use Multinomial NB, use a different scaler.

In [20]:
x

array([[-1.4331398 ,  0.41090889,  0.82507026, ..., -0.8235563 ,
        -1.00218103,  1.15067399],
       [-0.47848359,  1.49175234, -0.17196105, ..., -0.8235563 ,
         0.99782372, -0.86905588],
       [-1.75135854, -0.12951283,  0.7701878 , ..., -0.8235563 ,
        -1.00218103,  1.15067399],
       ...,
       [ 0.37009972, -0.12951283, -0.62016778, ...,  1.21424608,
         0.99782372, -0.86905588],
       [ 0.37009972, -0.12951283,  0.34027522, ..., -0.8235563 ,
         0.99782372, -0.86905588],
       [-1.64528563,  0.30282455, -0.21769643, ..., -0.8235563 ,
        -1.00218103,  1.15067399]], shape=(918, 15))

## Training without PCA

In [27]:
for model in models:
    model.fit(x_train, y_train)
    print(f'{model} : {model.score(x_test, y_test)}')

SVC() : 0.875
LogisticRegression() : 0.8586956521739131
DecisionTreeClassifier() : 0.8315217391304348
RandomForestClassifier() : 0.875
GaussianNB() : 0.875
LinearRegression() : 0.526382473065343


## Training Using PCA

In [29]:
pca = PCA(0.95)
x_pca = pca.fit_transform(x)

In [33]:
x_pca.shape

(918, 13)

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x_pca, y , train_size=0.8)

In [31]:
for model in models:
    model.fit(x_train, y_train)
    print(f'{model} : {model.score(x_test, y_test)}')

SVC() : 0.8532608695652174
LogisticRegression() : 0.8206521739130435
DecisionTreeClassifier() : 0.7934782608695652
RandomForestClassifier() : 0.875
GaussianNB() : 0.8369565217391305
LinearRegression() : 0.5069680564646528


Accuracy is dropped a bit, but time to train was much less