# Kernel PCA (Principal Component Analysis)

In [14]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.decomposition import KernelPCA

## Loading Dataset

In [15]:
# Read drink.csv (comma-separated) into a DataFrame and preview the first rows.
dataset = pd.read_csv(
    filepath_or_buffer="../../../Datasets/drink.csv",
    sep=",",
)
dataset.head()

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1


In [16]:
# Row and column counts of the loaded frame.
# Note: the column count is taken before the target column is split off,
# so features_count still includes the label column.
samples_count = dataset.shape[0]
features_count = dataset.shape[1]
(samples_count, features_count)

(178, 14)

## Separating target column from dataset

In [17]:
# The last column holds the class label; every column before it is a feature.
# Note: `dataset` is rebound from a DataFrame to a NumPy array here, so this
# cell cannot be re-run without reloading the CSV first.
label_position = dataset.shape[1] - 1
target = dataset.iloc[:, label_position].values
dataset = dataset.iloc[:, :label_position].values

## Train & Test split

In [18]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
train_dataset, test_dataset, train_target, test_target = train_test_split(
    dataset,
    target,
    test_size=0.2,
    random_state=0,
)

## Feature Scaling

In [19]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so no test information leaks into the fit.
sc = StandardScaler()
sc.fit(train_dataset)
train_dataset = sc.transform(train_dataset)
test_dataset = sc.transform(test_dataset)

## Applying Kernel PCA

In [27]:
# Project the features onto 2 kernel principal components using an RBF kernel.
# The projection is fitted on the training split and then reused, unchanged,
# for the test split.
# NOTE: this cell overwrites train_dataset / test_dataset in place, so running
# it a second time without re-running the scaling cell would apply Kernel PCA
# to data that has already been projected.
kernel_pca = KernelPCA(n_components=2, kernel='rbf')  # n_components = number of features kept after projection
train_dataset = kernel_pca.fit_transform(train_dataset)
test_dataset = kernel_pca.transform(test_dataset)
train_dataset[0]  # show the first projected training sample

array([-0.41449225, -0.3945542 ])

## Logistic Regression Training

In [28]:
# Train a logistic-regression classifier on the training split
# (fit() returns the estimator itself, so it can be chained).
lr = LogisticRegression(random_state=0).fit(train_dataset, train_target)
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Check Accuracy
- For comparison, the accuracy without PCA is 0.93.

In [29]:
# Predict the class of every held-out sample and report the share of
# predictions that match the true label.
predictions = lr.predict(test_dataset)
# Use the already-imported sklearn helper instead of hand-rolling the ratio;
# for 1-D label arrays it returns the same fraction of exact matches.
accuracy = accuracy_score(test_target, predictions)
print('%.2f' % accuracy)

0.97


## Confusion Matrix
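- It summarizes the correct and incorrect predictions for each class: rows are the actual classes, columns are the predicted classes, and the diagonal holds the correct predictions.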

In [30]:
# Cross-tabulate true labels against predicted labels on the test split.
cm = confusion_matrix(y_true=test_target, y_pred=predictions)
cm

array([[14,  0,  0],
       [ 1, 15,  0],
       [ 0,  0,  6]])