# Principal Component Analysis

In [41]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [42]:
dataset = pd.read_csv('Pizza.csv')
dataset.head()

Unnamed: 0,brand,id,mois,prot,fat,ash,sodium,carb,cal
0,A,14069,27.82,21.43,44.87,5.11,1.77,0.77,4.93
1,A,14053,28.49,21.26,43.89,5.34,1.79,1.02,4.84
2,A,14025,28.35,19.99,45.78,5.08,1.63,0.8,4.95
3,A,14016,30.55,20.15,43.13,4.79,1.61,1.38,4.74
4,A,14005,30.49,21.28,41.65,4.82,1.64,1.76,4.67


In [43]:
dataset.isnull().values.any()

False

In [44]:
X = dataset.iloc[:, 1:8].values
y = dataset.iloc[:, 0].values

In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [46]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Performance of Logistic Regression before PCA

In [47]:
from sklearn.linear_model import LogisticRegression 
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [48]:
y_pred = classifier.predict(X_test)

In [49]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print('Confusion Matrix: \n',confusion_matrix(y_test,y_pred),'\n')
print('Classification Report: \n',classification_report(y_test,y_pred),'\n')
print('Accuracy Score:\n',accuracy_score(y_test,y_pred),'\n')

Confusion Matrix: 
 [[7 0 0 0 0 0 0 0 0 0]
 [0 5 0 0 0 0 0 0 0 0]
 [0 0 5 0 0 0 0 0 0 0]
 [0 0 0 7 0 0 0 0 0 0]
 [0 0 0 0 2 0 0 1 0 2]
 [0 0 0 0 0 1 5 0 0 0]
 [0 0 0 0 0 0 4 1 0 0]
 [0 0 0 0 4 0 0 5 0 2]
 [0 0 0 0 0 0 0 0 3 0]
 [0 0 0 0 0 0 0 0 2 4]] 

Classification Report: 
               precision    recall  f1-score   support

           A       1.00      1.00      1.00         7
           B       1.00      1.00      1.00         5
           C       1.00      1.00      1.00         5
           D       1.00      1.00      1.00         7
           E       0.33      0.40      0.36         5
           F       1.00      0.17      0.29         6
           G       0.44      0.80      0.57         5
           H       0.71      0.45      0.56        11
           I       0.60      1.00      0.75         3
           J       0.50      0.67      0.57         6

    accuracy                           0.72        60
   macro avg       0.76      0.75      0.71        60
weighted avg      

In [50]:
print('Error Rate: ',(1-accuracy_score(y_test,y_pred))*100,"%")

Error Rate:  28.333333333333332 %


# Performance of Logistic Regression on the same dataset after Principal Component Analysis 

In [51]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print('Confusion Matrix: \n',confusion_matrix(y_test,y_pred),'\n')
print('Classification Report: \n',classification_report(y_test,y_pred),'\n')
print('Accuracy Score:\n',accuracy_score(y_test,y_pred),'\n')
print('Error Rate: ',(1-accuracy_score(y_test,y_pred))*100,"%")

Confusion Matrix: 
 [[ 7  0  0  0  0  0  0  0  0  0]
 [ 0  3  0  2  0  0  0  0  0  0]
 [ 0  0  5  0  0  0  0  0  0  0]
 [ 0  0  0  7  0  0  0  0  0  0]
 [ 0  0  0  0  1  1  0  1  2  0]
 [ 0  0  0  0  0  0  6  0  0  0]
 [ 0  0  0  0  0  0  4  1  0  0]
 [ 0  0  0  0  1  0  0 10  0  0]
 [ 0  0  0  0  0  0  0  0  3  0]
 [ 0  0  0  0  0  0  0  0  1  5]] 

Classification Report: 
               precision    recall  f1-score   support

           A       1.00      1.00      1.00         7
           B       1.00      0.60      0.75         5
           C       1.00      1.00      1.00         5
           D       0.78      1.00      0.88         7
           E       0.50      0.20      0.29         5
           F       0.00      0.00      0.00         6
           G       0.40      0.80      0.53         5
           H       0.83      0.91      0.87        11
           I       0.50      1.00      0.67         3
           J       1.00      0.83      0.91         6

    accuracy              



# Thus, an improvement in Accuracy from 71.66% to 75% was noted in Logistic Regression after using Principal Component Analysis