# PCA using NumPy

In [1]:
import numpy as np

In [2]:
# Toy data matrix for PCA: three samples (rows) by two features (columns),
# holding the integers 1..6 in row-major order.
data = np.arange(1, 7).reshape(3, 2)
data

array([[1, 2],
       [3, 4],
       [5, 6]])

In [3]:
# Per-feature mean: average over the rows (samples) of each column.
M = data.mean(axis=0)
M

array([3., 4.])

In [4]:
# Centre the data: remove each column's mean (broadcast across the rows).
# Only the mean is removed here; the columns are not divided by their spread.
scaled_data = np.subtract(data, M)
scaled_data

array([[-2., -2.],
       [ 0.,  0.],
       [ 2.,  2.]])

In [5]:
# Covariance matrix of the centred data. rowvar=False tells NumPy that the
# columns are the variables and the rows are the observations.
V = np.cov(scaled_data, rowvar=False)
V

array([[4., 4.],
       [4., 4.]])

In [6]:
# Eigen-decomposition of the covariance matrix V.
# `values` holds the eigenvalues; column i of `vectors` is the eigenvector
# that belongs to values[i]. np.linalg.eig does not promise any ordering.
values,vectors=np.linalg.eig(V)

In [7]:
# Eigenvalues: the variance explained by each principal component.
values

array([8., 0.])

In [8]:
# Eigenvectors: the principal components (one per column).
vectors

array([[ 0.70710678, -0.70710678],
       [ 0.70710678,  0.70710678]])

In [9]:
# Transposed eigenvector matrix: each row is now one principal component.
vectors.T

array([[ 0.70710678,  0.70710678],
       [-0.70710678,  0.70710678]])

In [10]:
# Centred data transposed to (features x samples), ready for the projection.
scaled_data.T

array([[-2.,  0.,  2.],
       [-2.,  0.,  2.]])

In [11]:
# Project the centred data onto the principal components.
# The result is laid out as (components x samples).
p = np.dot(vectors.T, scaled_data.T)

In [12]:
# Projected data back in (samples x components) layout.
p.T

array([[-2.82842712,  0.        ],
       [ 0.        ,  0.        ],
       [ 2.82842712,  0.        ]])

# PCA using scikit-learn

In [13]:
# Fit scikit-learn's PCA on the same toy matrix used in the NumPy walkthrough.
# The uncentred `data` is passed because PCA centres its input internally.
from sklearn.decomposition import PCA
pca=PCA()
pca.fit(data)

PCA()

In [14]:
# Principal axes, one per row. The sign of a component is arbitrary, so a row
# may be the negative of the matching NumPy eigenvector.
pca.components_

array([[ 0.70710678,  0.70710678],
       [ 0.70710678, -0.70710678]])

In [15]:
# Variance explained by each component. This matches the eigenvalues computed
# with NumPy; the tiny second value is floating-point noise around zero.
pca.explained_variance_

array([8.00000000e+00, 2.25080839e-33])

In [16]:
# Project the data with the fitted PCA; the result is (samples x components).
# Note: this rebinds `p`, which previously held the NumPy projection in the
# transposed (components x samples) layout.
p=pca.transform(data)
p

array([[-2.82842712e+00,  2.22044605e-16],
       [ 0.00000000e+00,  0.00000000e+00],
       [ 2.82842712e+00, -2.22044605e-16]])

# PCA on Iris Data

In [17]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns 

In [None]:
# Load the iris dataset; the path is relative to the notebook's working directory.
# Later cells expect the frame to contain a "species" column.
df = pd.read_csv("../Data/iris.csv")

In [None]:
# Preview the first rows to check that the file loaded as expected.
df.head()

In [None]:
# Encode the species labels as integers.
# The "species" column of `df` is overwritten in place with the encoded values;
# `le` keeps the fitted mapping. StandardScaler is imported here for a later cell.
from sklearn.preprocessing import LabelEncoder,StandardScaler
le = LabelEncoder()
df["species"] = le.fit_transform(df["species"])

In [None]:
# Split the frame into the target (encoded species) and the feature columns.
y = df["species"]
x = df.drop("species", axis=1)

In [None]:
# Hold out 20% of the samples for evaluation.
# A fixed random_state makes the split, and every score computed downstream,
# reproducible across kernel restarts and re-runs.
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Standardise the features using statistics learned from the training split only.
sc = StandardScaler()
xtrain = sc.fit_transform(xtrain)
# Reuse the training mean/std on the test split. Refitting the scaler here would
# leak test-set statistics and put the two splits on different scales.
xtest = sc.transform(xtest)

In [None]:
# Rotate the standardised features onto their principal components.
from sklearn.decomposition import PCA
pca = PCA()
# Learn the component basis from the training split only.
xtrain = pca.fit_transform(xtrain)
# Project the test split onto that same basis. Fitting a second PCA on the test
# data would produce different axes (and possibly flipped signs), so the test
# features would no longer line up with what the classifier is trained on.
xtest = pca.transform(xtest)

In [None]:
# Train a logistic-regression classifier on the training features,
# then predict labels for the held-out test features.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(xtrain, ytrain)
ypred = model.predict(xtest)

In [None]:
# Evaluate the predictions: overall accuracy plus per-class precision/recall/F1.
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(ytest, ypred)
report = classification_report(ytest, ypred)
print("Accuracy is :", accuracy)
print(report)

In [None]:
# Variance captured by each component, for whatever data `pca` was last fitted on.
pca.explained_variance_ 

In [None]:
# The same variances expressed as a fraction of the total variance.
pca.explained_variance_ratio_

In [None]:
# Scree plot: share of the total variance captured by each principal component.
fig, ax = plt.subplots()
ax.plot(pca.explained_variance_ratio_, marker="o")
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(["PC1", "PC2", "PC3", "PC4"])
ax.set_xlabel("Principal Components")
ax.set_ylabel("Explained Variance Ratio")
plt.show()

In [None]:
# Repeat the experiment keeping only the first three principal components.
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report

# NOTE: when the notebook is run top to bottom, xtrain/xtest were already
# overwritten by the earlier full-PCA cell, so this step keeps the three
# highest-variance directions of those features.
pca = PCA(n_components=3)
xtrain = pca.fit_transform(xtrain)
# Project the test split with the PCA fitted on the training split. Refitting on
# the test data would give it a different basis than the one the model sees.
xtest = pca.transform(xtest)

model = LogisticRegression()
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)

print("Accuracy is :",accuracy_score(ytest,ypred))
print(classification_report(ytest,ypred))