Principal Component Analysis (PCA) is directly related to variance. The goal of PCA is to find a new set of uncorrelated variables (principal components), each a linear combination of the original features, that capture as much of the variance in the data as possible. Geometrically, it identifies the orthogonal directions (principal axes) along which the data varies the most. The first principal component captures the most variance, the second captures the most of the remaining variance, and so on. PCA essentially reorients the data along its directions of highest variance, which makes it a useful technique for dimensionality reduction and feature extraction.







In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import datasets

In [None]:
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

In [None]:
# Display the raw feature matrix of the loaded dataset.
iris.data

In [None]:
# Standardize the features: the scaler computes the mean and standard
# deviation of each feature, then rescales every feature to have a mean of 0
# and a standard deviation of 1.
x = StandardScaler().fit_transform(iris.data)

In [None]:
# Create a PCA that keeps just enough components to retain 95% of the
# variance (n_components=0.95). whiten=True additionally rescales each
# retained component to unit variance.
pca = PCA(n_components=0.95, whiten=True)
x_pca = pca.fit_transform(x)

In [None]:
# Compare the dimensionality before and after the projection. Both lines
# report a shape; printing the full x_pca array would not match its label.
print("original features:", x.shape)
print("after pca features:", x_pca.shape)

In [None]:
# Peek at the first five rows of x.
x[:5]

In [None]:
# Peek at the first five rows of the PCA-transformed data.
x_pca[:5]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default plot styling to the matplotlib figures below.
sns.set()

In [None]:
# Build 250 correlated 2-D points: a random 2x2 matrix applied to Gaussian
# samples, drawn from a fixed seed so the figure is reproducible.
# NOTE: this rebinds `x`, replacing the standardized iris matrix from earlier.
rng = np.random.RandomState(1)
mixing = rng.rand(2, 2)
noise = rng.randn(2, 250)
x = np.dot(mixing, noise).T
plt.scatter(x[:, 0], x[:, 1])
plt.axis('equal')


In [None]:
# Fit a PCA that keeps both components of the 2-D point cloud.
pca = PCA(n_components=2)
pca.fit(x)

In [None]:
# Show the principal axes found by the fit (one row per component).
print(pca.components_)

In [None]:
def draw_vector(v0, v1, ax=None):
    """Draw an arrow from point ``v0`` to point ``v1``.

    The arrow is drawn on ``ax`` when one is supplied (and truthy); otherwise
    on the current matplotlib axes. It is rendered as an annotation with
    empty text, so only the arrow itself is visible.
    """
    target_axes = ax if ax else plt.gca()
    arrow_style = {
        'arrowstyle': '->',
        'linewidth': 2,
        # No shrink at either end: the arrow spans exactly v0 -> v1.
        'shrinkA': 0,
        'shrinkB': 0,
    }
    target_axes.annotate('', xy=v1, xytext=v0, arrowprops=arrow_style)

# Plot the data faintly, then overlay one arrow per principal axis. Each
# arrow starts at the data mean and is 3 * sqrt(explained variance) long
# along the component's direction.
plt.scatter(x[:, 0], x[:, 1], alpha=0.2)
axes_with_variance = zip(pca.explained_variance_, pca.components_)
for variance, direction in axes_with_variance:
    offset = direction * 3 * np.sqrt(variance)
    draw_vector(pca.mean_, pca.mean_ + offset)
plt.axis('equal');

In [None]:
# Reduce the 2-D points to a single principal component and compare shapes.
pca = PCA(n_components=1)
x_pca = pca.fit(x).transform(x)
print("ORIGINAL SHAPE:", x.shape)
print("AFTER PCA SHAPE:", x_pca.shape)

In [None]:
# Map the reduced data back into the original feature space and overlay it
# (more opaque) on the original points (faint).
x_new = pca.inverse_transform(x_pca)
for points, opacity in ((x, 0.2), (x_new, 0.8)):
    plt.scatter(points[:, 0], points[:, 1], alpha=opacity)
plt.axis('equal');

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

# Baseline: logistic regression on the raw iris features, scored on the same
# data it was trained on (i.e. training accuracy).
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0)
clf.fit(X, y)
clf.score(X, y)

In [None]:
# Project the iris features onto the components that retain 95% of the
# variance. The input must be the standardized iris matrix X: the notebook's
# lowercase `x` was rebound earlier to 250 synthetic 2-D points, whose row
# count does not match the 150 labels in `y` and would make the classifier
# fit below fail.
pca = PCA(n_components=0.95, whiten=True)
X_pca = pca.fit_transform(StandardScaler().fit_transform(X))

In [None]:
# Display the first sample in the PCA-transformed feature space.
X_pca[0]

In [None]:
# Refit the classifier on the PCA-reduced features and report its accuracy
# on that same data, for comparison with the raw-feature baseline.
clf = LogisticRegression(random_state=0)
clf.fit(X_pca, y)
clf.score(X_pca, y)