# Pipeline

In [1]:
# Automatically reload imported modules when their source files change
%load_ext autoreload
%autoreload 2

import sys
    
from pathlib import Path

# Locate the `project` directory that sits two levels above the current
# working directory.
path = Path.cwd().parent.parent.joinpath('project')

# `sys.path` entries must be plain strings: the import machinery ignores any
# other type, so a `Path` object appended here would silently have no effect.
# The membership guard keeps the cell idempotent when it is re-run.
if str(path) not in sys.path:
    sys.path.append(str(path))

In [2]:
from __future__ import annotations

import numpy as np
import pandas as pd
import sympy as sp

from project.reduction import (
    DimensionalityReduction,
    PCAStrategy
)
from project.visualizer.pca import (
    MatplotlibVisualizer,
    PCAVisualizer,
    PlotlyVisualizer
)
from project.visualizer.settings import Settings
from sklearn.datasets import (
    load_digits,
    load_iris,
    load_wine
)
from sklearn.preprocessing import StandardScaler

In [3]:
# NOTE(review): this flag is not read by any visible cell, and the name `scale`
# is rebound to the standardized array further down — confirm it is still needed.
scale = True

# Load the iris bunch and pull out the pieces used below.
loader = load_iris()

columns = loader.feature_names
data = loader.data
target = loader.target

# One row per sample, one named column per feature.
dataframe = pd.DataFrame(data=data, columns=columns)

In [4]:
# Preview the feature table as loaded, before any scaling is applied.
dataframe

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [5]:
# Fit the scaler on the feature table, then apply it to the same table.
# `scale` receives the standardized values (this rebinds the earlier flag).
scaler = StandardScaler()
scaler.fit(dataframe)
scale = scaler.transform(dataframe)

In [6]:
# Rebind `dataframe` to a new frame holding the standardized values instead of
# assigning through `.loc`. The original frame was constructed directly from
# `loader.data`, so an in-place write can also alter that source array
# (depending on the pandas version / copy-on-write setting). Row and column
# labels are carried over unchanged, so later cells see the same layout.
dataframe = pd.DataFrame(
    data=scale,
    columns=dataframe.columns,
    index=dataframe.index
)

In [19]:
# Inspect the feature table after the standardized values were written back.
dataframe

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


In [7]:
# Feature-by-feature covariance: rows of the frame are observations and
# columns are variables, hence `rowvar=False` instead of transposing.
covariance = np.cov(dataframe.to_numpy(), rowvar=False)

In [8]:
# Cross-check with SymPy: `eigenvects()` yields one
# (eigenvalue, multiplicity, [basis vectors]) triple per eigenvalue.
matrix = sp.Matrix(covariance)
eigenvector = matrix.eigenvects()

eigenvector

[(2.93808505020000,
  1,
  [Matrix([
   [ -0.52106591467012],
   [ 0.269347442505943],
   [-0.580413095796294],
   [-0.564856535779361]])]),
 (0.920164904162488,
  1,
  [Matrix([
   [ -0.377417615564567],
   [ -0.923295659540715],
   [-0.0244916090855863],
   [-0.0669419869680587]])]),
 (0.0208538621764622,
  1,
  [Matrix([
   [ 0.261286279952452],
   [-0.123509619585519],
   [-0.801449246335988],
   [ 0.523597134566191]])]),
 (0.147741821044948,
  1,
  [Matrix([
   [ 0.719566352700817],
   [-0.244381779514399],
   [-0.142126369333904],
   [-0.634272737110923]])])]

In [9]:
# Eigendecomposition of the covariance matrix with NumPy. Column i of
# `eigenvectors` is the eigenvector paired with `eigenvalues[i]`.
# NOTE(review): the covariance matrix is symmetric, so `np.linalg.eigh` may be
# a better fit — confirm before switching, since ordering and signs can differ.
eigenvalues, eigenvectors = np.linalg.eig(covariance)

In [10]:
# Eigenvalues, in the order returned by the decomposition.
eigenvalues

array([2.93808505, 0.9201649 , 0.14774182, 0.02085386])

In [11]:
# Eigenvectors as columns, in the same order as `eigenvalues`.
eigenvectors

array([[ 0.52106591, -0.37741762, -0.71956635,  0.26128628],
       [-0.26934744, -0.92329566,  0.24438178, -0.12350962],
       [ 0.5804131 , -0.02449161,  0.14212637, -0.80144925],
       [ 0.56485654, -0.06694199,  0.63427274,  0.52359713]])

In [12]:
# Positions of the eigenvalues from largest to smallest: ascending argsort,
# then reversed.
indices = np.flip(np.argsort(eigenvalues))

indices

array([0, 1, 2, 3], dtype=int64)

In [13]:
# Reorder the eigenvector columns to follow the descending-eigenvalue order.
eigenvectors_sorted = np.take(eigenvectors, indices, axis=1)

eigenvectors_sorted

array([[ 0.52106591, -0.37741762, -0.71956635,  0.26128628],
       [-0.26934744, -0.92329566,  0.24438178, -0.12350962],
       [ 0.5804131 , -0.02449161,  0.14212637, -0.80144925],
       [ 0.56485654, -0.06694199,  0.63427274,  0.52359713]])

In [14]:
# Apply the same ordering to the eigenvalues so pairs stay aligned.
eigenvalues_sorted = np.take(eigenvalues, indices)

eigenvalues_sorted

array([2.93808505, 0.9201649 , 0.14774182, 0.02085386])

In [15]:
# Keep the k eigenvectors with the largest eigenvalues: the first k columns of
# the sorted matrix. These columns form the projection basis used below.
# NOTE(review): `k` duplicates the `n_components = 2` set in a later cell.
k = 2
top_k_eigenvectors = eigenvectors_sorted[:, :k]

top_k_eigenvectors

array([[ 0.52106591, -0.37741762],
       [-0.26934744, -0.92329566],
       [ 0.5804131 , -0.02449161],
       [ 0.56485654, -0.06694199]])

In [22]:
# Standardized feature table that is projected onto the selected eigenvectors.
dataframe

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


In [20]:
# (samples, features) — the left operand of the projection product.
dataframe.shape

(150, 4)

In [21]:
# (features, k) — its first dimension must match the frame's column count.
top_k_eigenvectors.shape

(4, 2)

In [16]:
# Project every standardized sample onto the top-k eigenvectors:
# (samples x features) @ (features x k) -> (samples x k).
transform = dataframe.to_numpy() @ top_k_eigenvectors

transform

array([[-2.26470281, -0.4800266 ],
       [-2.08096115,  0.67413356],
       [-2.36422905,  0.34190802],
       [-2.29938422,  0.59739451],
       [-2.38984217, -0.64683538],
       [-2.07563095, -1.48917752],
       [-2.44402884, -0.0476442 ],
       [-2.23284716, -0.22314807],
       [-2.33464048,  1.11532768],
       [-2.18432817,  0.46901356],
       [-2.1663101 , -1.04369065],
       [-2.32613087, -0.13307834],
       [-2.2184509 ,  0.72867617],
       [-2.6331007 ,  0.96150673],
       [-2.1987406 , -1.86005711],
       [-2.26221453, -2.68628449],
       [-2.2075877 , -1.48360936],
       [-2.19034951, -0.48883832],
       [-1.898572  , -1.40501879],
       [-2.34336905, -1.12784938],
       [-1.914323  , -0.40885571],
       [-2.20701284, -0.92412143],
       [-2.7743447 , -0.45834367],
       [-1.81866953, -0.08555853],
       [-2.22716331, -0.13725446],
       [-1.95184633,  0.62561859],
       [-2.05115137, -0.24216355],
       [-2.16857717, -0.52714953],
       [-2.13956345,

In [23]:
# Round the projected coordinates to two decimals for easier reading.
# NOTE(review): `transformation` is only assigned in a later cell (In [17]), so
# on a fresh top-to-bottom run this cell raises NameError. Move it below that
# cell, or round `transform` from the cell above if the hand-computed
# projection was the intended input.
rounded = np.round(transformation, decimals=2)
rounded

array([[-2.26,  0.48],
       [-2.08, -0.67],
       [-2.36, -0.34],
       [-2.3 , -0.6 ],
       [-2.39,  0.65],
       [-2.08,  1.49],
       [-2.44,  0.05],
       [-2.23,  0.22],
       [-2.33, -1.12],
       [-2.18, -0.47],
       [-2.17,  1.04],
       [-2.33,  0.13],
       [-2.22, -0.73],
       [-2.63, -0.96],
       [-2.2 ,  1.86],
       [-2.26,  2.69],
       [-2.21,  1.48],
       [-2.19,  0.49],
       [-1.9 ,  1.41],
       [-2.34,  1.13],
       [-1.91,  0.41],
       [-2.21,  0.92],
       [-2.77,  0.46],
       [-1.82,  0.09],
       [-2.23,  0.14],
       [-1.95, -0.63],
       [-2.05,  0.24],
       [-2.17,  0.53],
       [-2.14,  0.31],
       [-2.27, -0.34],
       [-2.14, -0.5 ],
       [-1.83,  0.42],
       [-2.61,  1.79],
       [-2.45,  2.15],
       [-2.11, -0.46],
       [-2.21, -0.21],
       [-2.05,  0.66],
       [-2.53,  0.59],
       [-2.43, -0.9 ],
       [-2.17,  0.27],
       [-2.29,  0.44],
       [-1.86, -2.34],
       [-2.55, -0.48],
       [-1.

In [17]:
# Number of components to keep in the reduced representation.
n_components = 2

# Inputs for the project's PCA strategy: class labels plus the standardized
# features, passed both as a frame and as a plain array.
target = loader.target
dataset = dataframe.to_numpy()

strategy = PCAStrategy(
    dataframe=dataframe,
    dataset=dataset,
    target=target
)
reduction = DimensionalityReduction(strategy=strategy)

# Run the same reduction through the project API for comparison with the
# hand-computed `transform`.
transformation = reduction.reduce(n_components=n_components)

In [18]:
# Result of the project's reduction; compare with the manual `transform` above
# (the recorded outputs differ only in the sign of the second column).
transformation

array([[-2.26470281,  0.4800266 ],
       [-2.08096115, -0.67413356],
       [-2.36422905, -0.34190802],
       [-2.29938422, -0.59739451],
       [-2.38984217,  0.64683538],
       [-2.07563095,  1.48917752],
       [-2.44402884,  0.0476442 ],
       [-2.23284716,  0.22314807],
       [-2.33464048, -1.11532768],
       [-2.18432817, -0.46901356],
       [-2.1663101 ,  1.04369065],
       [-2.32613087,  0.13307834],
       [-2.2184509 , -0.72867617],
       [-2.6331007 , -0.96150673],
       [-2.1987406 ,  1.86005711],
       [-2.26221453,  2.68628449],
       [-2.2075877 ,  1.48360936],
       [-2.19034951,  0.48883832],
       [-1.898572  ,  1.40501879],
       [-2.34336905,  1.12784938],
       [-1.914323  ,  0.40885571],
       [-2.20701284,  0.92412143],
       [-2.7743447 ,  0.45834367],
       [-1.81866953,  0.08555853],
       [-2.22716331,  0.13725446],
       [-1.95184633, -0.62561859],
       [-2.05115137,  0.24216355],
       [-2.16857717,  0.52714953],
       [-2.13956345,