# Pipeline

In [1]:
# Automatically reload imported modules before each cell runs, so edits made to
# external source files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

import sys
    
from pathlib import Path

# Directory named `project`, located two levels above the current working
# directory.
# NOTE(review): the imports in the next cell use `project.…`; for them to
# resolve through this entry, the appended directory must itself contain a
# `project` package — confirm against the repository layout.
path = (
    Path
    .cwd()
    .parent
    .parent
    .joinpath('project')
)

# sys.path entries must be strings: any other type is ignored by the import
# machinery, so the Path is converted before being appended. The membership
# check keeps the cell idempotent when it is re-run.
if str(path) not in sys.path:
    sys.path.append(str(path))

In [2]:
from __future__ import annotations

import numpy as np
import pandas as pd
import sympy as sp

from project.reduction import (
    DimensionalityReduction,
    PCAStrategy
)
from project.visualizer.pca import (
    MatplotlibVisualizer,
    PCAVisualizer,
    PlotlyVisualizer
)
from project.visualizer.settings import Settings
from sklearn.datasets import (
    load_digits,
    load_iris,
    load_wine
)
from sklearn.preprocessing import StandardScaler

In [3]:
# Boolean flag; note that the name `scale` is rebound to an array in a later
# cell of this notebook.
scale = True

# Load the iris dataset and pull out the pieces used below.
loader = load_iris()

columns = loader.feature_names
data = loader.data
target = loader.target

# One row per sample, one named column per feature.
dataframe = pd.DataFrame(data, columns=columns)

In [4]:
# Continue with the last five rows only. The copy gives `dataframe` its own
# data, so the in-place assignment performed later is made on an independent
# frame. Rebinding `dataframe` means the full frame is no longer reachable
# under that name.
subset = dataframe.tail(n=5)
dataframe = subset.copy(deep=True)

In [5]:
# Inspect the five-row subset (raw measurements, before scaling)
dataframe

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


In [6]:
# Fit a StandardScaler on the subset and compute the standardized values.
# NOTE: this rebinds `scale`, which was set to the boolean `True` earlier in
# the notebook, to the scaled values.
scaler = StandardScaler()
scale = scaler.fit_transform(dataframe)

In [7]:
# Overwrite every column of `dataframe` in place with the scaled values; the
# index and column labels are kept. From here on `dataframe` no longer holds
# the raw measurements displayed above.
dataframe.loc[:, dataframe.columns] = scale

In [8]:
# Feature-by-feature covariance matrix of the standardized data. Columns are
# the variables, hence rowvar=False. np.cov normalizes by n-1 by default while
# StandardScaler divides by the population standard deviation, so the
# diagonal is n/(n-1) rather than exactly 1.
covariance = np.cov(dataframe, rowvar=False)

In [9]:
# Eigen-decomposition of the covariance matrix via SymPy. `eigenvects()`
# returns a list of (eigenvalue, multiplicity, [eigenvectors]) tuples; as the
# recorded output shows, the list is not ordered by eigenvalue.
matrix = sp.Matrix(covariance)
eigenvector  = matrix.eigenvects()

eigenvector

[(3.27976032210976,
  1,
  [Matrix([
   [-0.214555181281587],
   [-0.536847114115085],
   [-0.594051789835147],
   [-0.559342221940369]])]),
 (1.54434939341819,
  1,
  [Matrix([
   [-0.828989944487641],
   [ 0.415466530763106],
   [ 0.213512469539301],
   [-0.307531557901422]])]),
 (0.0344591710500394,
  1,
  [Matrix([
   [-0.0137755767979422],
   [ -0.596593614896672],
   [  0.766786442600076],
   [  -0.23648434111612]])]),
 (0.141431113422006,
  1,
  [Matrix([
   [ 0.516286722284707],
   [ 0.428087370225396],
   [ 0.116419275850727],
   [-0.732554281986263]])])]

In [10]:
# Eigen-decomposition of the covariance matrix with NumPy. Column i of
# `eigenvectors` is the eigenvector belonging to `eigenvalues[i]`; the order is
# not guaranteed, so the pairs are sorted explicitly further down.
# NOTE(review): `covariance` is symmetric, so np.linalg.eigh would also apply
# and guarantees real-valued results — consider switching.
eigenvalues, eigenvectors = np.linalg.eig(covariance)

In [11]:
# Eigenvalues in the order returned by np.linalg.eig (not yet sorted)
eigenvalues

array([3.27976032, 1.54434939, 0.14143111, 0.03445917])

In [12]:
# Eigenvectors, one per column, in the same order as `eigenvalues`
eigenvectors

array([[-0.21455518,  0.82898994, -0.51628672,  0.01377558],
       [-0.53684711, -0.41546653, -0.42808737,  0.59659361],
       [-0.59405179, -0.21351247, -0.11641928, -0.76678644],
       [-0.55934222,  0.30753156,  0.73255428,  0.23648434]])

In [13]:
# Positions that rank the eigenvalues from largest to smallest; used below to
# reorder both the eigenvalues and the eigenvector columns
indices = eigenvalues.argsort()[::-1]

indices

array([0, 1, 2, 3], dtype=int64)

In [14]:
# Reorder the eigenvector columns so they follow the descending eigenvalues
eigenvectors_sorted = np.take(eigenvectors, indices, axis=1)

eigenvectors_sorted

array([[-0.21455518,  0.82898994, -0.51628672,  0.01377558],
       [-0.53684711, -0.41546653, -0.42808737,  0.59659361],
       [-0.59405179, -0.21351247, -0.11641928, -0.76678644],
       [-0.55934222,  0.30753156,  0.73255428,  0.23648434]])

In [15]:
# Eigenvalues in descending order, aligned with `eigenvectors_sorted`
eigenvalues_sorted = np.take(eigenvalues, indices)

eigenvalues_sorted

array([3.27976032, 1.54434939, 0.14143111, 0.03445917])

In [16]:
# Choose the top 2 eigenvectors: the columns were sorted by descending
# eigenvalue above, so the first k columns are the k leading components
k = 2
top_k_eigenvectors = eigenvectors_sorted[:, :k]

top_k_eigenvectors

array([[-0.21455518,  0.82898994],
       [-0.53684711, -0.41546653],
       [-0.59405179, -0.21351247],
       [-0.55934222,  0.30753156]])

In [17]:
# Project the standardized rows onto the k leading eigenvectors:
# (rows x features) @ (features x k) -> (rows x k)
transform = dataframe.to_numpy() @ top_k_eigenvectors

transform

array([[-1.07960806,  1.45832961],
       [ 2.15853585,  0.68774256],
       [-0.1065141 ,  0.39913854],
       [-2.33148071, -0.97317079],
       [ 1.35906702, -1.57203992]])

In [18]:
# Run the project's PCA strategy on the same standardized frame.
dataset = dataframe.to_numpy()

# `dataframe` holds only the rows kept by `tail()`, so select the labels for
# exactly those rows; otherwise `target` (all samples) and `dataset` (five
# rows) have different lengths. The frame still carries its original default
# integer row labels, which double as positions into `loader.target`.
target = loader.target[dataframe.index.to_numpy()]

strategy = PCAStrategy(dataframe=dataframe, dataset=dataset, target=target)
reduction = DimensionalityReduction(strategy=strategy)

# Number of components to keep.
n_components = 2
transformation = reduction.reduce(n_components=n_components)

In [19]:
# Result of the project's reduction. In the recorded output the values equal
# the manual `transform` with both columns negated; eigenvector signs are
# arbitrary, so the two projections are equivalent.
transformation

array([[ 1.07960806, -1.45832961],
       [-2.15853585, -0.68774256],
       [ 0.1065141 , -0.39913854],
       [ 2.33148071,  0.97317079],
       [-1.35906702,  1.57203992]])