# Covariance

In [1]:
# Automatically reload imported modules whenever their source files change
%load_ext autoreload
%autoreload 2

import sys
    
from pathlib import Path

# Directory named `project`, located two levels above the notebook's
# current working directory.
path = (
    Path
    .cwd()
    .parent
    .parent
    .joinpath('project')
)

# sys.path entries must be plain strings: the import system ignores any other
# type, so appending the Path object itself would silently have no effect.
# The membership check keeps re-runs of this cell from adding duplicates.
if str(path) not in sys.path:
    sys.path.append(str(path))

In [2]:
from __future__ import annotations

import numpy as np
import pandas as pd

from project.reduction import (
    DimensionalityReduction,
    PCAStrategy
)
from project.visualizer.pca import (
    MatplotlibVisualizer,
    PCAVisualizer,
    PlotlyVisualizer
)
from project.visualizer.settings import Settings
from sklearn.datasets import (
    load_digits,
    load_iris,
    load_wine
)
from sklearn.preprocessing import StandardScaler

In [3]:
# NOTE(review): this flag is not read by any cell shown here before `scale`
# is rebound to a number and then to an array - confirm whether it is needed.
scale = True

# Load the Iris dataset and pull out the pieces used by the notebook.
loader = load_iris()
columns = loader.feature_names
data = loader.data
target = loader.target

# One row per sample, one named column per feature.
dataframe = pd.DataFrame(data, columns=columns)

In [4]:
# Work on the last five rows only so every calculation below can be checked
# by hand; the copy detaches the working frame from the full dataset.
subset = dataframe.tail(n=5)
dataframe = subset.copy()

In [5]:
# Show the five-row working frame (raw, unscaled measurements at this point).
dataframe

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


By default, pandas computes the standard deviation with Bessel's correction (`ddof=1`): the sum of squared differences is divided by $n-1$ instead of $n$, where $n$ is the number of data points. `StandardScaler`, however, computes the standard deviation without this correction, i.e. it divides by $n$. The `ddof` (Delta Degrees of Freedom) argument is therefore set to 0 below so that the manual calculation matches the behavior of `StandardScaler`.

In [6]:
# Standardize a single value by hand: z = (x - mean) / std.
sepal_length = dataframe['sepal length (cm)']

mean = sepal_length.mean()
# ddof=0 -> divide by n (population standard deviation).
std = sepal_length.std(ddof=0)

# First row of the working frame.
point = sepal_length.iloc[0]

scale = (point - mean) / std
print(scale)

1.4006985834715227


In [7]:
# Standardize every column with scikit-learn; the result is a 2-D array with
# one row per sample and one column per feature.
scaler = StandardScaler()
scale = scaler.fit_transform(dataframe)

# Same cell as the manual calculation: first row, first feature.
point = scale[0, 0]
print(point)

1.4006985834715227


In [8]:
# Replace the raw measurements with their standardized values while keeping
# the original row labels and column names. copy=True keeps the frame's data
# independent of the `scale` array.
dataframe = pd.DataFrame(
    scale,
    index=dataframe.index,
    columns=dataframe.columns,
    copy=True
)

dataframe

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
145,1.400699,0.070014,0.150756,1.165543
146,-0.073721,-1.680336,-1.356801,-0.777029
147,0.663489,0.070014,0.150756,-0.291386
148,-0.442326,1.470294,1.658312,1.165543
149,-1.548141,0.070014,-0.603023,-1.262672


In [9]:
# Covariance between the first two standardized features, computed twice
# (NumPy and by hand) to confirm that both approaches agree.

# NumPy: np.cov returns the 2x2 covariance matrix of the two inputs; the
# off-diagonal entry [0, 1] is their covariance. By default it divides
# by (n - 1).
covariance_numpy = np.cov(scale[:, 0], scale[:, 1])[0, 1]
print(f"NumPy: {covariance_numpy}")

# Manual: sum of the products of the deviations from each mean,
# divided by (n - 1).
x_col = dataframe.iloc[:, 0]
y_col = dataframe.iloc[:, 1]

# Row count taken straight from the column; no throwaway list copy needed.
sample = len(x_col)

x_mean = x_col.mean()
y_mean = y_col.mean()

covariance = ((x_col - x_mean) * (y_col - y_mean)).sum() / (sample - 1)
print(f"Manual: {covariance}")

# Fail loudly if the two computations ever diverge.
np.testing.assert_allclose(covariance, covariance_numpy)

NumPy: -0.1225856456350881
Manual: -0.1225856456350881


In [10]:
# Full feature-by-feature covariance matrix. rowvar=False tells NumPy that
# each column (not each row) is a variable, so no transpose is needed.
# The diagonal is 1.25 rather than 1 because the data were standardized with
# the population standard deviation (divide by n) while np.cov divides by
# n - 1: with n = 5 rows, n / (n - 1) = 1.25.
covariance = np.cov(dataframe.to_numpy(), rowvar=False)
covariance

array([[ 1.25      , -0.12258565,  0.15281551,  0.73394247],
       [-0.12258565,  1.25      ,  1.17424467,  0.74803974],
       [ 0.15281551,  1.17424467,  1.25      ,  0.9700779 ],
       [ 0.73394247,  0.74803974,  0.9700779 ,  1.25      ]])