# Covariance

In [1]:
# Reload the notebook if an external file is updated
%load_ext autoreload
%autoreload 2

import sys
    
from pathlib import Path

path = (
    Path
    .cwd()
    .parent
    .parent
    .joinpath('project')
)

sys.path.append(path)

In [2]:
from __future__ import annotations

import numpy as np
import pandas as pd

from project.reduction import (
    DimensionalityReduction,
    PCAStrategy
)
from project.visualizer.pca import (
    MatplotlibVisualizer,
    PCAVisualizer,
    PlotlyVisualizer
)
from project.visualizer.settings import Settings
from sklearn.datasets import (
    load_digits,
    load_iris,
    load_wine
)
from sklearn.preprocessing import StandardScaler

In [3]:
scale = True

loader = load_iris()

columns, data, target = (
    loader.feature_names,
    loader.data,
    loader.target
)

dataframe = pd.DataFrame(columns=columns, data=data)

In [4]:
dataframe

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In Python, the standard deviation is calculated using the Bessel's correction, where the sum of squared differences is divided by $n−1$ instead of $n$ (where $n$ is the number of data points). However, in the case of the `StandardScaler`, the standard deviation is calculated without this correction. The `ddof` (Delta Degrees of Freedom) argument is set to 0 to match the behavior of `StandardScaler`

In [5]:
mean = dataframe['sepal length (cm)'].mean()
std = dataframe['sepal length (cm)'].std(ddof=0)

print(round(mean, 2), round(std, 2))

5.84 0.83


In [6]:
mean = dataframe['sepal width (cm)'].mean()
std = dataframe['sepal width (cm)'].std(ddof=0)

print(round(mean, 2), round(std, 2))

3.06 0.43


In [7]:
mean = dataframe['petal length (cm)'].mean()
std = dataframe['petal length (cm)'].std(ddof=0)

print(round(mean, 2), round(std, 2))

3.76 1.76


In [8]:
mean = dataframe['petal width (cm)'].mean()
std = dataframe['petal width (cm)'].std(ddof=0)

print(round(mean, 2), round(std, 2))

1.2 0.76


In [9]:
mean = dataframe['petal width (cm)'].mean()
std = dataframe['petal width (cm)'].std(ddof=0)

point = dataframe['sepal length (cm)'].iloc[0]

scale = (point - mean) / std
print(scale)

5.134532735217001


In [10]:
scaler = StandardScaler()
scale = scaler.fit_transform(dataframe)

point = scale[0][0]
print(point)

-0.9006811702978099


In [11]:
dataframe.loc[:, dataframe.columns] = scale

dataframe

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


In [12]:
rounded = dataframe.applymap(lambda x: round(x, 2))

rounded

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.90,1.02,-1.34,-1.32
1,-1.14,-0.13,-1.34,-1.32
2,-1.39,0.33,-1.40,-1.32
3,-1.51,0.10,-1.28,-1.32
4,-1.02,1.25,-1.34,-1.32
...,...,...,...,...
145,1.04,-0.13,0.82,1.45
146,0.55,-1.28,0.71,0.92
147,0.80,-0.13,0.82,1.05
148,0.43,0.79,0.93,1.45


In [13]:
# NumPy:
covariance = np.cov(scale[:, 0], scale[:, 1])[0, 1]
print(f"NumPy: {covariance}")

# Manual
x = dataframe['sepal length (cm)'].tolist()
sample = len(x)

x_col = dataframe.iloc[:, 0]
y_col = dataframe.iloc[:, 1]
x_mean = x_col.mean()
y_mean = y_col.mean()

covariance = ((x_col - x_mean) * (y_col - y_mean)).sum() / (sample - 1)
print(f"Manual: {covariance}")

NumPy: -0.11835884308691488
Manual: -0.11835884308691487


In [14]:
covariance = np.cov(dataframe.T)
covariance

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

In [15]:
rounded = np.round(covariance, decimals=2)
rounded

array([[ 1.01, -0.12,  0.88,  0.82],
       [-0.12,  1.01, -0.43, -0.37],
       [ 0.88, -0.43,  1.01,  0.97],
       [ 0.82, -0.37,  0.97,  1.01]])