# Normalization Techniques

In [12]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.spatial import distance
from numpy.linalg import matrix_power

### Retrieve Data

In [13]:
df = pd.read_csv('/Users/pavelshaburov/Desktop/ComputerVision/normalization/apples_pears.csv') 
X, y = df.drop('target', axis=1), df['target']
X = X.to_numpy()
unw = X

In [14]:
df

Unnamed: 0,yellowness,symmetry,target
0,0.779427,0.257305,1.0
1,0.777005,0.015915,1.0
2,0.977092,0.304210,1.0
3,0.043032,0.140899,0.0
4,0.760433,0.193123,1.0
...,...,...,...
995,0.288482,0.051186,0.0
996,0.892424,0.795257,0.0
997,0.927219,0.134986,1.0
998,0.015830,0.481322,0.0


### Calculate Mean + Variance + Standard Deviation + Covariance

In [60]:
N = X.shape[0]

#-------mean-------------

def mean(X):
    ux = np.sum(X[:,0])/N
    uy = np.sum(X[:,1])/N
    u = np.array([ux, uy])
    return u

#-------variance---------

ux = np.sum(X[:,0])/N
uy = np.sum(X[:,1])/N
varx = 1/N * np.sum((X[:,0]-ux)**2)
vary = 1/N *np.sum((X[:,1]-uy)**2)


def variance(X):
    varx = 1/(N) * np.sum((X[:,0]-ux)**2)
    vary = 1/(N) *np.sum((X[:,1]-uy)**2)
    var = np.array([varx, vary])
    return var

#-------standard dev------

def standard_dev(X):
    sx = np.sqrt(varx)
    sy = np.sqrt(vary)
    s = np.array([sx, sy])
    return s

#-------covariance------

def covariance(X):
    cov_xy = np.sum((X[:,0]-ux)*(X[:,1]-uy))/N
    return cov_xy

print("mean(x), mean(y): ", mean(X))
print("\nvar(x), var(y): ",variance(X))
print("\ns(x), s(y): ", standard_dev(X))
print("\ncovariance(xy): ", covariance(X))

mean(x), mean(y):  [0.65034828 0.31319579]

var(x), var(y):  [0.07386312 0.07881878]

s(x), s(y):  [0.2717777  0.28074682]

covariance(xy):  -0.02260218265472952


### Covaraince Matrix
    - always a symmatric matrix

In [20]:
def covariance_mat(P):
    Q = P - mean(P)   # recenter qi = xi - mean
    Q = Q.T           # every data point = vector
    return 1/N*np.dot(Q, Q.T)
    
Sigma = covariance_mat(X)
print("MyFunction:\n", Sigma)

np.cov(X.T)

MyFunction:
 [[ 0.07386312 -0.02260218]
 [-0.02260218  0.07881878]]


array([[ 0.07393706, -0.02262481],
       [-0.02262481,  0.07889768]])

### Mahalanobis Distance

In [21]:
# Mahalanobis distance is distance in normalized space

def mahalanobis_dist(p1, p2, P):
    sigma = covariance_mat(P)
    return np.sqrt((p1-p2).T@np.linalg.inv(sigma)@(p1-p2))

print("MyFunction:", mahalanobis_dist(X[0], X[1], X))

distance.mahalanobis(X[0], X[1],np.linalg.inv(Sigma))

MyFunction: 0.9030249724663915


0.9030249724663915

### Normalize Dataset
    - doesn't work!

In [71]:
def normalize(P):
    Q = P - mean(P)                # recenter
    Q = Q/(standard_dev(P)
    return Q

norm_X = normalize(X)

print(covariance(norm_X))
print(covariance(X))

-0.09253865756745594
-0.02260218265472952


In [72]:
print(np.std(X, axis=0))
print(standard_dev(X))

[0.2717777  0.28074682]
[0.2717777  0.28074682]


In [74]:
covariance_mat(norm_X)

array([[ 1.      , -0.296225],
       [-0.296225,  1.      ]])

In [68]:
np.cov(norm_X.T)

array([[ 1.001001  , -0.29652152],
       [-0.29652152,  1.001001  ]])

### Whitening Matrix W

In [23]:
X

array([[0.77942731, 0.25730527],
       [0.77700501, 0.0159154 ],
       [0.97709225, 0.30420962],
       ...,
       [0.92721921, 0.13498618],
       [0.01582991, 0.48132194],
       [0.96796389, 0.30857107]])

In [31]:
norm_X[:,0].var()

13.538556315367044