In [None]:
import gower
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from scipy.stats import chi2
import matplotlib.pyplot as plt
import os
import sys

In [None]:
# Project helpers live in ../scripts (relative to the current working directory);
# put that folder on the import path, then import the custom modules from it.
sys.path.append(os.path.abspath(os.path.join(os.pardir, 'scripts')))
import preprocessing
import utils

In [None]:
# Load the dataset through the project's preprocessing helper.
df = preprocessing.load_dataset()
# Pairwise Gower matrix over all rows of the dataset.
dist_matrix = gower.gower_matrix(data_x=df)

In [None]:
# Convert the distance matrix into a covariance/similarity matrix (see linked paper):
# similarity = 1 - distance, element-wise.
# NOTE: dist_matrix is overwritten here, so running this cell a second time
# flips the values back to distances; run it exactly once per kernel.
all_ones = np.ones(dist_matrix.shape)
dist_matrix = np.subtract(all_ones, dist_matrix)

In [None]:
## explained_variances will contain tuples (num_of_components, explained_variance),
## where explained_variance is the cumulative explained-variance ratio of the
## first num_of_components principal components.
# The original loop refitted PCA once per candidate count (about 2 minutes in
# total). The ratio explained by the first k components is the running sum of
# explained_variance_ratio_, so one fit with the largest candidate count is
# enough. With an approximate SVD solver the values may differ very slightly
# from per-count fits; random_state makes them reproducible across runs.
max_components = min(19, *dist_matrix.shape)
pca = PCA(n_components=max_components, random_state=0)
pca.fit(dist_matrix)
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)

# Keep the (0, 0.0) entry so the list position still equals the component count.
explained_variances = [(0, 0.0)] + [
    (n_components, ratio)
    for n_components, ratio in enumerate(cumulative_ratio, start=1)
]

In [None]:
# Display the (num_of_components, explained_variance) tuples collected above.
explained_variances

In [None]:
# We take the minimum number of components that explains at least 98% of the variance
threshold_variance = 0.98
# ">=" so that a ratio exactly at the threshold counts as "at least";
# -1 is kept as the "not found" sentinel.
optimal_n_components = next(
    (n for n, ratio in explained_variances if ratio >= threshold_variance),
    -1,
)
print("Optimal # of components: ", optimal_n_components)

# Plot against the stored component counts instead of relying on list position.
component_counts = [n for (n, _) in explained_variances]
cumulative_ratios = [ratio for (_, ratio) in explained_variances]
plt.plot(component_counts, cumulative_ratios)
plt.xlabel('number of components')
plt.ylabel('explained variance')
plt.show()

# Fail here with a clear message rather than letting the -1 sentinel reach
# PCA(n_components=NCOMP) in a later cell.
if optimal_n_components < 1:
    raise ValueError(
        "No tested number of components reaches the explained-variance threshold "
        f"of {threshold_variance}; scan more components or lower the threshold."
    )

NCOMP = optimal_n_components

In [None]:
# Fit PCA with the selected number of components and project the data onto them.
# random_state keeps the result reproducible if an approximate SVD solver is used.
pca = PCA(n_components=NCOMP, random_state=0)
pca_result = pca.fit_transform(dist_matrix)
# alpha is the percentile of the chi-squared distribution used as the cut-off
alpha = 0.99
# chi-squared quantile for the given alpha, with one degree of freedom per component
chi_2 = chi2.ppf(alpha, df=NCOMP)

# Eigenvalues of the covariance matrix: explained_variance_ already holds them
# (the variance along each component). Taking their square root, as before,
# gave standard deviations, so the statistic below was not the squared
# Mahalanobis distance and could not be compared with a chi-squared quantile.
lambdas = pca.explained_variance_

## Sum of the squared coordinates divided by the eigenvalues (squared
## Mahalanobis distance in the PCA space); a row whose value exceeds the
## chi2 cut-off is considered an outlier.
mahalanobis_sq = np.sum((pca_result ** 2) / lambdas, axis=1)
# Flags keep the original encoding: -1 for an outlier, 0 otherwise.
outlier_indices = -(mahalanobis_sq > chi_2).astype(int)

In [None]:
# Number of rows flagged as outliers. Outliers carry a non-zero flag, so count
# the non-zero entries; summing the 0/-1 flags reported the count negated.
int(np.count_nonzero(outlier_indices))

In [None]:
# Plot with the project's t-SNE helper. The Gower matrix is recomputed from df
# for the plot, and the flags are multiplied by -1 before being passed as labels.
utils.plot_TSNE_2(
    dist_matrix=gower.gower_matrix(df),
    labels=np.multiply(np.array(outlier_indices), -1),
)