In [257]:
import numpy as np
import pandas as pd
import pickle
import sys
import os
import dsc
from dsc.query_engine import Query_Processor as dscQP
from dsc import dsc_io

import matplotlib.pyplot as plt
from pymir import mpl_stylesheet
from pymir import mpl_utils

mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 120)
from scipy.spatial import procrustes
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics as skmetrics


def mean_squared_error(original, recovered, mask = None):
    '''
    Masked mean squared error between two arrays.

    The element-wise difference is multiplied by `mask` before squaring,
    and the sum of squares is divided by the sum of the mask entries.
    When `mask` is None every element of `original` is used.
    '''
    if mask is None:
        mask = np.ones_like(original)
    masked_residual = (original - recovered) * mask
    return np.sum(np.square(masked_residual)) / np.sum(mask)


def peak_signal_to_noise_ratio(original, recovered, mask = None):
    '''
    Peak signal-to-noise ratio (in dB) of `recovered` relative to `original`.

    The peak signal is the squared range (max - min) of the entries of
    `original` where `mask == 1`; the noise is the masked mean squared error.
    When `mask` is None every element of `original` is used.
    '''
    if mask is None:
        mask = np.ones_like(original)
    observed = original[mask == 1]
    signal_range = np.max(observed) - np.min(observed)
    noise = mean_squared_error(original, recovered, mask)
    return 10 * np.log10(np.square(signal_range) / noise)


def matrix_dissimilarity_scores(original, recovered, mask = None, match = 'zerofill'):
    '''
    Compare two matrices after Procrustes alignment and return (RMSE, PSNR).

    scipy.spatial.procrustes standardizes both matrices, transforms the
    second one onto the first and returns the two transformed matrices
    together with the disparity (sum of squared pointwise differences).
    The RMSE is sqrt(disparity / sum(mask)); the peak signal-to-noise ratio
    (PSNR) is computed on the transformed matrices.

    Input matrices must have the same number of rows but can have a
    different number of columns. There are two ways to match the columns:
        - clip: remove information from the larger matrix
        - zerofill: pad zero columns in the smaller matrix

    Parameters
    ----------
    original, recovered : 2-D arrays with the same number of rows.
    mask : optional 2-D array of 0/1 weights. If None, all entries of the
        column-matched matrices are used. If its column count differs from
        the matched column count it is clipped, or padded with ones, so
        that it has the same shape as the matched matrices.
    match : 'clip' or 'zerofill'.

    Returns
    -------
    (rmse, psnr) : tuple of floats.

    Raises
    ------
    ValueError
        If the row counts differ or `match` is not a supported option.
    '''
    m = original.shape[0]
    if recovered.shape[0] != m:
        raise ValueError(
            "original and recovered must have the same number of rows, "
            "got {} and {}".format(m, recovered.shape[0]))
    n_orig = original.shape[1]
    n_recv = recovered.shape[1]
    if match == 'clip':
        n = min(n_orig, n_recv)
        X = original[:, :n]
        Y = recovered[:, :n]
    elif match == 'zerofill':
        n = max(n_orig, n_recv)
        X = np.zeros((m, n))
        Y = np.zeros((m, n))
        X[:, :n_orig] = original
        Y[:, :n_recv] = recovered
    else:
        # Previously an unknown option fell through and failed later with an
        # unrelated UnboundLocalError on X / Y.
        raise ValueError(
            "match must be 'clip' or 'zerofill', got {!r}".format(match))
    # The mask has to agree with the *matched* matrices, not with `original`,
    # otherwise the boolean indexing inside the PSNR fails whenever the column
    # counts of the two inputs force clipping or padding of `original`.
    if mask is None:
        mask = np.ones_like(X)
    else:
        mask = np.asarray(mask)
        n_cols = mask.shape[1]
        if n_cols > n:
            mask = mask[:, :n]
        elif n_cols < n:
            padded = np.ones((m, n), dtype = mask.dtype)
            padded[:, :n_cols] = mask
            mask = padded
    n_mask = np.sum(mask)
    R_orig, R_recv, m2 = procrustes(X, Y)
    mse = m2 / n_mask
    psnr = peak_signal_to_noise_ratio(R_orig, R_recv, mask)
    return np.sqrt(mse), psnr


def adjusted_mutual_information_score(X, class_labels, n_clusters = 5):
    '''
    Cluster the rows of X and score the clustering against known labels.

    The rows of X are column-centered, pairwise Euclidean distances are
    computed, and average-linkage agglomerative clustering is run on the
    precomputed distance matrix. The predicted clusters are compared with
    `class_labels` using the adjusted mutual information (AMI).

    Parameters
    ----------
    X : 2-D array, one row per sample.
    class_labels : 1-D array-like of reference labels, one per row of X.
    n_clusters : number of clusters to form (default 5, the value that was
        previously hard-coded). If None, the number of distinct values in
        `class_labels` is used.

    Returns
    -------
    float : adjusted mutual information score.
    '''
    if n_clusters is None:
        n_clusters = len(np.unique(class_labels))
    X_cent = X - np.mean(X, axis = 0, keepdims = True)
    distance_matrix = skmetrics.pairwise.pairwise_distances(X_cent, metric = 'euclidean')
    model = AgglomerativeClustering(n_clusters = n_clusters, linkage = 'average', metric = 'precomputed')
    class_pred = model.fit_predict(distance_matrix)
    return skmetrics.adjusted_mutual_info_score(class_labels, class_pred)

In [172]:
# Extend the module search path before the `import GUIDE` at the end of this cell.
# NOTE(review): presumably GUIDE is only importable from this checkout — confirm.
sys.path.append("/gpfs/commons/home/sbanerjee/software/GUIDE")
# NOTE(review): absolute, user-specific paths; they must be adapted to run elsewhere.
dsc_outdir="/gpfs/commons/home/sbanerjee/simdata/low_rank_matrix_approximation_numerical_experiments/lrma"
# Pickle files under dsc_outdir: the input data set and one result file
# (loaded in the next cell as `data` and `nnm_sparse_res`).
data_file = os.path.join(dsc_outdir, "blockdiag", "blockdiag_2.pkl")
nnm_sparse_resfile = os.path.join(dsc_outdir, "truncated_svd", "blockdiag_2_nnm_sparse_1_truncated_svd_1.pkl")

import GUIDE

In [173]:
# Load both pickle files with dsc_io.load_dsc; the following cells list their keys.
data  = dsc_io.load_dsc(data_file)
nnm_sparse_res = dsc_io.load_dsc(nnm_sparse_resfile)

In [174]:
# Show which entries the loaded data set provides.
data.keys()

dict_keys(['Z', 'Zmask', 'effect_size_obs', 'effect_size_true', 'Ltrue', 'Ftrue', 'Mtrue', 'Ctrue', 'nsample', 'DSC_DEBUG'])

In [175]:
# Show which entries the loaded result file provides.
nnm_sparse_res.keys()

dict_keys(['L_est', 'F_est', 'S2', 'DSC_DEBUG'])

In [203]:
# Take the matrix stored under 'Z' and subtract the mean over axis 0,
# i.e. center every column.
Z = data['Z']
Z_cent = Z - np.mean(Z, axis = 0, keepdims = True)
# Z_cent /= np.sqrt(np.prod(Z_cent.shape))

In [219]:
# Run GUIDE on the transposed, column-centered matrix with L=10.
# NOTE(review): Z_cent is already column-centered above and mean_center = True
# is passed as well — presumably GUIDE centers again internally; confirm.
F_est, Lt, Sc, mix = GUIDE.guide(Z_cent.T, L=10, mean_center = True, standardize = True)
# Transpose the second return value; the shape cells below show
# F_est is (2000, 10) and L_est is (200, 10).
L_est = Lt.T



In [220]:
# Element-wise square of Sc, the third value returned by GUIDE.guide.
# NOTE(review): presumably per-component scales — confirm against GUIDE docs.
np.square(Sc)

array([51683.6063688 , 27753.84153789, 13667.50667885, 11240.85760548,
       10376.33925674,  9829.37281996,  9066.89005089,  8248.61444404,
        7355.32006102,  6860.93509499])

In [221]:
# Sum of the 'S2' entry of the loaded result file, shown next to np.square(Sc) above.
np.sum(nnm_sparse_res['S2'])

0.6257070555636473

In [222]:
# Shape of the first value returned by GUIDE.guide.
F_est.shape

(2000, 10)

In [223]:
# Shape of the transposed second value returned by GUIDE.guide.
L_est.shape

(200, 10)

## RMSE and PSNR for loadings L

In [224]:
# (RMSE, PSNR) of the GUIDE estimate L_est against data['Ltrue'], after Procrustes alignment.
matrix_dissimilarity_scores(data['Ltrue'], L_est)

(0.009812312816604156, 23.78197564382294)

In [225]:
# (RMSE, PSNR) of the stored nnm_sparse_res['L_est'] against data['Ltrue'], after Procrustes alignment.
matrix_dissimilarity_scores(data['Ltrue'], nnm_sparse_res['L_est'])

(0.005696756891929372, 28.504849616876662)

## RMSE and PSNR for factors F

In [226]:
# (RMSE, PSNR) of the GUIDE estimate F_est against data['Ftrue'], after Procrustes alignment.
matrix_dissimilarity_scores(data['Ftrue'], F_est)

(0.002567745389640889, 27.15887534273335)

In [227]:
# (RMSE, PSNR) of the stored nnm_sparse_res['F_est'] against data['Ftrue'], after Procrustes alignment.
matrix_dissimilarity_scores(data['Ftrue'], nnm_sparse_res['F_est'])

(0.002569050783256412, 27.154460721346357)

## RMSE and PSNR for Z

In [228]:
def Z_rmse(Ztrue, Zrecv):
    '''
    Column-center both matrices and return (RMSE, PSNR) between them.

    Each matrix has its own mean over axis 0 subtracted before the
    unmasked mean squared error and peak signal-to-noise ratio are computed.
    '''
    centered_true = Ztrue - np.mean(Ztrue, axis = 0, keepdims = True)
    centered_recv = Zrecv - np.mean(Zrecv, axis = 0, keepdims = True)
    rmse = np.sqrt(mean_squared_error(centered_true, centered_recv))
    psnr = peak_signal_to_noise_ratio(centered_true, centered_recv)
    return rmse, psnr

# Rebuild the matrices as loadings times transposed factors: once from the
# true L and F, once from the GUIDE estimates, once from the stored result file.
Ztrue = data['Ltrue'] @ data['Ftrue'].T
Zrecv = L_est @ F_est.T
Zrecv_nnm_sparse = nnm_sparse_res['L_est'] @ nnm_sparse_res['F_est'].T

# (RMSE, PSNR) of the GUIDE reconstruction after column-centering both matrices.
Z_rmse(Ztrue, Zrecv)

(0.0065850506314627695, 21.509157973663914)

In [229]:
# (RMSE, PSNR) of the reconstruction from the stored result file, after column-centering both matrices.
Z_rmse(Ztrue, Zrecv_nnm_sparse)

(0.0054476877995496675, 23.15609612992844)

In [230]:
# Same comparison for the GUIDE reconstruction, but after Procrustes alignment instead of plain centering.
matrix_dissimilarity_scores(Ztrue, Zrecv)

(0.0006938352931744471, 28.660892477119774)

In [231]:
# Same comparison for the stored-result reconstruction, after Procrustes alignment instead of plain centering.
matrix_dissimilarity_scores(Ztrue, Zrecv_nnm_sparse)

(0.00040282154290544574, 33.383766450173496)

## Adjusted MI

In [258]:
# Adjusted mutual information between data['Ctrue'] and clusters predicted from the rows of the GUIDE estimate L_est.
adjusted_mutual_information_score(L_est, data['Ctrue'])

0.02014091638148678

In [259]:
# Adjusted mutual information between data['Ctrue'] and clusters predicted from the rows of nnm_sparse_res['L_est'].
adjusted_mutual_information_score(nnm_sparse_res['L_est'], data['Ctrue'])

0.5761518049386407