# Quantum Entropy (QUE) Scoring & Baselines — Tabular Data Notebook

This notebook contains only code copied verbatim from the authors' repository to compute **QUE scores** and baseline outlier detectors. To use it on *any tabular data*, follow these steps:

1. Prepare a tensor `X` (`torch.FloatTensor`) of shape `(n_samples, n_features)` on the appropriate device (`utils.device`).
2. (Optional) Center `X` by subtracting its mean column-wise.
3. Run `compute_tau1_tau0(X, opt)` to obtain QUE (`tau1`) and naive spectral (`tau0`) scores.
4. For baselines, call `l2(X)`, `isolation_forest(X)`, `knn_dist_lof(X)`, `ellenv(X)`.

**Note:** All functions below are pasted verbatim from the authors' `mean.py`, `utils.py`, and `baselines.py`. No new algorithmic code has been added.

In [None]:
# Imports (copied from authors' scripts)
import matplotlib
import torch
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.covariance
import sklearn.cluster
import random
import utils
import pdb
device = utils.device  # from utils.py

In [None]:
# Utilities (copied verbatim from utils.py)
# Device string shared by the helpers below: 'cuda' when a CUDA device is
# available to torch, otherwise 'cpu'.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def cov(X):
    """Return the biased covariance matrix of the rows of ``X``.

    ``X`` is a 2-D tensor with one sample per row.  The column means are
    subtracted first, so the input does not need to be centered.  The result
    is ``X_c^T X_c / n`` where ``n = X.size(0)`` (division by ``n``, not
    ``n - 1``), with shape ``(X.size(1), X.size(1))``.
    """
    centered = X - X.mean(dim=0, keepdim=True)
    n_samples = centered.size(0)
    return torch.mm(centered.t(), centered) / n_samples
    
########################


def dominant_eval_cov(X, n_round=5):
    """Estimate the dominant eigenpair of the covariance of ``X``.

    Runs power iteration on ``X_c^T X_c / n`` (``X_c`` is ``X`` with its column
    means removed, ``n = X.size(0)``) without ever forming that matrix.

    The iterate is re-normalised after every round.  Normalising only once at
    the end lets its magnitude grow like ``eigenvalue ** n_round``, which
    overflows float32 to inf/NaN for data with large variance.

    Args:
        X: 2-D tensor with one sample per row.
        n_round: number of power-iteration rounds (5 by default).

    Returns:
        ``(mu, v)`` where ``mu`` is the Rayleigh-quotient estimate of the top
        eigenvalue as a Python float and ``v`` is the unit-norm direction as a
        1-D tensor of length ``X.size(-1)``.  The start vector is random, so
        the sign of ``v`` (and low-order digits of ``mu``) vary between calls.
    """
    n_data = X.size(0)
    X = X - X.mean(dim=0, keepdim=True)
    X_t_scaled = X.t() / n_data

    # Random start vector on the same device/dtype as the data.
    v = torch.randn(X.size(-1), 1, dtype=X.dtype, device=X.device)
    v = v / (v**2).sum().sqrt()
    for _ in range(n_round):
        # One multiplication by the covariance, done as X^T (X v) / n.
        v = torch.mm(X_t_scaled, torch.mm(X, v))
        # Scale each round instead of only at the end to avoid overflow.
        v = v / (v**2).sum().sqrt()

    # Rayleigh quotient v^T C v / v^T v.
    mu = torch.mm(v.t(), torch.mm(X_t_scaled, torch.mm(X, v))) / (v**2).sum()

    return mu.item(), v.view(-1)
'''
Dominant eigenvalue of matrix X.
Returns: the top eigenvalue and its eigenvector.
'''

def pad_to_2power(X):
    """Zero-pad the last dimension of ``X`` up to the next power of two.

    Args:
        X: 2-D tensor of shape ``(n_data, feat_dim)``.

    Returns:
        ``X`` itself when ``feat_dim`` is already a power of two; otherwise a
        new tensor with zero columns appended on the right, on the same device
        and with the same dtype as ``X``.

    Raises:
        ValueError: if ``X`` has no feature columns.
    """
    n_data, feat_dim = X.size(0), X.size(-1)
    if feat_dim < 1:
        raise ValueError('pad_to_2power requires at least one feature column')

    # Smallest power of two >= feat_dim, computed with exact integer
    # arithmetic (a float log2 can round up by one for large exact powers).
    target_dim = 1 << (feat_dim - 1).bit_length()
    power_diff = target_dim - feat_dim
    if power_diff == 0:
        return X

    padding = torch.zeros(n_data, power_diff, dtype=X.dtype, device=X.device)
    return torch.cat((X, padding), dim=-1)

'''
Find dominant eval of XX^t (and evec in the process) using the power method.
Without explicitly forming XX^t
Returns:
-dominant eval + corresponding eigenvector
'''

def dist_rank(data_x, k, data_y=None, largest=False, opt=None, include_self=False):
    """Rank the rows of ``data_y`` by their score against each row of ``data_x``.

    The score is the squared Euclidean distance
    ``|x|^2 + |y|^2 - 2 x.y`` unless ``opt.normalize_data`` is set, in which
    case it is the negative inner product ``-x.y``.  Scores are computed in
    row blocks of ``data_x`` to bound memory use.

    Args:
        data_x: 2-D tensor or NumPy array of query points, one per row.
        k: number of neighbours requested.  Reduced to ``len(data_y) - 1``
            when ``data_y`` has fewer than ``k + 1`` rows.
        data_y: 2-D tensor or NumPy array of candidates; defaults to
            ``data_x``.
        largest: forwarded to ``torch.topk``.  Forced to True whenever
            ``opt.normalize_data`` is set.
        opt: optional options object; only ``normalize_data`` and ``sift``
            are read.
        include_self: when False, the first-ranked column is dropped from both
            outputs (presumably the query itself when ``data_y`` is
            ``data_x`` -- not guaranteed with duplicate points).

    Returns:
        ``(act_dist, topk)``: the selected scores (float32, on the module
        ``device``) and the matching row indices into ``data_y`` (int64,
        moved back to the device ``data_x`` arrived on).
    """
    if isinstance(data_x, np.ndarray):
        data_x = torch.from_numpy(data_x)

    if data_y is None:
        data_y = data_x
    elif isinstance(data_y, np.ndarray):
        data_y = torch.from_numpy(data_y)

    k0 = k
    device_o = data_x.device
    data_x = data_x.to(device)
    data_y = data_y.to(device)

    # Unpacking also enforces that data_x is 2-D.
    data_x_len, _ = data_x.size()
    data_y_len = data_y.size(0)

    # Rows of data_x processed per block; smaller blocks for very large
    # candidate sets to avoid running out of memory.
    chunk_sz = 600 if data_y_len > 990000 else 3000

    if k + 1 > len(data_y):
        k = len(data_y) - 1

    # Output buffers on the compute device.  torch.empty works for both CPU
    # and CUDA (the CPU path previously allocated a CUDA tensor and crashed
    # on machines without a GPU).
    dist_mx = torch.empty(data_x_len, k + 1, dtype=torch.long, device=device)
    act_dist = torch.empty(data_x_len, k + 1, dtype=torch.float32, device=device)

    data_normalized = bool(opt is not None and opt.normalize_data)
    # NOTE(review): with normalized data the score is -x.y and `largest` is
    # forced to True, i.e. the smallest inner products are selected -- confirm
    # this is the intended direction.
    largest = bool(largest) or data_normalized

    y_t = data_y.t()
    if not data_normalized:
        y_norm = (data_y**2).sum(-1).view(1, -1)

    # Compute scores block by block to stay memory efficient.
    for base in range(0, data_x_len, chunk_sz):
        upto = min(base + chunk_sz, data_x_len)
        x = data_x[base:upto]

        if not data_normalized:
            x_norm = (x**2).sum(-1).view(-1, 1)
            # The addition broadcasts to a (block, data_y_len) matrix.
            dist = x_norm + y_norm
            dist -= 2*torch.mm(x, y_t)
        else:
            dist = -torch.mm(x, y_t)

        topk_d, topk = torch.topk(dist, k=k+1, dim=1, largest=largest)

        dist_mx[base:upto, :k+1] = topk
        act_dist[base:upto, :k+1] = topk_d

    topk = dist_mx
    if k > 3 and opt is not None and opt.sift:
        # sift contains duplicate points, so a point's own index may appear in
        # columns 1..3 instead of column 0.  Overwrite such entries with the
        # column-0 index.  Don't run this in general.
        identity_ranks = torch.arange(len(topk), device=topk.device)
        first_col = topk[:, 0]
        for col in (1, 2, 3):
            is_self = topk[:, col] == identity_ranks
            topk[is_self, col] = first_col[is_self]

    if not include_self:
        topk = topk[:, 1:]
        act_dist = act_dist[:, 1:]
    elif topk.size(-1) > k0:
        # NOTE(review): only the indices are trimmed to k0 columns here;
        # act_dist keeps its extra column.
        topk = topk[:, :-1]

    # NOTE(review): only the indices are moved back to the caller's device;
    # act_dist stays on the module-level compute device.
    topk = topk.to(device_o)
    return act_dist, topk

class tokenizer:
    """
    Rudimentary tokenizer for when allennlp is unavailable.

    Sentences are split on whitespace and on the punctuation characters
    listed in the pattern below; empty strings between adjacent separators
    are kept in the output.
    """
    def __init__(self):
        import re
        # Raw string so that `\s` reaches the regex engine instead of being an
        # invalid Python escape sequence.  The pattern text is unchanged.
        # NOTE(review): `+-=` inside the class is a character *range*, so the
        # pattern also splits on digits, '/', '<' and the other characters
        # between '+' and '='.  Left as-is to preserve tokenization.
        self.patt = re.compile(r"""[ ;,.?!`'":|\s~%&*()#$@+-=]""")

    def batch_tokenize(self, sent_l):
        """Split every sentence in ``sent_l``; returns one token list per sentence."""
        return [self.patt.split(sent) for sent in sent_l]
    
class stop_word_filter:
    """Drops tokens that appear in the module-level ``STOP_WORDS`` set."""

    def filter_words(self, tok_l):
        """
        Input: tok_l: list of tokens
        Returns: new list with the tokens of tok_l that are not stop words,
        in their original order.
        """
        return [tok for tok in tok_l if tok not in STOP_WORDS]

## The stop-word list below is due to the authors of spaCy. It is reproduced  ##
## here because some users have reported difficulties installing the         ##
## language packages required for processing text.                          ##

# Stop words: one whitespace-separated token per entry.
STOP_WORDS = set(
    """
a about above across after afterwards again against all almost alone along
already also although always am among amongst amount an and another any anyhow
anyone anything anyway anywhere are around as at
back be became because become becomes becoming been before beforehand behind
being below beside besides between beyond both bottom but by
call can cannot ca could
did do does doing done down due during
each eight either eleven else elsewhere empty enough even ever every
everyone everything everywhere except
few fifteen fifty first five for former formerly forty four from front full
further
get give go
had has have he hence her here hereafter hereby herein hereupon hers herself
him himself his how however hundred
i if in indeed into is it its itself
keep
last latter latterly least less
just
made make many may me meanwhile might mine more moreover most mostly move much
must my myself
name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere
of off often on once one only onto or other others otherwise our ours ourselves
out over own
part per perhaps please put
quite
rather re really regarding
same say see seem seemed seeming seems serious several she should show side
since six sixty so some somehow someone something sometime sometimes somewhere
still such
take ten than that the their them themselves then thence there thereafter
thereby therefore therein thereupon these they third this those though three
through throughout thru thus to together too top toward towards twelve twenty
two
under until up unless upon us used using
various very very via was we well were what whatever when whence whenever where
whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves
""".split()
)

# Contraction suffixes, registered with a plain ASCII apostrophe ...
contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
STOP_WORDS.update(contractions)

# ... and again with each typographic (curly) apostrophe variant.
for apostrophe in ["‘", "’"]:
    STOP_WORDS.update(
        stopword.replace("'", apostrophe) for stopword in contractions
    )

def inner_mx(mx1, mx2):
    """Return the matrix of inner products between rows of ``mx1`` and ``mx2``.

    Both arguments are 2-D tensors with the same number of columns.  Entry
    ``(i, j)`` of the result is the dot product of row ``i`` of ``mx1`` with
    row ``j`` of ``mx2``, so the shape is ``(mx1.size(0), mx2.size(0))``.
    """
    # torch.mm takes the two factors as separate arguments; the previous
    # single-argument call with an elementwise product always raised.
    return torch.mm(mx1, mx2.t())


'''
Input: lines is list of objects, not newline-terminated yet. 
'''

In [None]:
# QUE core (copied verbatim from mean.py)
def compute_m(X, lamb, noise_vecs=None):
    """Build the trace-normalised matrix exponential of ``lamb * cov(X)``.

    The scaled covariance is decomposed as ``U diag(D) V^T``; the result is
    ``U diag(exp(D)) U^T`` divided by its trace.

    Args:
        X: 2-D tensor of samples (one per row); ``cov`` centers it.
        lamb: scalar multiplier applied to the covariance before the
            exponential.
        noise_vecs: optional tensor of reference directions.  When given,
            their inner products with the singular vectors are printed
            (diagnostic only; the return value is unaffected).

    Returns:
        float32 tensor on the module-level ``device`` with unit trace.

    Raises:
        AssertionError: if the exponential overflowed to infinity.
    """
    X_cov = lamb * cov(X)

    # The decomposition is done in NumPy rather than torch: torch's svd was
    # observed to return U and V that differ beyond sign/permutation.
    # np.linalg is used directly because this file has no `linalg` alias.
    U, D, _ = np.linalg.svd(X_cov.cpu().numpy())
    U = torch.from_numpy(U.astype('float64')).to(device)
    # Exponentiate in float64 (NumPy) before converting back to a tensor.
    D_exp = torch.from_numpy(np.exp(D.astype('float64'))).to(device).diag()

    # Projection of the noise directions onto the singular vectors.
    if noise_vecs is not None:
        n_noise = noise_vecs.size(0)
        print(utils.inner_mx(noise_vecs, U)[:, :int(1.5*n_noise)])

    m = torch.mm(torch.mm(U, D_exp), U.t())

    assert m.max().item() < float('Inf')
    # Normalise to unit trace.
    m = m / m.diag().sum()

    return m.to(torch.float32)


def top_dir(X, opt, noise_vecs=None):
    """Return the top ``opt.n_top_dir`` singular directions of centered ``X``.

    Args:
        X: 2-D tensor of shape (n_sample, n_feat); centered internally.
        opt: options object; only ``n_top_dir`` is read.
        noise_vecs: optional tensor of reference directions.  When given, a
            second truncated SVD with ``noise_vecs.size(0)`` components is
            fitted and its inner products with ``noise_vecs`` are printed
            (diagnostic only; the return value is unaffected).

    Returns:
        Tensor of shape ``(opt.n_top_dir, n_feat)`` on the module-level
        ``device``, taken from ``TruncatedSVD.components_``.
    """
    # Imported here because the file's import block has no `decom` alias.
    import sklearn.decomposition as decom

    X = X - X.mean(dim=0, keepdim=True)
    # scikit-learn works on NumPy arrays.
    X_np = X.cpu().numpy()

    sv = decom.TruncatedSVD(opt.n_top_dir)
    sv.fit(X_np)
    u = sv.components_

    if noise_vecs is not None:
        print('inner of noise with top cov dirs')
        n_noise = noise_vecs.size(0)
        sv1 = decom.TruncatedSVD(n_noise)
        sv1.fit(X_np)
        u1 = torch.from_numpy(sv1.components_).to(device)
        print(utils.inner_mx(noise_vecs, u1)[:, :int(1.5*n_noise)])

    return torch.from_numpy(u).to(device)
    
'''
Input:
-X: shape (n_sample, n_feat)
'''

def compute_tau1(X, select_idx, opt, noise_vecs):
    """Score the selected rows of ``X`` with the quadratic form ``x^T M x``.

    Args:
        X: 2-D tensor of shape (n_sample, n_feat).
        select_idx: 1-D index tensor choosing the rows of ``X`` to score.
        opt: options object; only ``lamb`` is read here.
        noise_vecs: forwarded to ``compute_m`` (diagnostics only).

    Returns:
        1-D tensor with one score per selected row, where ``x`` is the row
        after subtracting the mean of the selected rows and ``M`` is the
        matrix returned by ``compute_m`` for those rows.
    """
    subset = torch.index_select(X, dim=0, index=select_idx)
    # Center on the subset mean (the input is expected to be centered already).
    centered = subset - subset.mean(0, keepdim=True)
    weight = compute_m(subset, opt.lamb, noise_vecs)
    # weight should be symmetric, so no transpose is needed here.
    projected = torch.mm(centered, weight)
    # Row-wise dot product gives x^T M x for every sample.
    return (centered * projected).sum(-1)

'''
Input: already centered
'''

def compute_tau0(X, select_idx, opt, noise_vecs=None):
    """Score the selected rows of ``X`` by squared projection on the top directions.

    Args:
        X: 2-D tensor of shape (n_sample, n_feat); expected to be centered
            already.
        select_idx: 1-D index tensor choosing the rows of ``X`` to score.
        opt: options object forwarded to ``top_dir``.
        noise_vecs: forwarded to ``top_dir`` (diagnostics only).

    Returns:
        1-D tensor with one score per selected row.
    """
    X = torch.index_select(X, dim=0, index=select_idx)
    cov_dir = top_dir(X, opt, noise_vecs)
    # There can be more than one top direction; they are summed into one.
    cov_dir = cov_dir.sum(dim=0, keepdim=True)
    # Drop only the leading size-1 axis so a single selected row still yields
    # a 1-D tensor (a bare squeeze() would collapse it to 0-dim).
    tau0 = (torch.mm(cov_dir, X.t())**2).squeeze(0)
    return tau0

'''
compute tau2, v^tM^{-1}v
'''

def compute_tau1_tau0(X, opt):
    """Run iterative filtering with the QUE score (tau1) and the spectral score (tau0).

    For each scoring method, ``opt.n_iter`` rounds are run; every round scores
    the currently kept rows and keeps the ``1 - opt.remove_p`` fraction with
    the lowest scores.

    Side effect: ``opt.lamb`` is overwritten with
    ``opt.lamb_multiplier / dominant_eigenvalue`` of the covariance of ``X``.

    Args:
        X: 2-D tensor of shape (n_sample, n_feat).
        opt: options object; reads ``lamb_multiplier``, ``n_iter``,
            ``remove_p`` and ``fast_jl`` (plus whatever the scoring functions
            read).

    Returns:
        ``(tau1, select_idx1, n_removed1, tau0, select_idx0, n_removed0)``.
        ``tau*`` are the scores from the *last* round (one per row that was
        still kept when that round started), ``select_idx*`` are the indices
        into ``X`` kept after the last round, and ``n_removed*`` the total
        number of rows removed.

    Raises:
        ValueError: if ``opt.n_iter`` is smaller than 1 (no scores would be
            produced).
    """
    if opt.n_iter < 1:
        raise ValueError('opt.n_iter must be at least 1')

    # Set lamb dynamically from the dominant eigenvalue of the covariance.
    dom_eval, _ = utils.dominant_eval_cov(X)
    opt.lamb = 1./dom_eval * opt.lamb_multiplier

    # noise_vecs can be used for visualization; disabled here.
    noise_vecs = None

    def get_select_idx(tau_method):
        # Start from every row; build the index on the same device as X so
        # that index_select works wherever X lives.
        select_idx = torch.arange(X.size(0), dtype=torch.long, device=X.device)
        n_removed = 0
        for _ in range(opt.n_iter):
            tau = tau_method(X, select_idx, opt, noise_vecs)
            # Positions (within the current selection) of the rows to keep.
            n_keep = int(tau.size(0)*(1-opt.remove_p))
            cur_select_idx = torch.topk(tau, k=n_keep, largest=False)[1]
            n_removed += (select_idx.size(0) - cur_select_idx.size(0))
            # Translate positions back to indices into X.
            select_idx = torch.index_select(select_idx, index=cur_select_idx, dim=0)
        return select_idx, n_removed, tau

    # compute_tau1_fast is only looked up when opt.fast_jl is set; it is not
    # defined alongside the other scoring functions here.
    tau1_method = compute_tau1_fast if opt.fast_jl else compute_tau1
    select_idx1, n_removed1, tau1 = get_select_idx(tau1_method)

    select_idx0, n_removed0, tau0 = get_select_idx(compute_tau0)

    return tau1, select_idx1, n_removed1, tau0, select_idx0, n_removed0


In [None]:
# Baseline methods (copied verbatim from baselines.py)
def l2(X):
    """Return each row's squared Euclidean distance from the column-wise mean.

    ``X`` is a tensor with one sample per row; the result has one score per
    row (the last dimension is summed out).
    """
    deviation = X - X.mean(0)
    squared = deviation**2
    return squared.sum(-1)
    
'''
Input:
-X, Y: 2D tensors
'''

def isolation_forest(X):
    """Score the rows of ``X`` with a scikit-learn Isolation Forest.

    Args:
        X: tensor of samples, one per row.

    Returns:
        Tensor on ``utils.device`` holding the negated
        ``decision_function`` values, so higher means more outlying.
    """
    X = X.cpu().numpy()
    try:
        # Older scikit-learn releases need behaviour='new' for
        # contamination='auto'.
        model = sklearn.ensemble.IsolationForest(contamination='auto', behaviour='new')
    except TypeError:
        # Newer releases removed the `behaviour` argument; the 'new'
        # behaviour is the only one left.
        model = sklearn.ensemble.IsolationForest(contamination='auto')
    model.fit(X)
    scores = -model.decision_function(X)

    return torch.from_numpy(scores).to(utils.device)

'''
Elliptic envelope
Returns: The higher the score, the more likely to be outlier.
'''

def knn_dist_lof(X, k=10):
    """Score each row of ``X`` by its mean reachability distance to its k neighbours.

    For a point ``p`` and each neighbour ``o`` returned by
    ``utils.dist_rank``, the reachability distance is
    ``max(k-th neighbour distance of o, dist(p, o))``.  The score is the mean
    of these over the neighbours, clamped below at 1e-4 (the inverse of a
    local reachability density).

    Args:
        X: 2-D tensor of samples, one per row.
        k: number of neighbours.

    Returns:
        1-D tensor with one score per row; larger means sparser surroundings.
        Distances are whatever ``utils.dist_rank`` reports (presumably
        squared Euclidean -- verify against that helper).
    """
    min_dist, min_idx = utils.dist_rank(X, k=k, largest=False)
    # The helper may hand back indices on a different device than the
    # distances; indexing needs both on the same one.
    min_idx = min_idx.to(min_dist.device)

    # Distance from every point to its own k-th neighbour ...
    kth_dist = min_dist[:, -1]
    # ... looked up for each neighbour o of each point p: shape (n, k).
    neighbor_kth = kth_dist[min_idx]

    # Reachability distance: max(kth_dist(o), dist(p, o)).
    reach_dist = torch.where(neighbor_kth > min_dist, neighbor_kth, min_dist)

    # Inverse of the lrd scores.
    return reach_dist.mean(-1).clamp(min=0.0001)

'''
LoOP: kNN based method using quadratic mean distance to estimate density.
LoOP (Local Outlier Probabilities) (Kriegel et al. 2009a)
'''

def ellenv(X, contamination=0.2):
    """Score the rows of ``X`` with a scikit-learn Elliptic Envelope.

    Args:
        X: tensor of samples, one per row.
        contamination: forwarded to ``EllipticEnvelope``; defaults to the
            previously hard-coded 0.2.

    Returns:
        Tensor on ``utils.device`` holding the negated
        ``decision_function`` values.  The higher the score, the more likely
        the row is an outlier.
    """
    X = X.cpu().numpy()
    model = sklearn.covariance.EllipticEnvelope(contamination=contamination)
    model.fit(X)
    scores = -model.decision_function(X)

    return torch.from_numpy(scores).to(utils.device)

'''
Local outlier factor.

'''

## Usage

Define your tabular data matrix `X` as a `torch.FloatTensor` on `utils.device`.

Example (you provide X):
```python
# X = your torch.FloatTensor of shape (n_samples, n_features)
# X = X - X.mean(0)  # optional centering
```
Then obtain QUE and spectral scores:
```python
opt = utils.parse_args()  # default args
opt.remove_p = 0.1
opt.lamb_multiplier = 4
opt.n_top_dir = 1
opt.n_iter = 1
opt.fast_jl = False
tau1, select_idx1, n_removed1, tau0, select_idx0, n_removed0 = compute_tau1_tau0(X, opt)
```
And run baselines (examples):
```python
tau_l2 = l2(X)
tau_if = isolation_forest(X)
tau_lof = knn_dist_lof(X)
tau_ell = ellenv(X)
```