In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix, issparse
from scipy.sparse.linalg import norm as spnorm
from ntf_cython.nmf import NMF as NMF_BPP
from ntf_cython.nmf import bpp
from ntf_cython.random import algo42
import scipy.optimize as optimize
from sklearn.decomposition.nmf import _initialize_nmf
import pickle
import sklearn
from scipy import sparse
from savvy_factorize.compression import algo41, algo43, algo45, algo46, structured_compression, count_gauss
from savvy_factorize.selection import xray, SPA
from ntf_cython.random import rel_error

%matplotlib inline

In [4]:
%%time
df = pd.read_csv('/Users/user/Documents/Medicare_Provider_Util_Payment_PUF_CY2013/Medicare_Provider_Util_Payment_PUF_CY2013.txt', sep= '\t')



CPU times: user 51.6 s, sys: 11.9 s, total: 1min 3s
Wall time: 1min 5s


In [5]:
#Clean up first row of the data
df = df[1:]
np.shape(df)

(9287876, 28)

In [6]:
%%time
row_name = 'NPI'
col_name = 'HCPCS_CODE'
measure = 'AVERAGE_SUBMITTED_CHRG_AMT'
# filters: averages are calculated by NPI, HCPCS_CODE and PLACE_OF_SERVICE
# so need to fix PLACE_OF_SERVICE to not be forced to average averages.
# The rest of the filters are up to you.
filters = (df.PLACE_OF_SERVICE == 'F')
filters &= (df.NPPES_ENTITY_CODE == 'O')
filters &= (df.NPPES_PROVIDER_COUNTRY == 'US')
# data = df[filters][[row_name, col_name, measure]].dropna().set_index([row_name, col_name])
data = df[filters].dropna(subset=[measure]).groupby([row_name, col_name])[measure].max()
idx_row = data.index.names.index(row_name)
idx_col = data.index.names.index(col_name)
M = coo_matrix((data.values,
                (data.index.labels[idx_row], data.index.labels[idx_col])),
               shape=[len(data.index.levels[idx_row]), len(data.index.levels[idx_col])]).astype(float)
print(M.shape)

(14822, 1376)
CPU times: user 2.31 s, sys: 780 ms, total: 3.09 s
Wall time: 3.09 s


# SEPARABLE NMF

In [7]:
def sep_nmf(A, q=1, r=50, max_iter = 1, eps = 0.01, oversampling = 10, 
                     oversampling_factor = 10, algo='algo42', run_iter=5):
    

    #H = np.abs(np.random.randn(r, A.shape[1]))
    LA = compression_algo(A, r=r, algo=algo)[0]

    A_ = LA.T.dot(A)

    #check for relative error for the left projected matrix
    QA, RA = np.linalg.qr(A_)
    QRA = LA.dot(QA).dot(RA)
    print(rel_error(A, QRA))

    #print(A.shape)
    cols = xray(A_, r)
    #print(A[:,cols].shape)
    H = bpp(A_[:,cols], A_)
    #print(H.shape)
    W = A[:,cols]
    
    print("Here's the relative error")
    print(rel_error(A, W.dot(H)))
        
    return W,H

In [6]:
%%time
W, H = sep_nmf(np.array(M.todense()))

1.00336403976e-14
Here's the relative error
0.797524013023
CPU times: user 3min 13s, sys: 48.9 s, total: 4min 2s
Wall time: 2min 22s


# RANDOMIZED PROJECTED BPP

In [8]:
import numpy as np
from ntf_cython.random import algo42, algo44
from savvy_factorize.compression import algo41, algo43, algo45
from savvy_factorize.compression import algo46, structured_compression
from savvy_factorize.compression import count_gauss
from savvy_factorize.selection import xray, SPA
from ntf_cython.nmf import bpp
from numpy.linalg import solve


def compression_left(A, algo=None, q=1, r=100, eps=0.01, oversampling=10,
                     oversampling_factor=10):
    """
    Compute the projection matrix with orthonormal columns

    Parameters
    ----------
    A: numpy.array
       Input data
    q: integer, default: 1
       Exponent used in algo43 and algo44
    r: integer, default: 100
       Target rank
    eps: double, default:0.01
       Tolerance value used in algo42
    oversampling: integer, default: 10
       A parameter granting more freedom in the choice of Q
       used in structured_compression algorithm
    oversampling_factor: integer, default:10
       A parameter used in count_gauss compression algorithm
    algo: compression algorithm used
    """
    if algo == 'algo41':
        L = algo41(A, r)
    elif algo == 'algo42':
        L = algo42(A, eps, r)
    elif algo == 'algo43':
        L = algo43(A, q, r)
    elif algo == 'algo44':
        L = algo44(A, q, r)
    elif algo == 'algo45':
        L = algo45(A, r)
    elif algo == 'algo46':
        L = algo46(A, r)
    elif algo == 'structured_compression':
        L = structured_compression(A, q, r, oversampling)
    elif algo == 'count_gaussian':
        L, Z = count_gauss(A, r, oversampling_factor)
    else:
        L, _ = np.linalg.qr(A)

    return L


def compression_right(A, algo=None, q=1, r=100, eps=0.01, oversampling=10,
                      oversampling_factor=10):
    """
    Compute the projection matrix with orthonormal columns

    Parameters
    ----------
    A: numpy.array
       Input data
    q: integer, default: 1
       Exponent used in algo43 and algo44
    r: integer, default: 100
       Target rank
    eps: double, default:0.01
       Tolerance value used in algo42
    oversampling: integer, default: 10
       A parameter granting more freedom in the choice of Q
       used in structured_compression algorithm
    oversampling_factor: integer, default:10
       A parameter used in count_gauss compression algorithm
    algo: compression algorithm used
    """
    if algo == 'algo41':
        R = algo41(A.T, r).T
    elif algo == 'algo42':
        R = algo42(A.T, eps, r).T
    elif algo == 'algo43':
        R = algo43(A.T, q, r).T
    elif algo == 'algo44':
        R = algo44(A.T, q, r).T
    elif algo == 'algo45':
        R = algo45(A.T, r).T
    elif algo == 'algo46':
        R = algo46(A.T, r).T
    elif algo == 'structured_compression':
        R = structured_compression(A.T, q, r, oversampling).T
    elif algo == 'count_gaussian':
        R, Zt = count_gauss(A, r, oversampling_factor).T
    else:
        R, _ = np.linalg.qr(A.T)

    return R


In [11]:
def random_projected_sepnmf(A, q=1, r=50, max_iter = 50, eps = 0.01, oversampling = 10, 
                     oversampling_factor = 10, algo='algo42'):
    
    #H = np.abs(np.random.randn(r, A.shape[1]))
    L = compression_left(A, r=r, algo=algo)
    R = compression_right(A, r=r, algo=algo)
    A_ = L.T.dot(A)


    #print(A.shape)
    cols = xray(A_, r)
    #print(A[:,cols].shape)
    H = bpp(A_[:,cols], A_)
    #print(H.shape)
    W = A[:,cols]

    for _ in range(max_iter):
        H = bpp(LA.T.dot(W), LA.T.dot(A), H>0)
        W = bpp(RA2.dot(H.T), RA2.dot(A.T), W.T>0).T
        print("Here's the relative error")
        print(rel_error(A, W.dot(H)))
        
    return W,H

In [14]:
%%time
W, H = random_projected_sepnmf(np.array(M.todense()),algo='algo41')



NameError: name 'LA' is not defined

In [29]:
def random_projected_bppnmf(A, q=1, r=50, max_iter = 50, eps = 0.01, oversampling = 10, 
                     oversampling_factor = 10, algo='algo42'):
    
    #H = np.abs(np.random.randn(r, A.shape[1]))
    LA, RA2 = compression_algo(A, r=r, algo=algo)

    W = np.random.rand(A.shape[0], r)
    H = bpp(W, A)

    for _ in range(max_iter):
        H = bpp(LA.T.dot(W), LA.T.dot(A), H>0)
        W = bpp(RA2.dot(H.T), RA2.dot(A.T), W.T>0).T
        print("Here's the relative error")
        print(rel_error(A, W.dot(H)))
        
    return W,H

In [30]:
%%time
W, H = random_projected_bppnmf(np.array(M.todense()))

Here's the relative error
0.691565351465
Here's the relative error
0.526991553701
Here's the relative error
0.483356052582
Here's the relative error
0.465404512195
Here's the relative error
0.453627406344
Here's the relative error
0.448086638416
Here's the relative error
0.445599015797
Here's the relative error
0.44452733471
Here's the relative error
0.443771296576
Here's the relative error
0.442974199254
Here's the relative error
0.44225052699
Here's the relative error
0.44179146558
Here's the relative error
0.441533151703
Here's the relative error
0.441371781349
Here's the relative error
0.44126032887
Here's the relative error
0.441180026421




Here's the relative error
0.441121526309
Here's the relative error
0.441078188342
Here's the relative error
0.441045260454
Here's the relative error
0.441019085815
Here's the relative error
0.440997431285
Here's the relative error
0.440978682224
Here's the relative error
0.440963104483
Here's the relative error
0.440949629665
Here's the relative error
0.440937466448
Here's the relative error
0.440926322533
Here's the relative error
0.44091581441
Here's the relative error
0.440905612387
Here's the relative error
0.44089547133
Here's the relative error
0.440885189455
Here's the relative error
0.440874550331
Here's the relative error
0.440863339313
Here's the relative error
0.440851234712
Here's the relative error
0.440837942175
Here's the relative error
0.440823158615
Here's the relative error
0.440806197567
Here's the relative error
0.440786515221
Here's the relative error
0.440763969835
Here's the relative error
0.440738760273
Here's the relative error
0.440711117315
Here's the relativ