In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix, issparse
from scipy.sparse.linalg import norm as spnorm
import pickle
import sklearn
from scipy import sparse
from ntf_cython.random import algo42, algo44, admm, rel_error
from savvy_factorize.compression import algo41, algo43, algo45, algo46, structured_compression, count_gauss
from sklearn.decomposition.nmf import _initialize_nmf
from numpy.linalg import norm, solve

%matplotlib inline

In [2]:
%%time
df = pd.read_csv('/Users/user/Documents/Medicare_Provider_Util_Payment_PUF_CY2013/Medicare_Provider_Util_Payment_PUF_CY2013.txt', sep= '\t')



CPU times: user 53.2 s, sys: 14.5 s, total: 1min 7s
Wall time: 1min 33s


In [3]:
#Clean up first row of the data
df = df[1:]
np.shape(df)

(9287876, 28)

In [4]:
%%time
row_name = 'NPI'
col_name = 'HCPCS_CODE'
measure = 'AVERAGE_SUBMITTED_CHRG_AMT'
# filters: averages are calculated by NPI, HCPCS_CODE and PLACE_OF_SERVICE
# so need to fix PLACE_OF_SERVICE to not be forced to average averages.
# The rest of the filters are up to you.
filters = (df.PLACE_OF_SERVICE == 'F')
filters &= (df.NPPES_ENTITY_CODE == 'O')
filters &= (df.NPPES_PROVIDER_COUNTRY == 'US')
# data = df[filters][[row_name, col_name, measure]].dropna().set_index([row_name, col_name])
data = df[filters].dropna(subset=[measure]).groupby([row_name, col_name])[measure].max()
idx_row = data.index.names.index(row_name)
idx_col = data.index.names.index(col_name)
M = coo_matrix((data.values,
                (data.index.labels[idx_row], data.index.labels[idx_col])),
               shape=[len(data.index.levels[idx_row]), len(data.index.levels[idx_col])]).astype(float)
print(M.shape)

(14822, 1376)
CPU times: user 1.47 s, sys: 1.1 s, total: 2.56 s
Wall time: 3.03 s


In [30]:
m = 10000
n = 500
np.random.seed(1)
A = np.abs(np.random.randn(m, n))
r = 400

In [71]:
import numpy as np
from ntf_cython.random import algo42, algo44, rel_error
from savvy_factorize.compression import algo41, algo43, algo45
from savvy_factorize.compression import algo46, structured_compression
from savvy_factorize.compression import count_gauss
from savvy_factorize.selection import xray, SPA
from ntf_cython.nmf import bpp
from numpy.linalg import solve


def compression_left(A, q=1, r=100, eps=0.01, oversampling=10,
                     oversampling_factor=10, algo='algo44'):
    """
    Compute the projection matrix with orthonormal columns
    Parameters
    ----------
    A: numpy.array
       Input data
    q: integer, default: 1
       Exponent used in algo43 and algo44
    r: integer, default: 100
       Target rank
    eps: double, default:0.01
       Tolerance value used in algo42
    oversampling: integer, default: 10
       A parameter granting more freedom in the choice of Q
       used in structured_compression algorithm
    oversampling_factor: integer, default:10
       A parameter used in count_gauss compression algorithm
    algo: compression algorithm used
    """
    if algo == 'algo41':
        L = algo41(A, r)
    elif algo == 'algo42':
        L = algo42(A, eps, r)
    elif algo == 'algo43':
        L = algo43(A, q, r)
    elif algo == 'algo44':
        L = algo44(A, q, r)
    elif algo == 'algo45':
        L = algo45(A, r)
    elif algo == 'algo46':
        L = algo46(A, r)
    elif algo == 'structured_compression':
        L = structured_compression(A, q, r, oversampling)
    elif algo == 'count_gaussian':
        L, Z = count_gauss(A, r, oversampling_factor)
    else:
        L, _ = np.linalg.qr(A)

    return L


def compression_right(A, q=1, r=100, eps=0.01, oversampling=10,
                      oversampling_factor=10, algo='algo44'):
    """
    Compute the projection matrix with orthonormal columns
    Parameters
    ----------
    A: numpy.array
       Input data
    q: integer, default: 1
       Exponent used in algo43 and algo44
    r: integer, default: 100
       Target rank
    eps: double, default:0.01
       Tolerance value used in algo42
    oversampling: integer, default: 10
       A parameter granting more freedom in the choice of Q
       used in structured_compression algorithm
    oversampling_factor: integer, default:10
       A parameter used in count_gauss compression algorithm
    algo: compression algorithm used
    """
    if algo == 'algo41':
        R = algo41(A.T, r).T
    elif algo == 'algo42':
        R = algo42(A.T, eps, r).T
    elif algo == 'algo43':
        R = algo43(A.T, q, r).T
    elif algo == 'algo44':
        R = algo44(A.T, q, r).T
    elif algo == 'algo45':
        R = algo45(A.T, r).T
    elif algo == 'algo46':
        R = algo46(A.T, r).T
    elif algo == 'structured_compression':
        R = structured_compression(A.T, q, r, oversampling).T
    elif algo == 'count_gaussian':
        R, Zt = count_gauss(A, r, oversampling_factor).T
    else:
        R, _ = np.linalg.qr(A.T)

    return R


def generalized_admm_1(A, algo=None, q=1, r=100, max_iter=1000,
                       eps=0.01, oversampling=10, oversampling_factor=20,
                       lam=1., phi=1., c=1., limit=7, random_state=None,
                       tol=0.7):
    """
    NMF using structured random compression via alternating direction method
    of multipliers(ADMM)
    Parameters
    ----------
    A: numpy.array
        Input data
    q: integer, default: 1
        Exponent used in algo43 and algo44
    r: integer, default: 1
        Target rank
    max_iter: integer, default:1000
        Maximum number of iterations for ADMM
    eps: double, default:0.01
        Tolerance value used in algo42
    oversampling: integer, default:10
        A parameter granting more freedom in the choice of Q used in
        structured_compression algorithm
    oversampling_factor: integer, default:10
        A parameter used in count_gauss compression algorithm
    algo: compression algorithm used
    random_state: integer
    """

    m, n = A.shape
    l = min(n, max(oversampling_factor, r+10))

    OmegaL = np.random.randn(n, l)
    H = np.dot(A, OmegaL)

    for j in range(0, limit):
        H = np.dot(A, np.dot(A.T, H))

    L = compression_left(H, algo, q, l, eps, oversampling,
                         oversampling_factor)
    LA = np.dot(L.T, A)

    OmegaR = np.random.randn(l, l)
    H = np.dot(OmegaR, LA)

    for j in range(0, limit):
        H = np.dot(np.dot(H, LA.T), LA)

    R = compression_left(H.T, algo, q, l, eps, oversampling,
                         oversampling_factor)
    R = R.T
    M = np.dot(LA, R.T)

    np.random.seed(random_state)
    U = np.random.rand(m, r)
    V = np.random.randn(r, n)
    Y = V.dot(R.T)

    Lam = np.zeros((m, r))
    Phi = np.zeros((r, n))
    I = np.eye(r)
    iter_ = 0

    relative_error = []

    while iter_ <= max_iter:
        X = solve((Y.dot(Y.T) + lam * I).T,
                  (M.dot(Y.T) + (lam * L.T.dot(U) - L.T.dot(Lam))).T).T
        Y = solve(X.T.dot(X) + phi * I,
                  X.T.dot(M) + phi * V.dot(R.T) - Phi.dot(R.T))
        U = np.maximum(L.dot(X) + Lam/lam, 1e-15)
        print(U.shape)
        V = np.maximum(Y.dot(R) + Phi/phi, 1e-15)
        #print(V.shape)
        
        #print(np.any(np.logical_and((np.sum(U>0, axis=0) == 0),((np.sum(V>0, axis=1)).T == 0))))
        
        
        
        Lam = Lam + c * lam * (L.dot(X) - U)
        Phi = Phi + c * phi * (Y.dot(R) - V)
        relative_error.append(rel_error(A, U.dot(V)))

        iter_ += 1

    return U, V, relative_error

In [72]:
%%time
U, V, relative_error = generalized_admm_1(A, algo='algo42', q=5, r=50, max_iter = 100, eps = 0.0001, oversampling = 10, 
                     oversampling_factor = 10, random_state=3)

(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(10000, 50)
(100

In [66]:
rel_error(A, U.dot(V))

0.60185016613894182

In [67]:
%%time
U, V, relative_error = generalized_admm_1(np.array(M.todense()), algo='algo42', q=5, r=50, max_iter = 100, eps = 0.0001, oversampling = 10, 
                     oversampling_factor = 10, random_state=2)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
CPU times: user 46.5 s, sys: 16.9 s, total: 1min 3s
Wall time: 43.3 s


In [68]:
relative_error

[2.0803696275319497,
 1.9950057881660841,
 1.9080484872100927,
 1.8526827789208096,
 1.8365027472946804,
 1.8645036177675118,
 1.9409041053869827,
 2.0754596550860649,
 2.2868615826920271,
 2.6035583584112332,
 3.0456161907698411,
 3.6193349672042245,
 4.3109929457712362,
 5.0806633811503144,
 5.8425043017519673,
 6.5615789058126675,
 7.2050673809637118,
 7.7442174001954571,
 8.1884290015392551,
 8.5341816681004854,
 8.7794962621247805,
 8.9474280953132954,
 9.0341035178972167,
 9.0576249636093227,
 9.031717060172717,
 8.9775241960910499,
 8.8792883574970887,
 8.7468990260116541,
 8.5967846834588091,
 8.4314276008660585,
 8.2657185519383436,
 8.0946479632479154,
 7.9222916645370081,
 7.7522957209689505,
 7.5921182912860052,
 7.4363997417929513,
 7.2996999384804937,
 7.1671198173586959,
 7.0465171861071028,
 6.9448424581580577,
 6.852054057048484,
 6.7835166525287365,
 6.731791880821917,
 6.6958392198175005,
 6.6690325720102699,
 6.659058045624513,
 6.6622016585721848,
 6.68447260182957