In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix, issparse
from scipy.sparse.linalg import norm as spnorm
import pickle
import sklearn
from scipy import sparse
from ntf_cython.random import algo42, algo44, admm, rel_error
from savvy_factorize.compression import algo41, algo43, algo45, algo46, structured_compression, count_gauss
from sklearn.decomposition.nmf import _initialize_nmf
from numpy.linalg import norm, solve
import seaborn as sns
from savvy_factorize.structured_nmf import compression_algo

%matplotlib inline

In [2]:
%%time
df = pd.read_csv('/Users/user/Documents/Medicare_Provider_Util_Payment_PUF_CY2013/Medicare_Provider_Util_Payment_PUF_CY2013.txt', sep= '\t')



CPU times: user 46.5 s, sys: 6.97 s, total: 53.4 s
Wall time: 54.5 s


In [3]:
# Discard the first row of the loaded table, then show the remaining (rows, columns).
# NOTE: `df` is overwritten, so every re-run of this cell removes one more row.
df = df.iloc[1:]
df.shape

(9287876, 28)

In [4]:
%%time
row_name = 'NPI'
col_name = 'HCPCS_CODE'
measure = 'AVERAGE_SUBMITTED_CHRG_AMT'
# filters: averages are calculated by NPI, HCPCS_CODE and PLACE_OF_SERVICE
# so need to fix PLACE_OF_SERVICE to not be forced to average averages.
# The rest of the filters are up to you.
filters = (df.PLACE_OF_SERVICE == 'F')
filters &= (df.NPPES_ENTITY_CODE == 'O')
filters &= (df.NPPES_PROVIDER_COUNTRY == 'US')
# data = df[filters][[row_name, col_name, measure]].dropna().set_index([row_name, col_name])
data = df[filters].dropna(subset=[measure]).groupby([row_name, col_name])[measure].max()
idx_row = data.index.names.index(row_name)
idx_col = data.index.names.index(col_name)
M = coo_matrix((data.values,
                (data.index.labels[idx_row], data.index.labels[idx_col])),
               shape=[len(data.index.levels[idx_row]), len(data.index.levels[idx_col])]).astype(float)
print(M.shape)

(14822, 1376)
CPU times: user 2.61 s, sys: 730 ms, total: 3.34 s
Wall time: 3.73 s


In [5]:
# Synthetic m x n test matrix formed as the product of two Gaussian factors,
# so its rank is at most k.
m, n, k = 10000, 1000, 100
A0 = np.random.randn(m, k)  # left factor, m x k
A1 = np.random.randn(k, n)  # right factor, k x n
A = np.dot(A0, A1)

In [6]:
def generalized_admm(A, q=1, r=100, max_iter = 1000, eps = 0.01, oversampling = 10, 
                     oversampling_factor = 10, algo='algo42',random_state=None):
    """Nonnegative factorization ``A ~ U.dot(V)`` via ADMM on a compressed problem.

    ``compression_algo`` supplies two matrices ``L`` and ``R``; the data is
    compressed to ``A0 = L.T.dot(A).dot(R.T)`` and the least-squares updates are
    carried out on the compressed factors ``X`` and ``Y``. The full-size factors
    ``U`` and ``V`` are obtained by projecting ``L.dot(X)`` and ``Y.dot(R)``
    (shifted by the scaled dual variables) onto the nonnegative orthant.

    Parameters
    ----------
    A : 2-D array of shape (m, n)
        Matrix to factorize.
    q, oversampling, oversampling_factor, algo
        Forwarded unchanged to ``compression_algo``.
    r : int
        Forwarded to ``compression_algo``. The rank actually used for the
        factorization is ``L.shape[1]`` of the returned ``L``, which replaces
        this value inside the function.
    max_iter : int
        Forwarded to ``compression_algo`` and also used as the number of ADMM
        iterations. There is no convergence test; the loop always runs
        ``max_iter`` times.
    eps : float
        Forwarded to ``compression_algo`` only; the ADMM loop does not use it.
    random_state : int or None
        Passed to ``np.random.seed`` before drawing the initial ``U`` and ``V``.
        This reseeds NumPy's global generator and happens after the call to
        ``compression_algo``.

    Returns
    -------
    U : ndarray of shape (m, L.shape[1]), entrywise nonnegative
    V : ndarray of shape (L.shape[1], n), entrywise nonnegative

    Notes
    -----
    NOTE(review): the X- and Y-updates omit ``L.T.dot(L)`` and ``R.dot(R.T)``,
    which is exact only if ``L`` has orthonormal columns and ``R`` has
    orthonormal rows -- confirm that ``compression_algo`` guarantees this.
    """
    m, n = A.shape
    L, R = compression_algo(A, q, r, max_iter, eps, oversampling,
                            oversampling_factor, algo)
    # From here on the working rank is dictated by the compression output.
    r = L.shape[1]
    np.random.seed(random_state)
    U = np.abs(np.random.randn(m, r))
    V = np.abs(np.random.randn(r, n))
    A0 = L.T.dot(A).dot(R.T)
    Y = V.dot(R.T)
    # Dual variables for the constraints U = L.dot(X) and V = Y.dot(R).
    Lam = np.zeros((m, r))
    Phi = np.zeros((r, n))
    I = np.eye(r)
    # Penalty weights (lam, phi) and dual step scaling (c).
    lam = 1.
    phi = 1.
    c = 1.

    for _ in range(max_iter):
        # X-update: regularized least squares, solved in transposed form.
        X = solve(Y.dot(Y.T) + lam * I,
                  Y.dot(A0.T) + (lam * U.T - Lam.T).dot(L)).T
        # Y-update: the normal equations use the Gram matrix X^T X, matching the
        # X^T A0 term on the right-hand side. X.dot(X.T) is a different matrix
        # even though X is square here, so it must not be used.
        Y = solve(X.T.dot(X) + phi * I,
                  X.T.dot(A0) + (phi * V - Phi).dot(R.T))
        # Full-size images of the compressed factors, reused by the projection
        # and by the dual updates below.
        LX = L.dot(X)
        YR = Y.dot(R)
        # Projection onto the nonnegative orthant.
        U = LX + Lam / lam
        U[U < 0] = 0
        V = YR + Phi / phi
        V[V < 0] = 0
        # Dual ascent on the two equality constraints.
        Lam = Lam + c * lam * (LX - U)
        Phi = Phi + c * phi * (YR - V)
    return U, V
    

In [None]:
%%time
U, V = generalized_admm(np.array(M.todense()), q=1, r=50, max_iter = 5, eps = 0.00001, oversampling = 10, 
                     oversampling_factor = 10, algo='algo46',random_state=2)

In [None]:
# Compare the dense form of M against the product of the two factors.
rel_error(M.todense(), np.dot(U, V))

In [9]:
# Shape of the first factor returned by generalized_admm.
# NOTE(review): execution counts in this notebook are non-sequential; re-run after
# the factorization cell before relying on the displayed value.
U.shape

(14822, 50)

In [11]:
# Shape of the second factor returned by generalized_admm; its first dimension
# must equal U.shape[1] for U.dot(V) to be defined.
V.shape

(1114, 1376)

In [12]:
# Shape of the reconstructed matrix; this forms the full dense product first.
np.dot(U, V).shape

(14822, 1376)

In [13]:
# Shape of the synthetic matrix A built earlier as A0.dot(A1), i.e. (m, n).
A.shape

(10000, 1000)

In [14]:
# (row indices, column indices) of the strictly positive entries of V.
np.nonzero(V > 0)

(array([   0,    0,    0, ..., 1113, 1113, 1113]),
 array([   0,    1,    2, ..., 1373, 1374, 1375]))

In [16]:
# (row indices, column indices) of the strictly positive entries of U.
np.nonzero(U > 0)

(array([    0,     0,     0, ..., 14821, 14821, 14821]),
 array([   1,    3,    6, ..., 1068, 1092, 1102]))