In [51]:
# Load libraries
import numpy as np

In [52]:
from numba import jit
from numba import int32, int64, float32, float64
import numba

In [53]:
import time

def timer(f, *args, **kwargs):
    start = time.clock()
    ans = f(*args, **kwargs)
    return ans, time.clock() - start

def report(fs, *args, **kwargs):
    ans, t = timer(fs[0], *args, **kwargs)
    for f in fs[1:]:
        ans_, t_ = timer(f, *args, **kwargs)
        print('%s: %.1f' % (f.__name__, t/t_))

In [54]:
# Set seed
np.random.seed(10)

In [55]:
# Doc params
V = 50
N = 300
K = 3
M = 2

In [56]:
# Set true params
alpha_true = np.array([10, 1, 2])
beta_true = np.random.randint(1, 10, V)

In [57]:
# Generate data
phi_true = np.zeros((K, V))
for k in range(K):
    phi_true[k, :] = np.random.dirichlet(beta_true)

In [58]:
theta_true = np.zeros((M, K))
for m in range(M):
    theta_true[m,:] = np.random.dirichlet(alpha_true)

In [59]:
doc_lens = np.random.randint(100, N, M)
z_true = {}
w = {}
for m in range(M):
    z_true[m] = []
    w[m] = []
    for n in range(doc_lens[m]):
        z_true[m].extend(np.nonzero(np.random.multinomial(1, theta_true[m,:]))[0])
        w[m].extend(np.nonzero(np.random.multinomial(1, phi_true[z_true[m][n], :]))[0])

In [60]:
#z_true = np.zeros((M, N), dtype = "int")
#w = np.zeros((M, N), dtype = "int")
#for m in range(M):
#    for n in range(N):
#        z_true[m, n] = np.nonzero(np.random.multinomial(1, theta_true[m,:]))[0]
#        w[m, n] = np.nonzero(np.random.multinomial(1, phi_true[z_true[m, n], :]))[0]
len(w)

2

In [61]:
# Set initial z randomly
z = {}
for m in range(M):
    z[m] = []
    for n in range(doc_lens[m]):
        z[m].extend(np.nonzero(np.random.multinomial(1, np.ones(K)/K))[0])

In [88]:
# Create count matrices
A = np.zeros((M, K))
for m in range(M):
    for k in range(K):
        A[m, k] = sum(np.array(z[m]) == k)

In [89]:
B = np.zeros((K, V))
for m in range(M):
    for n in range(doc_lens[m]):
        B[z[m][n], w[m][n]] += 1

In [90]:
C = np.zeros(K)
for m in range(M):
    for n in range(doc_lens[m]):
        C[z[m][n]] += 1

In [91]:
# Turning Z into a matrix
MaxLen = max([len(z[i]) for i in z.keys()])

Z = np.zeros((len(z.keys()), MaxLen), dtype = int)

for i in range(len(z.keys())):
    Z[i, 0:len(z[i])] = z[i]
Z_prev = Z

In [92]:
# Turning W into a matrix
MaxLen = max([len(w[i]) for i in w.keys()])

W = np.zeros((len(w.keys()), MaxLen), dtype = int)

for i in range(len(w.keys())):
     W[i, 0:len(w[i])] = w[i]

In [67]:
# set hyperparameters alpha and beta
alpha = np.ones(K)
beta = np.ones(V)
#alpha = alpha_true
#beta = beta_true

### Base Comparison

As a benchmark, we begin with the timing for a single iteration of the Gibbs sampler with no optimization

In [69]:
# start gibbs sampler
def Gibbs(M, doc_lens, Z, W, K, A, B, C, alpha, beta):
    num_iter = 1
    p = np.zeros(K)
    for i in range(num_iter):
        for m in range(M):
            for n in range(doc_lens[m]):
                A[m, int(Z[m,n])] -= 1  # Decrement N1
                B[int(Z[m,n]), int(W[m,n])] -= 1 # Decrement N2
                C[int(Z[m,n])] -= 1 # Decrement N3
                p = np.zeros(K)
                for k in range(K):
                    p[k] = (A[m, k] + alpha[k])*((B[k, int(W[m,n])] + beta[int(W[m,n])])/(C[k] + sum(beta)))
                p = p/sum(p) # This is actually doing k divisions... might be a modest speed up but we can parallelize easily with numba
                Z[m,n] = int(np.nonzero(np.random.multinomial(1, p))[0][0])
                A[m, int(Z[m,n])] += 1 # Increment N1
                B[int(Z[m,n]), int(W[m,n])] += 1 # Increment N2
                C[int(Z[m,n])] += 1 # Increment N3
    return A, B, C


In [70]:
%timeit -r30 -n30 Gibbs(M, doc_lens, Z, W, K, A, B, C, alpha, beta)

16.6 ms ± 520 µs per loop (mean ± std. dev. of 30 runs, 30 loops each)


## JIT optimizing the whole loop

### One loop at a time... Start with Loop 4

In [71]:
@jit(nopython=True, cache = False)
def Loop4(K, A, B, C, WordTopic, TopicWord, alpha, beta):
    p = np.zeros(K)
    for k in range(K):
        p[k] = (A[m, k] + alpha[k])*((B[k, TopicWord] + beta[TopicWord])/(C[k] + (beta[1]*len(beta))))
    return p

In [72]:
### Testing 
%timeit -r30 -n30 Loop4(K, A, B, C, Z[m,n], W[m,n], alpha, beta)

The slowest run took 2247.76 times longer than the fastest. This could mean that an intermediate result is being cached.
169 µs ± 899 µs per loop (mean ± std. dev. of 30 runs, 30 loops each)


In [73]:
Loop4(K, A, B, C, Z[m,n], W[m,n], alpha, beta)

array([ 2.42176871,  0.14754098,  1.85294118])

In [74]:
# start gibbs sampler
def Gibbs_faster(M, doc_lens, Z, W, K, A, B, C, alpha, beta):
    p = np.zeros(K)
    num_iter = 1
    for i in range(num_iter):
        for m in range(M):
            for n in range(doc_lens[m]):
                A[m, int(Z[m,n])] -= 1  # Decrement N1
                B[int(Z[m,n]), int(W[m,n])] -= 1 # Decrement N2
                C[int(Z[m,n])] -= 1 # Decrement N3
                p = Loop4(K, A, B, C, Z[m,n], W[m,n], alpha, beta)
                p = p/sum(p) 
                Z[m,n] = int(np.nonzero(np.random.multinomial(1, p))[0][0])
                A[m, int(Z[m,n])] += 1 # Increment N1
                B[int(Z[m,n]), int(W[m,n])] += 1 # Increment N2
                C[int(Z[m,n])] += 1 # Increment N3
    return A


In [75]:
%timeit -r30 -n30 Gibbs_faster(M, doc_lens, Z, W, K, A, B, C, alpha, beta)

6.37 ms ± 484 µs per loop (mean ± std. dev. of 30 runs, 30 loops each)


### Loop 3

In [76]:
@jit(nopython=True, cache = True)
def Loop3(K, A, B, C, WordTopic, TopicWord, alpha, beta):        
    p = Loop4(K, A, B, C, WordTopic, TopicWord, alpha, beta)
    return p

In [77]:
### Testing  THIS SHOULD BE THE SAME AS ABOVE!
%timeit -r30 -n30 Loop3(K, A, B, C, Z[m,n], W[m,n], alpha, beta)

The slowest run took 52.18 times longer than the fastest. This could mean that an intermediate result is being cached.
6.4 µs ± 21.4 µs per loop (mean ± std. dev. of 30 runs, 30 loops each)


In [78]:
@jit
def get_multinom(p):
    """Multinomial Helper function"""
    p_sum = 0
    u = np.random.uniform(0,1)

    for i in range(len(p)):
        p_sum += p[i]
        if p_sum > u:
            return i
            break

In [79]:
@jit(nopython=True, cache = False)
def Loop3(K, A, B, C, Z, W, alpha, beta, m, doc_len):
    p = np.zeros(K)
    for n in range(doc_len):
        WordTopic = Z[m,n]
        TopicWord = W[m,n]
        A[m, WordTopic] -= 1  # Decrement N1
        B[WordTopic, TopicWord] -= 1 # Decrement N2
        C[WordTopic] -= 1

        p = Loop4(K, A, B, C, WordTopic, TopicWord, alpha, beta)
        p = p/(np.sum(p))
        
        Z[m,n] = get_multinom(p)
        WordTopic = Z[m,n]
        A[m, WordTopic] += 1 # Increment N1
        B[WordTopic, TopicWord] += 1 # Increment N2
        C[WordTopic] += 1 # Increment N3
    return A, B, C

In [81]:
### Test this!!!
for i in range(10000):
    A, B, C = Loop3(K, A, B, C, Z, W, alpha, beta, m, doc_lens[m])
    
A[1,:]

array([  33.,  154.,   34.])

In [82]:
### Testing Speedup
# start gibbs sampler
def Gibbs_even_faster(M, doc_lens, Z, W, K, A, B, C, alpha, beta):
    p = np.zeros(K)
    num_iter = 10000
    for i in range(num_iter):
        for m in range(M):
            A, B, C = Loop3(K, A, B, C, Z, W, alpha, beta, m, doc_lens[m])
    return A, B, C


In [84]:
#%timeit -r30 -n30 Gibbs_even_faster(M, doc_lens, Z, W, K, A, B, C, alpha, beta)

In [93]:
A, B, C = Gibbs_even_faster(M, doc_lens, Z, W, K, A, B, C, alpha, beta)

In [94]:
# Checking if we can infer the original parameters
theta = np.zeros((M, K))
for m in range(M):
    for k in range(K):
        theta[m , k] = (A[m, k] + alpha[k])/(sum(A[m,:]) + sum(alpha))
np.round(theta, 2)

array([[ 0.25,  0.55,  0.2 ],
       [ 0.28,  0.47,  0.25]])

In [95]:
np.round(theta_true, 2)

array([[ 0.83,  0.04,  0.13],
       [ 0.71,  0.11,  0.19]])

In [665]:
phi = np.zeros((K, V))
for k in range(K):
    for v in range(V):
        phi[k , v] = (B[k, v] + beta[v])/(sum(B[k,:]) + sum(beta))
np.round(phi, 2)

array([[ 0.01,  0.01,  0.04,  0.02,  0.02,  0.02,  0.06,  0.05,  0.03,
         0.01,  0.  ,  0.05,  0.01,  0.02,  0.01,  0.05,  0.01,  0.01,
         0.02,  0.  ,  0.01,  0.02,  0.01,  0.  ,  0.02,  0.  ,  0.02,
         0.  ,  0.03,  0.03,  0.02,  0.02,  0.04,  0.01,  0.  ,  0.07,
         0.01,  0.08,  0.  ,  0.  ,  0.  ,  0.01,  0.  ,  0.03,  0.03,
         0.02,  0.01,  0.  ,  0.03,  0.02],
       [ 0.04,  0.01,  0.01,  0.02,  0.03,  0.03,  0.01,  0.07,  0.  ,
         0.02,  0.01,  0.07,  0.04,  0.01,  0.02,  0.01,  0.01,  0.01,
         0.  ,  0.02,  0.02,  0.  ,  0.01,  0.04,  0.01,  0.  ,  0.03,
         0.  ,  0.02,  0.01,  0.01,  0.  ,  0.12,  0.03,  0.01,  0.08,
         0.  ,  0.02,  0.03,  0.01,  0.01,  0.01,  0.02,  0.03,  0.01,
         0.01,  0.  ,  0.01,  0.01,  0.01],
       [ 0.03,  0.05,  0.02,  0.01,  0.02,  0.04,  0.01,  0.03,  0.01,
         0.03,  0.03,  0.02,  0.05,  0.02,  0.  ,  0.02,  0.03,  0.03,
         0.03,  0.06,  0.02,  0.03,  0.01,  0.01,  0.02,  0.

In [667]:
np.round(phi_true,2)

array([[ 0.03,  0.04,  0.03,  0.01,  0.03,  0.04,  0.02,  0.04,  0.01,
         0.02,  0.03,  0.02,  0.05,  0.02,  0.  ,  0.03,  0.03,  0.02,
         0.02,  0.05,  0.02,  0.03,  0.01,  0.01,  0.01,  0.01,  0.04,
         0.01,  0.02,  0.01,  0.01,  0.01,  0.02,  0.01,  0.01,  0.03,
         0.03,  0.02,  0.  ,  0.01,  0.03,  0.03,  0.01,  0.04,  0.02,
         0.01,  0.01,  0.  ,  0.  ,  0.02],
       [ 0.04,  0.03,  0.01,  0.02,  0.02,  0.02,  0.02,  0.03,  0.  ,
         0.04,  0.01,  0.02,  0.01,  0.02,  0.  ,  0.03,  0.02,  0.02,
         0.04,  0.03,  0.04,  0.02,  0.01,  0.01,  0.03,  0.01,  0.06,
         0.  ,  0.03,  0.03,  0.01,  0.  ,  0.01,  0.01,  0.04,  0.02,
         0.02,  0.02,  0.02,  0.01,  0.04,  0.02,  0.02,  0.03,  0.  ,
         0.01,  0.02,  0.  ,  0.01,  0.01],
       [ 0.04,  0.03,  0.02,  0.03,  0.02,  0.03,  0.06,  0.02,  0.01,
         0.03,  0.03,  0.03,  0.02,  0.01,  0.  ,  0.03,  0.03,  0.03,
         0.02,  0.03,  0.02,  0.02,  0.  ,  0.  ,  0.03,  0.

## Loop 2

In [609]:
@jit(nopython=True, cache = False)
def Fastest_Gibbs(K, A, B, C, Z, W, alpha, beta, m, doc_lens, M, num_iter):
    for i in range(num_iter):
        for m in range(M):
            A, B, C = Loop3(K, A, B, C, Z, W, alpha, beta, m, doc_lens[m])
    return A, B, C

In [610]:
%timeit -r30 -n30 Fastest_Gibbs(K, A, B, C, Z, W, alpha, beta, m, doc_lens, M, 1)

The slowest run took 36.63 times longer than the fastest. This could mean that an intermediate result is being cached.
714 µs ± 2.07 ms per loop (mean ± std. dev. of 30 runs, 30 loops each)


# WOW... Hope those results are legit... let's make sure we get the same answers

In [161]:
%%cython -a

import cython
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)

def Gibbs(long[:,:] Z, long[:] doc_lens, long[:,:] W,
          long[:,:] A, long[:,:] B, long[:,:] C, double[:] p,
         double[:] alpha, double[:] beta, long[:] K):
    num_iter = 30
    M = 2
    # Need gil for some reason: Assignment of Python object not allowed without gil
    for i in range(num_iter):
        for m in range(M):
            for n in range(doc_lens[m]):
                WordTopic = Z[m,n]
                TopicWord = W[m,n]
                A[m, WordTopic] -= 1  # Decrement N1
                B[WordTopic, TopicWord] -= 1 # Decrement N2
                C[WordTopic] -= 1 # Decrement N3
                p = np.zeros(K)
                for k in range(K):
                    p[k] = (A[m, k] + alpha[k])*((B[k, WordTopic] + beta[TopicWord])/(C[k] + sum(beta)))
                p = p/sum(p) # This is actually doing k divisions... might be a modest speed up but we can parallelize easily with numba
                Z[m,n] = int(np.nonzero(np.random.multinomial(1, p))[0][0])
                A[m, WordTopic] += 1 # Increment N1
                B[WordTopic, TopicWord] += 1 # Increment N2
                C[WordTopic] += 1 # Increment N3
    return A, B

Compiler crash in ExpandInplaceOperators

ModuleNode.body = StatListNode(_cython_magic_9f01b05eac101de3536b16c86e765a93.pyx:2:0)
StatListNode.stats[2] = StatListNode(_cython_magic_9f01b05eac101de3536b16c86e765a93.pyx:8:0)
StatListNode.stats[0] = CompilerDirectivesNode(_cython_magic_9f01b05eac101de3536b16c86e765a93.pyx:8:0)
CompilerDirectivesNode.body = StatListNode(_cython_magic_9f01b05eac101de3536b16c86e765a93.pyx:8:0)
StatListNode.stats[0] = DefNode(_cython_magic_9f01b05eac101de3536b16c86e765a93.pyx:8:0,
    modifiers = [...]/0,
    name = 'Gibbs',
    num_required_args = 10,
    py_wrapper_required = True,
    reqd_kw_flags_cname = '0',
    used = True)
DefNode.body = StatListNode(_cython_magic_9f01b05eac101de3536b16c86e765a93.pyx:11:4,
    is_terminator = True)
StatListNode.stats[2] = ForInStatNode(_cython_magic_9f01b05eac101de3536b16c86e765a93.pyx:14:4)
ForInStatNode.body = StatListNode(_cython_magic_9f01b05eac101de3536b16c86e765a93.pyx:15:8)
StatListNode.stats[0] = ForInStatNode(

In [105]:
%%cython -a

import cython
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)

def Gibbs(long[:,:] Z, long[:] doc_lens, long[:,:] W,
          long[:,:] A, long[:,:] B, long[:,:] C, double[:] p,
         double[:] alpha, double[:] beta, long[:] K):
    num_iter = 30
    M = 2
    # Need gil for some reason: Assignment of Python object not allowed without gil
    for i in range(num_iter): # LOOP 1
        for m in range(M): # LOOP 2
            for n in range(doc_lens[m]): # LOOP 3
                WordTopic = Z[m,n]
                TopicWord = W[m,n]
                A[m, WordTopic] -= 1  # Decrement N1
                B[WordTopic, TopicWord] -= 1 # Decrement N2
                C[WordTopic] -= 1 # Decrement N3
                p = np.zeros(K)
                for k in range(K): # LOOP 4
                    p[k] = (A[m, k] + alpha[k])*((B[k, WordTopic] + beta[TopicWord])/(C[k] + sum(beta)))
                p = p/sum(p) # This is actually doing k divisions... might be a modest speed up but we can parallelize easily with numba
                Z[m,n] = int(np.nonzero(np.random.multinomial(1, p))[0][0])
                A[m, WordTopic] += 1 # Increment N1
                B[WordTopic, TopicWord] += 1 # Increment N2
                C[WordTopic] += 1 # Increment N3
    return A, B

3

### LOOP 4 optimization

In [179]:
@jit(nopython=True, cache = True)
def Loop4(K, A, B, C, WordTopic, TopicWord, alpha, beta):
    p = np.zeros(K)
    for k in range(K):
        p[k] = (A[m, k] + alpha[k])*((B[k, WordTopic] + beta[TopicWord])/(C[k] + (beta[1]*len(beta))))
    return p

In [574]:
x = np.repeat([.8,.1,.1], 30000).reshape(-1,3, order = "F")

# x_vec = [int(get_multinom(x[i,:])) for i in range(x.shape[0])]
# get_multinom(x)
x_vec = np.apply_along_axis(get_multinom, axis = 1, arr= x)

np.bincount(x_vec)


array([24022,  2952,  3026])

In [182]:
def Loop4_slow(K, A, B, C, WordTopic, TopicWord, alpha, beta):
    p = np.zeros(K)
    for k in range(K):
        p[k] = (A[m, k] + alpha[k])*((B[k, WordTopic] + beta[TopicWord])/(C[k] + (beta[1]*len(beta))))
    return p

In [249]:
%%cython -a --compile-args=-fopenmp --link-args=-fopenmp --force

import cython
import numpy as np
from cython.parallel import parallel, prange

@cython.boundscheck(False)
@cython.wraparound(False)
def Loop4_cython(K, A, B, C, WordTopic, TopicWord, alpha, beta, m, P):
    with cython.nogil, parallel():
        p = np.zeros(K)
        for k in prange(K):
            p[k] = (A[m, k] + alpha[k])*((B[k, WordTopic] + beta[TopicWord])/(C[k] + (beta[1]*len(beta))))

TypeError: dist must be a Distribution instance

In [211]:
%timeit -r30 -n30 Loop4_slow(K, A, B, C, Z[m,n], W[m,n], alpha, beta)

7.68 µs ± 1.96 µs per loop (mean ± std. dev. of 30 runs, 30 loops each)


In [212]:
%timeit -r30 -n30 Loop4(K, A, B, C, Z[m,n], W[m,n], alpha, beta)

2.41 µs ± 285 ns per loop (mean ± std. dev. of 30 runs, 30 loops each)


In [220]:
%timeit -r30 -n30 Loop4_cython(K, A, B, C, Z[m,n], W[m,n], alpha, beta, m) # Without parallelization

7.28 µs ± 3.11 µs per loop (mean ± std. dev. of 30 runs, 30 loops each)


### LOOP 3 optimization

In [217]:
%%cython -a

import cython
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)

def test_fun(reps, K, A, B, C, WordTopic, TopicWord, alpha, beta):
    x = np.zeros((reps, K))
    for i in range(reps):
        x[i,:] = Loop4(K, A, B, C, WordTopic, TopicWord, alpha, beta)


Error compiling Cython file:
------------------------------------------------------------
...
@cython.wraparound(False)

def test_fun(reps, K, A, B, C, WordTopic, TopicWord, alpha, beta):
    x = np.zeros((reps, K))
    for i in range(reps):
        x[i,:] = Loop4(K, A, B, C, WordTopic, TopicWord, alpha, beta)
                     ^
------------------------------------------------------------

/home/jovyan/.cache/ipython/cython/_cython_magic_d4ab8870b2535c04b60bbb7adf77de82.pyx:11:22: undeclared name not builtin: Loop4


In [None]:
Loop4(K, A, B, C, Z[m,n], W[m,n], alpha, beta)

In [215]:
x = np.zeros((4,3))
x[1,:] = np.array([1,2,3])

In [216]:
x

array([[ 0.,  0.,  0.],
       [ 1.,  2.,  3.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [None]:
theta = np.zeros((M, K))
for m in range(M):
    for k in range(K):
        theta[m , k] = (A[m, k] + alpha[k])/(doc_lens[m] + sum(alpha))
np.round(theta, 2)

In [None]:
np.round(theta_true, 2)

# C++

In [229]:
import cppimport
import pybind11

In [234]:
%%file ex1.cpp
<%
setup_pybind11(cfg)
%>
#include <pybind11/pybind11.h>
namespace py = pybind11;

PYBIND11_MODULE(ex1, m) {
    m.def("add", [](int a, int b) { return a + b; });
    m.def("mult", [](int a, int b) { return a * b; });
}

Overwriting ex1.cpp


In [236]:
import cppimport
ex1 = cppimport.imp("ex1")

ex1.add(3,4)

7

In [237]:
%%file ex9.cpp
<%
cfg['compiler_args'] = ['-std=c++11']
cfg['include_dirs'] = ['eigen']
setup_pybind11(cfg)
%>

#include <pybind11/pybind11.h>
#include <pybind11/eigen.h>

#include <Eigen/LU>

namespace py = pybind11;

// convenient matrix indexing comes for free
double get(Eigen::MatrixXd xs, int i, int j) {
    return xs(i, j);
}

// takes numpy array as input and returns double
double det(Eigen::MatrixXd xs) {
    return xs.determinant();
}

// takes numpy array as input and returns another numpy array
Eigen::MatrixXd inv(Eigen::MatrixXd xs) {
    return xs.inverse() + xs.det();
}

PYBIND11_MODULE(ex9, m) {
    m.doc() = "auto-compiled c++ extension";
    m.def("inv", &inv);
    m.def("det", &det);
}

Writing ex9.cpp


In [239]:
import numpy as np
%tb
code = cppimport.imp("ex9")

A = np.array([[1,2,1],
              [2,1,0],
              [-1,1,2]])

print(A)
print(code.det(A))
print(code.inv(A))

SystemExit: error: Command "gcc -pthread -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/home/jovyan/work/Final_Project/STA663_Project/eigen -I/opt/conda/include/python3.5m -I/home/jovyan/.local/include/python3.5m -I/home/jovyan/work/Final_Project/STA663_Project -I/opt/conda/include/python3.5m -c /home/jovyan/work/Final_Project/STA663_Project/.rendered.ex9.cpp -o /tmp/tmprp7lcqls/home/jovyan/work/Final_Project/STA663_Project/.rendered.ex9.o -std=c++11 -std=c++11" failed with exit status 1

SystemExit: error: Command "gcc -pthread -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/home/jovyan/work/Final_Project/STA663_Project/eigen -I/opt/conda/include/python3.5m -I/home/jovyan/.local/include/python3.5m -I/home/jovyan/work/Final_Project/STA663_Project -I/opt/conda/include/python3.5m -c /home/jovyan/work/Final_Project/STA663_Project/.rendered.ex9.cpp -o /tmp/tmp6tnd4t9q/home/jovyan/work/Final_Project/STA663_Project/.rendered.ex9.o -std=c++11 -std=c++11" failed with exit status 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


### Optimizing with JIT

In [685]:
# start gibbs sampler
@jit(nopython=True, cache=False)
def Gibbs2(M, doc_lens, Z, W, K, A, B, C, alpha, beta):
    num_iter = 10
    p = np.zeros(K)
    for i in range(num_iter):
        for m in range(M):
            for n in range(doc_lens[m]):
                A[m, int(Z[m,n])] -= 1  # Decrement N1
                B[int(Z[m,n]), int(W[m,n])] -= 1 # Decrement N2
                C[int(Z[m,n])] -= 1 # Decrement N3
                p = np.zeros(K)
                norm = 0
                for k in range(K):
                    p[k] = (A[m, k] + alpha[k])*((B[k, int(W[m,n])] + beta[int(W[m,n])])/(C[k] + (beta*len(beta))))
                    norm+=p[k]
                p = p/norm # This is actually doing k divisions... might be a modest speed up but we can parallelize easily with numba
                Z[m,n] = int(np.nonzero(np.random.multinomial(1, p))[0][0])
                A[m, int(Z[m,n])] += 1 # Increment N1
                B[int(Z[m,n]), int(W[m,n])] += 1 # Increment N2
                C[int(Z[m,n])] += 1 # Increment N3
    return A
            

In [686]:
# start gibbs sampler
num_iter = 1
for i in range(num_iter):
    for m in range(M):
        for n in range(doc_lens[m]):
            A[m, z[m][n]] -= 1  # Decrement N1
            B[z[m][n], w[m][n]] -= 1 # Decrement N2
            C[z[m][n]] -= 1 # Decrement N3
            p = np.zeros(K)
            for k in range(K):
                p[k] = (A[m, k] + alpha[k])*((B[k, w[m][n]] + beta[w[m][n]])/(C[k] + sum(beta)))
            p /= sum(p) # This is actually doing k divisions... might be a modest speed up but we can parallelize easily with numba
            z[m][n] = np.nonzero(np.random.multinomial(1, p))[0][0]
            A[m, z[m][n]] += 1 # Increment N1
            B[z[m][n], w[m][n]] += 1 # Increment N2
            C[z[m][n]] += 1 # Increment N3

In [None]:
@jit(float64[:](float64[:,:], float64[:], float64[:], float64, int64), nopython=True)

def Gibbs_Sampler( ,num_iter)


In [1]:
%load_ext cython

In [147]:
# doc_lens.dtype
# test_fun(doc_lens)
from numba import njit

In [150]:
@njit(parallel=True)
def f_parallel(x):
    x_new = np.zeros(len(x))
    for i in range(len(x)):
        x_new[i] = np.sqrt(x[i])
    return x_new

In [151]:
X = np.array([1,2,3,4])
f_parallel(X)

KeyError: "Does not support option: 'parallel'"

In [136]:
@jit(nopython=True, parallel=True)
def f(x, y):
    return x + y

In [145]:
# f(np.array([1,2]), np.array([3,4]))

# test(X)