In [2]:
%matplotlib inline
import gzip
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pystan
import lda
import lda.datasets
import matplotlib.cm as cm
import cPickle as cpkl

In [24]:
# Load the preprocessed data. See data_processing.ipynb for details.
# Pickle streams are binary, so the file is opened with 'rb': text mode can
# corrupt the stream on Windows and is rejected by Python 3's unpickler.
# NOTE: unpickling can execute arbitrary code -- only load a file you trust.
with open('preprocessed_data.cpkl', 'rb') as f:
    data = cpkl.load(f)

x_train = data['x_train']     # one-hot matrix (samples x genes)
t_train = data['t_train']     # labels from bioinformatician
genes_id = data['genes_id']
genes = data['genes']
# sampleid and words carry the same information as x_train in long form:
# sampleid holds the cell and words holds the id of each seen word.
sampleid = data['sampleid']
words = data['words']



In [None]:
# ==============================================================================
# LDA with symmetric (uniform) Dirichlet priors, fitted with Stan.
#
# The Stan program marginalises the per-word topic assignment: for every word
# instance it accumulates log_sum_exp over topics of
# log(theta[doc, k]) + log(phi[k, word]).
# NOTE(review): `<-` and increment_log_prob() are deprecated spellings in newer
# Stan releases (`=` and `target +=`); the program text is left untouched so it
# keeps compiling on the Stan version this notebook was written against.
LDA_uniprior = """

data {
  int<lower=2> K;               // num topics
  int<lower=2> V;               // num words
  int<lower=1> M;               // num docs
  int<lower=1> N;               // total word instances
  int<lower=1,upper=V> w[N];    // word n
  int<lower=1,upper=M> doc[N];  // doc ID for word n
  vector<lower=0>[K] alpha;     // topic prior
  vector<lower=0>[V] beta;      // word prior
}
parameters {
  simplex[K] theta[M];   // topic dist for doc m
  simplex[V] phi[K];     // word dist for topic k
}
model {
  for (m in 1:M)  
    theta[m] ~ dirichlet(alpha);  // prior
  for (k in 1:K)  
    phi[k] ~ dirichlet(beta);     // prior
  for (n in 1:N) {
    real gamma[K];
    for (k in 1:K) 
      gamma[k] <- log(theta[doc[n],k]) + log(phi[k,w[n]]);
    increment_log_prob(log_sum_exp(gamma));  // likelihood
  }
}
"""

# Model dimensions, hard-coded.
# NOTE(review): this cell used to start with `M,V = x_train.shape`, but both
# values were overwritten by the constants below before being used, so that
# dead assignment was removed. Confirm 48 / 1430 still match the loaded data.
K = 5       # num topics
V = 48      # num words
M = 1430    # num docs

# Fixed seed so that re-running the cell reproduces the same posterior draws.
STAN_SEED = 42

# NOTE: this rebinds `data`, which previously held the unpickled dataset dict.
data = dict(
    K=K,
    V=V,
    M=M,
    # Stan declares N as an int; np.sum returns a NumPy scalar whose dtype
    # follows x_train, so cast explicitly.
    # assumes the total equals len(words) == len(sampleid) -- TODO confirm
    N=int(np.sum(x_train)),
    w=words,
    doc=sampleid,
    alpha=np.ones(K,)*(1./K),   # symmetric topic prior, every entry 1/K
    beta=np.ones(V,)*(1./V),    # symmetric word prior, every entry 1/V
)

fit_uni_prior = pystan.stan(model_code=LDA_uniprior, data=data, seed=STAN_SEED)

# Posterior draws returned by the fit's extract() call.
samples = fit_uni_prior.extract()

In [12]:
# LDA with a data-driven ("informative") word prior, fitted with Stan.
# The Stan program is the same text as the uniform-prior model; only the `beta`
# passed in as data differs.
# NOTE(review): `<-` and increment_log_prob() are deprecated spellings in newer
# Stan releases (`=` and `target +=`); the program text is left untouched so it
# keeps compiling on the Stan version this notebook was written against.
LDA_informative_prior = """

data {
  int<lower=2> K;               // num topics
  int<lower=2> V;               // num words
  int<lower=1> M;               // num docs
  int<lower=1> N;               // total word instances
  int<lower=1,upper=V> w[N];    // word n
  int<lower=1,upper=M> doc[N];  // doc ID for word n
  vector<lower=0>[K] alpha;     // topic prior
  vector<lower=0>[V] beta;      // word prior
}
parameters {
  simplex[K] theta[M];   // topic dist for doc m
  simplex[V] phi[K];     // word dist for topic k
}
model {
  for (m in 1:M)  
    theta[m] ~ dirichlet(alpha);  // prior
  for (k in 1:K)  
    phi[k] ~ dirichlet(beta);     // prior
  for (n in 1:N) {
    real gamma[K];
    for (k in 1:K) 
      gamma[k] <- log(theta[doc[n],k]) + log(phi[k,w[n]]);
    increment_log_prob(log_sum_exp(gamma));  // likelihood
  }
}
"""

# Model dimensions, hard-coded.
# NOTE(review): this cell used to start with `M,V = x_train.shape`, but both
# values were overwritten by the constants below before being used, so that
# dead assignment was removed. Confirm 48 / 1430 still match the loaded data.
K = 5       # num topics
V = 48      # num words
M = 1430    # num docs

# Fixed seed so that re-running the cell reproduces the same posterior draws.
STAN_SEED = 42

# NOTE: this rebinds `data`, which previously held the unpickled dataset dict.
data = dict(
    K=K,
    V=V,
    M=M,
    # Stan declares N as an int; np.sum returns a NumPy scalar whose dtype
    # follows x_train, so cast explicitly.
    # assumes the total equals len(words) == len(sampleid) -- TODO confirm
    N=int(np.sum(x_train)),
    w=words,
    doc=sampleid,
    alpha=np.ones(K,)*(1./K),   # symmetric topic prior, every entry 1/K
    # Word prior: (1 + row sums of x_train) / grand total of x_train.
    # NOTE(review): Stan declares beta as vector[V], so this must have V
    # entries. sum(axis=1) yields one entry per ROW of x_train; if x_train is
    # really (samples x genes) that is M entries and axis=0 would be needed.
    # Verify x_train's orientation before trusting this prior.
    beta=(1+x_train.sum(axis=1).astype('float32'))/x_train.sum(),
)

fit_infor_prior = pystan.stan(model_code=LDA_informative_prior, data=data,
                              seed=STAN_SEED)

# Keep the posterior draws under a name instead of dumping every sampled array
# into the cell output and then losing the result.
samples_infor_prior = fit_infor_prior.extract()

48