In [31]:
# Load libraries
import numpy as np
from LDA_AandB.test_data_generator import simulate_corpus
from LDA_AandB.lda_code import lda, group_docs
# Seed NumPy's global RNG so the np.random.randint draws used to build the simulated corpus repeat across runs
np.random.seed(101)

In [32]:
# Corpus dimensions
V = 100                      # length of the beta_true vector drawn below
K = 2                        # length of the alpha_true vector drawn below
M = 10                       # forwarded to simulate_corpus (presumably the number of documents)
N_min, N_max = 150, 200      # forwarded to simulate_corpus (presumably document-length bounds)

# Hyperparameters: integers drawn uniformly from 1..9.
# alpha_true is drawn before beta_true so the seeded random stream is consumed in a fixed order.
alpha_true = np.random.randint(low = 1, high = 10, size = K)
beta_true = np.random.randint(low = 1, high = 10, size = V)

# Simulate the corpus from the hyperparameters above
bow, theta_true, phi_true = simulate_corpus(
    alpha_true, beta_true, M, N_min, N_max
)

# LSA

In [33]:
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

# LSA: project the bag-of-words counts onto 2 latent components.
# random_state is pinned so the randomized SVD solver gives the same result on every run.
svd = TruncatedSVD(n_components = 2, n_iter = 7, random_state = 42)

# Fit on `bow` and display the reduced representation (one row per row of `bow`).
svd.fit_transform(bow)

array([[21.76158621, 12.25379071],
       [22.78082259, -0.51321017],
       [17.60798686, -2.09722723],
       [20.63135653, -0.02359774],
       [21.75064639,  2.49741488],
       [20.45242313, -6.32869104],
       [19.09948169, -5.17927841],
       [20.30089935,  2.0238676 ],
       [16.78600481, -3.16631302],
       [17.95393316, -1.75193616]])

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
# Toy corpus. The strings are vectorized as literal text (TfidfVectorizer's
# default input='content'); no files named docN.txt are opened.
documents = ["doc1.txt apples", "doc2.txt", "doc3.txt"]

# raw documents to tf-idf matrix:
vectorizer = TfidfVectorizer(stop_words = 'english',
                             use_idf = True,
                             smooth_idf = True)

# SVD to reduce dimensionality.
# random_state is pinned (same value as the LSA cell on `bow`) so the
# randomized solver is reproducible regardless of cell execution order.
svd_model = TruncatedSVD(n_components = 2,
                         algorithm = 'randomized',
                         n_iter = 10,
                         random_state = 42)

# pipeline of tf-idf + SVD, fit to and applied to documents:
svd_transformer = Pipeline([('tfidf', vectorizer),
                            ('svd', svd_model)])

# One row per entry in `documents`, one column per SVD component.
svd_matrix = svd_transformer.fit_transform(documents)

# svd_matrix can later be used to compare documents, compare words, or compare queries with documents

In [6]:
# Display the tf-idf + SVD embedding produced by svd_transformer.fit_transform(documents)
svd_matrix

array([[ 0.64356876,  0.7653883 ],
       [ 0.7144795 , -0.34471248],
       [ 0.7144795 , -0.34471248]])

In [10]:
import gensim
# Load a corpus stored in Matrix Market format from a path relative to the working directory.
# NOTE(review): 'wiki_en_tfidf.mm' is not created anywhere in the visible cells and `mm` is not
# used by them — confirm the file is available and that this cell is still needed.
mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')

In [16]:
# gensim expects a corpus to be an iterable of documents, each a list of
# (term_id, count) pairs. Passing the dense count matrix directly raises
# "TypeError: 'numpy.int64' object is not iterable", because iterating a row
# yields bare integers instead of pairs. Dense2Corpus adapts the matrix;
# documents_columns=False because each row of `bow` is one document (the
# scikit-learn cells in this notebook likewise treat rows as samples).
bow_corpus = gensim.matutils.Dense2Corpus(np.array(bow, dtype = int), documents_columns = False)

# Fit a 2-topic LSI model on the adapted corpus.
lsi = gensim.models.lsimodel.LsiModel(corpus = bow_corpus, num_topics = 2)

# LDA (Variational Bayes)

In [38]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification
# Fit scikit-learn's LatentDirichletAllocation on the simulated bag-of-words counts.
# The estimator is bound to `lda_vb` rather than `lda` so that it does not shadow
# the `lda` function imported from LDA_AandB.lda_code at the top of the notebook.
lda_vb = LatentDirichletAllocation(n_components = 2,
                                   random_state = 0)
lda_vb.fit(bow)

# Get the per-row topic proportions for `bow` (one column per topic):
lda_vb.transform(bow)



array([[0.99658069, 0.00341931],
       [0.99674963, 0.00325037],
       [0.9958288 , 0.0041712 ],
       [0.99631221, 0.00368779],
       [0.99650759, 0.00349241],
       [0.99655016, 0.00344984],
       [0.99629708, 0.00370292],
       [0.9964495 , 0.0035505 ],
       [0.99588775, 0.00411225],
       [0.9959005 , 0.0040995 ]])

# LDA (Collapsed Gibbs Sampler)