In [1]:
# Load libraries
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

In [24]:
def generate_dists(alpha, beta, M, K, V):
    """Draw the word and topic distributions of a simulated corpus.

    Parameters
    ----------
    alpha : sequence of float, length K
        Dirichlet concentration parameters for the per-document topic
        distributions.
    beta : sequence of float, length V
        Dirichlet concentration parameters for the per-topic word
        distributions.
    M : int
        Number of documents (rows of ``theta``).
    K : int
        Number of topics (rows of ``phi``, columns of ``theta``).
    V : int
        Vocabulary size (columns of ``phi``).

    Returns
    -------
    (phi, theta) : tuple of numpy.ndarray
        ``phi`` has shape (K, V): one Dirichlet(beta) draw per topic.
        ``theta`` has shape (M, K): one Dirichlet(alpha) draw per document.

    Raises
    ------
    ValueError
        If ``len(alpha) != K`` or ``len(beta) != V``.
    """
    # Fail early with a readable message instead of a shape mismatch later.
    if len(alpha) != K:
        raise ValueError(
            "alpha must have length K=%d, got %d" % (K, len(alpha)))
    if len(beta) != V:
        raise ValueError(
            "beta must have length V=%d, got %d" % (V, len(beta)))

    # Word distributions are drawn first, then topic distributions; `size`
    # asks numpy for all rows of each matrix in a single call.
    phi = np.random.dirichlet(beta, size=K)
    theta = np.random.dirichlet(alpha, size=M)

    return phi, theta

In [25]:
def generate_words(phi, theta, M, N_min, N_max):
    """Sample the word ids of every document in a simulated corpus.

    Each document length is drawn with ``np.random.randint(N_min, N_max)``,
    so ``N_max`` is exclusive. For every position a topic is drawn from the
    document's row of ``theta`` and then a word is drawn from that topic's
    row of ``phi``.

    Returns a dict mapping document index -> list of sampled word ids.
    The sampled topic assignments are built but not returned.
    """
    lengths = np.random.randint(N_min, N_max, M)
    topics_by_doc = {}
    words_by_doc = {}

    for doc in range(M):
        doc_topics = []
        doc_words = []
        topics_by_doc[doc] = doc_topics
        words_by_doc[doc] = doc_words

        for pos in range(lengths[doc]):
            # A one-trial multinomial is a one-hot vector; the index of its
            # non-zero entry is the sampled topic.
            topic_one_hot = np.random.multinomial(1, theta[doc, :])
            doc_topics.extend(np.nonzero(topic_one_hot)[0])

            # Same trick for the word, using the topic chosen for `pos`.
            word_one_hot = np.random.multinomial(1, phi[doc_topics[pos], :])
            doc_words.extend(np.nonzero(word_one_hot)[0])

    return words_by_doc

In [26]:
def make_bow(w, M, V):
    """Create a bag-of-words count matrix from a corpus of word ids.

    Parameters
    ----------
    w : mapping of int -> sequence of int
        ``w[m]`` holds the word ids of document ``m`` for ``m`` in
        ``range(M)``. Ids are expected to be integers.
    M : int
        Number of documents (rows of the result).
    V : int
        Vocabulary size (columns of the result).

    Returns
    -------
    numpy.ndarray
        Float array of shape (M, V); entry ``[m, v]`` is the number of times
        word id ``v`` occurs in ``w[m]``. Ids outside ``[0, V)`` are ignored.
    """
    bow = np.zeros((M, V))
    for m in range(M):
        tokens = np.asarray(w[m])
        if tokens.size == 0:
            # An empty document keeps its all-zero row.
            continue
        # Drop ids that do not correspond to a column of the matrix.
        in_vocab = tokens[(tokens >= 0) & (tokens < V)]
        if in_vocab.size == 0:
            continue
        # One pass over the document instead of one scan per vocabulary id.
        bow[m, :] = np.bincount(in_vocab.astype(np.intp), minlength=V)

    return bow

In [27]:
def simulate_corpus(alpha, beta, M, N_min, N_max):
    """Simulate an LDA test corpus and return it with its true parameters.

    The number of topics and the vocabulary size are taken from the lengths
    of ``alpha`` and ``beta``. Returns ``(bow, phi, theta)``: the
    bag-of-words matrix built by ``make_bow`` plus the word and topic
    distributions returned by ``generate_dists``.
    """
    n_topics = len(alpha)
    vocab_size = len(beta)

    # Distributions first, then words sampled from them, then the counts.
    word_dists, topic_dists = generate_dists(alpha, beta, M, n_topics, vocab_size)
    corpus = generate_words(word_dists, topic_dists, M, N_min, N_max)
    counts = make_bow(corpus, M, vocab_size)

    return counts, word_dists, topic_dists

In [28]:
def get_newsgroups(categories = None, n_articles = 10):
    """Fetch random newsgroups articles of the specified categories.

    Parameters
    ----------
    categories : list of str or None
        Passed through to ``fetch_20newsgroups``.
    n_articles : int
        Number of articles to sample without replacement from the training
        subset.

    Returns
    -------
    (bow, labels, wordbank) : tuple
        ``bow`` is a dense 2-D array of term counts (one row per sampled
        article), ``labels`` is the list of target values of the sampled
        articles, and ``wordbank`` is the list of vocabulary terms in column
        order.
    """
    remove = ('headers', 'footers', 'quotes')
    newsgroups = fetch_20newsgroups(subset = 'train', remove = remove, categories = categories)

    ind = np.random.choice(len(newsgroups.data), size = n_articles, replace = False)
    news = [newsgroups.data[i] for i in ind]
    labels = [newsgroups.target[i] for i in ind]

    # Lower-case each article and keep only the whitespace-separated tokens
    # that consist entirely of alphabetic characters.
    words = [' '.join(filter(str.isalpha, raw.lower().split())) for raw in
        news]

    vectorizer = CountVectorizer()
    bow_sparse = vectorizer.fit_transform(words)

    # `get_feature_names` was removed from scikit-learn in 1.2; prefer the
    # replacement and only fall back on installations that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        wordbank = vectorizer.get_feature_names_out().tolist()
    else:
        wordbank = vectorizer.get_feature_names()

    bow = bow_sparse.toarray()

    return (bow, labels, wordbank)

In [15]:
# Load the training subset for two categories, passing the same `remove`
# tuple that get_newsgroups uses.
remove = ('headers', 'footers', 'quotes')
newsgroups = fetch_20newsgroups(
    subset='train',
    remove=remove,
    categories=['comp.graphics', 'soc.religion.christian'],
)

In [16]:
# Pick three distinct articles at random and collect their text and targets.
ind = np.random.choice(len(newsgroups.data), size=3, replace=False)
news = [newsgroups.data[idx] for idx in ind]
labels = [newsgroups.target[idx] for idx in ind]

# Lower-case each article and keep only purely alphabetic tokens.
words = [
    ' '.join(token for token in raw.lower().split() if token.isalpha())
    for raw in news
]

In [17]:
# Show the cleaned article strings to check the token filtering by eye.
words

['so does that mean that anyone who is a christian to avoid hell really a christian at it sounds like it to mit liebe in martyn martyn department of applied mathematics and theoretical the university of box ergo deus',
 'hi i am looking for a polygon fill routine to fill simple sided polygons can some one who has this routine in c help me in saving my thanx in advance',
 'a question for you can you give me the name of an organization or a philosophy or a political which has never had anything evil done in its missing a central teaching of christianity man is inherently we are saved through faith by knowing believing does not make us without not all who consider themselves are those who manage to head their own everyone who says to will enter the kingdom of but only he who does the will of my father who is in what historical documents do you do you think hannibal crossed the how do you how do you know for what historical documents have stood the scrutiny and the attempts to credit it as

In [22]:
# Fit a bag-of-words vectorizer on the cleaned articles and display the
# learned term -> column-index mapping. The fitted mapping is stored in
# `vocabulary_` (trailing underscore); `vocabulary` is only the constructor
# argument, which is None here because no vocabulary was passed in.
vectorizer = CountVectorizer()
vectorizer.fit(words)
vectorizer.vocabulary_

In [14]:
# Text-preprocessing dependencies (gensim, nltk).
# NOTE(review): this cell fails with ModuleNotFoundError when `nltk` is not
# installed in the kernel's environment; install it before running.
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): star import hides which names come from nltk.stem.porter;
# consider importing the needed names explicitly.
from nltk.stem.porter import *
# numpy is already imported as `np` in the first cell; this re-import is redundant.
import numpy as np
# Seed numpy's global random generator so later random draws are repeatable.
np.random.seed(2018)
import nltk
# Download the 'wordnet' resource via nltk's downloader.
nltk.download('wordnet')

ModuleNotFoundError: No module named 'nltk'