In [8]:
import nltk
import gensim
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from gensim import corpora, models
from nltk import WordNetLemmatizer

In [None]:
#Preparing dataset

In [2]:
# Toy corpus: five short documents, listed in order a..e.
doc_set = [
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
]
# Keep the individual names bound as well, one per document.
doc_a, doc_b, doc_c, doc_d, doc_e = doc_set

In [None]:
#Text Preprocessing

In [3]:
# Preprocess every document in `doc_set` into a list of cleaned tokens:
# lowercase -> regex tokenise -> drop English stop words -> lemmatise.
tokenizer = RegexpTokenizer(r'\w+')  # tokens are runs of word characters
stop_words = stopwords.words('english')
# Set copy used only for membership tests, so each lookup is O(1) instead of a
# scan over the whole list; `stop_words` itself is left as the original list.
stop_word_set = set(stop_words)
lemmatizer = WordNetLemmatizer()
texts = []
for document in doc_set:
    tokens = tokenizer.tokenize(document.lower())
    # Filter and lemmatise in one pass; distinct names avoid reusing the
    # document loop variable for individual tokens.
    texts.append(
        [lemmatizer.lemmatize(token) for token in tokens if token not in stop_word_set]
    )
print(texts)

[['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother'], ['mother', 'spends', 'lot', 'time', 'driving', 'brother', 'around', 'baseball', 'practice'], ['health', 'expert', 'suggest', 'driving', 'may', 'cause', 'increased', 'tension', 'blood', 'pressure'], ['often', 'feel', 'pressure', 'perform', 'well', 'school', 'mother', 'never', 'seems', 'drive', 'brother', 'better'], ['health', 'professional', 'say', 'brocolli', 'good', 'health']]


In [4]:
#doc2bow 

In [None]:
#Vectorization
# `Dictionary` maps each distinct token in `texts` to an integer id; `doc2bow`
# then encodes a document as (token_id, count) pairs, e.g. (0, 2) means the
# token with id 0 occurs twice in that document.
dictionary = corpora.Dictionary(texts)
bow_corpus = list(map(dictionary.doc2bow, texts))
print(dictionary)
print(bow_corpus)

#Model building
# Two-topic LDA over the bag-of-words corpus, 20 training passes.
# `random_state` is pinned so that re-running the notebook reproduces the same
# topics and coherence scores; without a seed every run gives different output.
ldamodel_bow_corpus = gensim.models.ldamodel.LdaModel(
    bow_corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=42,
)
#corpus (iterable of list of (int, float), optional) – Stream of document vectors or sparse matrix of shape 
# (num_documents, num_terms). If you have a CSC in-memory matrix, you can convert it to a streamed corpus with the help of 
# gensim.matutils.Sparse2Corpus. If not given, the model is left untrained (presumably because you want to call update() manually)
# num_topics (int, optional) – The number of requested latent topics to be extracted from the training corpus.
# id2word ({dict of (int, str), gensim.corpora.dictionary.Dictionary}) – Mapping from word IDs to words. 
# It is used to determine the vocabulary size, as well as for debugging and topic printing.
# distributed (bool, optional) – Whether distributed computing should be used to accelerate training.
# chunksize (int, optional) – Number of documents to be used in each training chunk.
# passes (int, optional) – Number of passes through the corpus during training.
# update_every (int, optional) – Number of documents to be iterated through for each update. 
# Set to 0 for batch learning, > 1 for online iterative learning.
# alpha ({numpy.ndarray, str}, optional) – Can be set to a 1D array of length equal to the number of expected topics that
# expresses our a-priori belief for each topic's probability. Alternatively, default prior-selecting strategies can be employed
# by supplying a string:
# ’symmetric’: Default; uses a fixed symmetric prior per topic,
# ’asymmetric’: Uses a fixed normalized asymmetric prior of 1.0 / (topic_index + sqrt(num_topics)),
# ’auto’: Learns an asymmetric prior from the corpus (not available if distributed==True).
# eta ({float, np.array, str}, optional) –
# A-priori belief on word probability, this can be:
# scalar for a symmetric prior over topic/word probability,
# vector of length num_words to denote an asymmetric user defined probability for each word,
# matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
# the string ‘auto’ to learn the asymmetric prior from the data.
# decay (float, optional) – A number between (0.5, 1] to weight what percentage of the previous lambda value is 
# forgotten when each new document is examined. Corresponds to Kappa from Matthew D. Hoffman, David M. Blei, 
# Francis Bach: “Online Learning for Latent Dirichlet Allocation NIPS‘10”.
# offset (float, optional) – Hyper-parameter that controls how much we will slow down the first few iterations.
# Corresponds to Tau_0 from Matthew D. Hoffman, David M. Blei, Francis Bach: “Online Learning for Latent Dirichlet Allocation NIPS‘10”.
# eval_every (int, optional) – Log perplexity is estimated once every this many updates. Setting this to one slows down training by ~2x.
# iterations (int, optional) – Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.
# gamma_threshold (float, optional) – Minimum change in the value of the gamma parameters to continue iterating.
# minimum_probability (float, optional) – Topics with a probability lower than this threshold will be filtered out.
# random_state ({np.random.RandomState, int}, optional) – Either a randomState object or a seed to generate one. Useful for reproducibility.
# ns_conf (dict of (str, object), optional) – Keyword parameters propagated to gensim.utils.getNS() to get a Pyro4 name server.
# Only used if distributed is set to True.
# minimum_phi_value (float, optional) – if per_word_topics is True, this represents a lower bound on the term probabilities.
# per_word_topics (bool) – If True, the model also computes a list of topics,sorted in descending order of most likely topics 
# for each word, along with their phi values multiplied by the feature length (i.e. word count).
# callbacks (list of Callback) – Metric callbacks to log and visualize evaluation metrics of the model during training.
# dtype ({numpy.float16, numpy.float32, numpy.float64}, optional) – Data-type to use during calculations inside model. 
# All inputs are also converted.

# Print both topics of the bag-of-words model, four words per topic.
print(ldamodel_bow_corpus.print_topics(num_topics=2, num_words=4))

#Evaluating model
# Score the same model with two coherence measures in turn: c_v, then u_mass.
# After the loop, `coherence_model_lda` / `coherence_lda` hold the u_mass result.
for coherence_measure in ('c_v', 'u_mass'):
    coherence_model_lda = models.CoherenceModel(
        model=ldamodel_bow_corpus,
        texts=texts,
        dictionary=dictionary,
        coherence=coherence_measure,
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

In [6]:
#Tf idf

In [9]:
#Vectorization
# Rebuild the dictionary and bag-of-words corpus from `texts`, fit a TF-IDF
# model on that corpus, and apply it to obtain the TF-IDF weighted corpus.
dictionary = corpora.Dictionary(texts)
bow_corpus = list(map(dictionary.doc2bow, texts))
tfidf = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]
print(dictionary)
# One line per document: its (token_id, weight) pairs.
for doc in tfidf_corpus:
    print(doc)


#Model Building
# Two-topic LDA over the TF-IDF weighted corpus, 20 training passes.
# `random_state` is pinned so that re-running the notebook reproduces the same
# topics and coherence scores; without a seed every run gives different output.
ldamodel_tfidf_corpus = gensim.models.ldamodel.LdaModel(
    tfidf_corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=42,
)
# Print both topics, four words per topic.
print(ldamodel_tfidf_corpus.print_topics(num_topics=2, num_words=4))

#Evaluating model
# Score the TF-IDF model with two coherence measures in turn: c_v, then u_mass.
# After the loop, `coherence_model_lda` / `coherence_lda` hold the u_mass result.
for coherence_measure in ('c_v', 'u_mass'):
    coherence_model_lda = models.CoherenceModel(
        model=ldamodel_tfidf_corpus,
        texts=texts,
        dictionary=dictionary,
        coherence=coherence_measure,
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

[(0, 0.40784451109112935), (1, 0.11368521994734913), (2, 0.7163669735975946), (3, 0.40784451109112935), (4, 0.3581834867987973), (5, 0.11368521994734913)]
[(1, 0.12424759593709131), (5, 0.12424759593709131), (6, 0.3914619434234833), (7, 0.3914619434234833), (8, 0.2228684610131362), (9, 0.3914619434234833), (10, 0.3914619434234833), (11, 0.3914619434234833), (12, 0.3914619434234833)]
[(8, 0.2016345105176491), (13, 0.35416512946544426), (14, 0.35416512946544426), (15, 0.35416512946544426), (16, 0.2016345105176491), (17, 0.35416512946544426), (18, 0.35416512946544426), (19, 0.2016345105176491), (20, 0.35416512946544426), (21, 0.35416512946544426)]
[(1, 0.10283764444679584), (5, 0.10283764444679584), (19, 0.18446447498008656), (22, 0.32400646345397865), (23, 0.32400646345397865), (24, 0.32400646345397865), (25, 0.32400646345397865), (26, 0.32400646345397865), (27, 0.32400646345397865), (28, 0.32400646345397865), (29, 0.32400646345397865), (30, 0.32400646345397865)]
[(0, 0.2866473576676298)