<a href="https://colab.research.google.com/github/agawronski/word-embeddings/blob/main/LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import heapq
import json
import os
import pickle
from collections import defaultdict

import gensim
import lda
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim import corpora, matutils, models, similarities
from gensim.models import CoherenceModel, Phrases
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import (
    STOPWORDS,
    preprocess_string,
    remove_stopwords,
    stem_text,
    strip_multiple_whitespaces,
    strip_numeric,
    strip_short,
    strip_tags,
)
from gensim.utils import simple_preprocess
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.cluster import KMeans

In [None]:
# Reference: https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

In [None]:
# NLTK's English stop-word list is read by preprocessing() via stopwords.words("english");
# download (or refresh) the corpus so that lookup works.
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FLORENCIA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# WordNet data backs the WordNetLemmatizer used in preprocessing().
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\FLORENCIA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 1. Loading data

In [None]:
from google.colab import drive

# Mount Google Drive and switch to the project data folder (Colab only).
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/GLG project/data')

# One JSON object per line. 'fullText' is presumably a list of text fragments
# (it is joined with spaces below) - verify against the data schema.
# The file is streamed instead of buffered with readlines(), blank lines are
# skipped so a trailing newline cannot crash json.loads, and the encoding is
# pinned so the result does not depend on the platform default.
with open('healthcare_industry_from_1900_2021.jsonl', encoding='utf-8') as f:
    corpus = [' '.join(json.loads(line)['fullText']) for line in f if line.strip()]

df = pd.DataFrame(corpus, columns=['article'])
df.head()

Unnamed: 0,article
0,Avian influenza is an acute viral respiratory ...
1,Managing what Cannot be Managed On the Possibi...
2,"References Aaron HJ, “How Not to Reform Medica..."
3,Porcine epidemic diarrhea (PED) was first reco...
4,APPENDIX B Federal Geospatial Data Sources Ide...


### 2. Data Cleaning

Remove digits, transform to lowercase, remove stopwords

We find bigrams in the documents. Bigrams are sets of two adjacent words. Using bigrams we can get phrases like “machine_learning” in our output (spaces are replaced with underscores); without bigrams we would only get “machine” and “learning”.

Note that in the code below, we find bigrams and then add them to the original data, because we would like to keep the words “machine” and “learning” as well as the bigram “machine_learning”.

In [None]:
def preprocessing(corpus):
    """Tokenize, clean and lemmatize documents, then append detected phrases.

    Each document is lower-cased, stripped of repeated whitespace and digits,
    filtered against both the gensim and the NLTK English stop-word lists, and
    lemmatized with WordNet. Bigrams and trigrams found by gensim ``Phrases``
    are appended to the document's unigrams, so "machine", "learning" and
    "machine_learning" are all kept.

    Parameters
    ----------
    corpus : iterable of str
        Raw document texts.

    Returns
    -------
    list of list of str
        One token list per document: unigrams, then bigrams, then trigrams.
    """
    # Imported locally so the function does not depend on whatever the
    # notebook-level name `stopwords` happens to be bound to when it runs.
    from nltk.corpus import stopwords as nltk_stopwords

    custom_filters = [lambda x: x.lower(),
                      strip_multiple_whitespaces,
                      strip_numeric,  # Remove digits from s using RE_NUMERIC.
                      remove_stopwords]
    stop_words = set(nltk_stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    base_docs = []
    for doc in corpus:
        tokens = preprocess_string(doc, custom_filters)
        base_docs.append([lemmatizer.lemmatize(token)
                          for token in tokens if token not in stop_words])

    # Train the bigram model on the clean unigrams, and the trigram model on
    # the bigram-merged stream, so a trigram is a bigram token joined to a
    # neighbour. Training on documents that already have phrases appended
    # would distort the counts.
    bigram = Phrases(base_docs, min_count=2)
    bigram_docs = [bigram[doc] for doc in base_docs]
    trigram = Phrases(bigram_docs, min_count=2)

    tokenized_docs = []
    for doc, bigram_doc in zip(base_docs, bigram_docs):
        # Tokens containing '_' are phrases produced by the Phrases model.
        bigram_tokens = [token for token in bigram_doc if '_' in token]
        already_added = set(bigram_tokens)
        # Only add phrases newly formed by the trigram pass; bigrams that pass
        # through unchanged were already added above.
        trigram_tokens = [token for token in trigram[bigram_doc]
                          if '_' in token and token not in already_added]
        tokenized_docs.append(doc + bigram_tokens + trigram_tokens)
    return tokenized_docs

In [None]:
# Run the cleaning pipeline over every article; the result is one token list per document.
tokenized_docs=preprocessing(corpus)

### 3. Exploratory Analysis
We first plot the frequency distribution of the 30 most common tokens, and then make a word cloud using the wordcloud package to get a visual representation of the most common words. 

In [None]:
# Tally how often every token occurs across the whole corpus
from collections import defaultdict
frequency = defaultdict(int)
for tok in (t for doc in tokenized_docs for t in doc):
    frequency[tok] += 1

# Drop tokens that occur only once in the corpus
tokenized_docs = [
    [tok for tok in doc if frequency[tok] > 1]
    for doc in tokenized_docs
]

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
# Frequency distribution over the remaining (lower-cased) tokens
fdist = FreqDist()
for text in tokenized_docs:
    for token in text:
        fdist[token.lower()] += 1


# Plot the 30 most frequent tokens (non-cumulative counts)
import matplotlib.pyplot as plt
fdist.plot(30, cumulative=False)

plt.show()

In [None]:
from wordcloud import WordCloud, STOPWORDS as WORDCLOUD_STOPWORDS

# Kept under its own name: binding this set to `stopwords` would replace the
# nltk.corpus.stopwords object that preprocessing() calls .words() on, so
# re-running preprocessing after this cell would fail.
wordcloud_stopwords = set(WORDCLOUD_STOPWORDS)

def show_wordcloud(data, title=None):
    """Draw a word cloud from a token -> frequency mapping.

    Parameters
    ----------
    data : mapping of str to number
        Token frequencies, e.g. an nltk FreqDist.
    title : str, optional
        Figure title; no title is drawn when omitted or empty.
    """
    # NOTE(review): the wordcloud documentation says `stopwords` is ignored by
    # generate_from_frequencies - confirm whether this argument has any effect.
    wordcloud = WordCloud(
        background_color='white',
        stopwords=wordcloud_stopwords,
        max_words=100,
        max_font_size=40, min_font_size=6,
        scale=3, relative_scaling=1,
        random_state=1,  # fixed seed
    ).generate_from_frequencies(data)

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

show_wordcloud(fdist, title=None)

### 4. Prepare data for LDA Analysis

In [None]:
# Create Dictionary: built from the tokenized documents; used below by doc2bow
# to turn each document into a bag-of-words vector.
id2word = corpora.Dictionary(tokenized_docs)
print(id2word)

We remove rare words and common words based on their document frequency. 
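Below we remove words that appear in more than 80% of the documents. Because only `no_above` is passed, gensim's defaults for the other `filter_extremes` thresholds also apply: words that appear in fewer than 5 documents are dropped (`no_below=5`) and at most 100,000 tokens are kept (`keep_n=100000`). Consider trying to remove words only based on their frequency, or maybe combining that with this approach.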

In [None]:
# Filter out words that occur in more than 80% of the documents.
# NOTE(review): only no_above is passed, so filter_extremes' own defaults for
# no_below and keep_n still apply - confirm those are the intended thresholds.
id2word.filter_extremes(no_above=0.8)

In [None]:
# Convert each tokenized document to its bag-of-words vector using the dictionary.
corpus_2=[id2word.doc2bow(text) for text in tokenized_docs]
# Alias for the tokenized documents; used for the document count printed below.
texts = tokenized_docs

In [None]:
# Report the vocabulary size and the number of documents.
print(f'Number of unique tokens: {len(id2word):d}')
print(f'Number of documents: {len(texts):d}')

Number of unique tokens: 100000
Number of documents: 1500


### 5. Build LDA model

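We’ll build a baseline model with 10 topics. Apart from the number of topics we only set `chunksize=100`, `passes=10`, a fixed `random_state` and `per_word_topics=True`; every other parameter (including the `alpha` and `eta` priors) is left at its default.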

In [None]:
# Number of topics for the baseline model.
num_topics = 10
# Train the baseline LDA model on the bag-of-words corpus. The seed is fixed
# via random_state; alpha/eta are not passed here (they are tuned later).
lda_model = gensim.models.LdaMulticore(corpus=corpus_2,
                                       id2word=id2word,
                                       num_topics=num_topics, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [None]:
from pprint import pprint

# Print the top keywords (with their weights) for the topics of the baseline model.
pprint(lda_model.print_topics())
# Apply the trained model to the bag-of-words corpus.
doc_lda = lda_model[corpus_2]

[(0,
  '0.011*"woman" + 0.003*"women’s" + 0.003*"book" + 0.003*"like" + '
  '0.003*"new_york" + 0.003*"feminist" + 0.003*"lesbian" + 0.003*"black" + '
  '0.002*"life" + 0.002*"american"'),
 (1,
  '0.008*"medical" + 0.007*"patient" + 0.005*"care" + 0.004*"public_health" + '
  '0.004*"mental_health" + 0.004*"disease" + 0.004*"virus" + 0.004*"drug" + '
  '0.004*"treatment" + 0.003*"clinical"'),
 (2,
  '0.005*"global" + 0.004*"data" + 0.004*"technology" + 0.003*"company" + '
  '0.003*"investment" + 0.003*"trade" + 0.003*"market" + 0.003*"sector" + '
  '0.003*"business" + 0.003*"european"'),
 (3,
  '0.010*"united_state" + 0.008*"security" + 0.006*"united" + 0.006*"military" '
  '+ 0.005*"u.s." + 0.004*"russian" + 0.004*"china" + 0.004*"nuclear" + '
  '0.004*"strategic" + 0.004*"foreign"'),
 (4,
  '0.005*"local" + 0.004*"rom" + 0.004*"programme" + 0.004*"community" + '
  '0.004*"care" + 0.004*"public_health" + 0.004*"education" + 0.004*"hospital" '
  '+ 0.003*"ministry" + 0.003*"()."'),
 (5,

### 5. Analyzing LDA model results

Probabilistic topic models, such as LDA, are popular tools for text analysis, providing both a predictive and latent topic representation of the corpus. 
However, there is a longstanding assumption that the latent space discovered by these models is generally meaningful and useful, and that evaluating such assumptions is challenging due to its unsupervised training process.
There is no gold-standard list of topics to compare against for every corpus.

Let’s take a look at roughly what approaches are commonly used for the evaluation:

 **Eye Balling Models**
: Top N words
: Topics / Documents

 **Intrinsic Evaluation Metrics**
: Capturing model semantics
: Topics interpretability

 **Extrinsic Evaluation Metrics/Evaluation at task**
: Is model good at performing predefined tasks, such as classification

Let’s visualize the topics for interpretability. 
To do so, we’ll use a popular visualization package, pyLDAvis, which is designed to help you interactively understand and interpret
individual topics, and to better understand the relationships between topics.


You can manually select each topic to view its top most frequent and/or “relevant” terms, using different values of the λ parameter. 
This can help when you’re trying to assign a human interpretable name or “meaning” to each topic.
Exploring the Intertopic Distance Plot can help you learn about how topics relate to each other, including potential higher-level structure between groups of topics.


https://pyldavis.readthedocs.io/en/latest/modules/API.html

In [None]:
### Visualizing the topics
import pyLDAvis
# The gensim adapter is named `gensim_models` in newer pyLDAvis releases and
# `gensim` in older ones; accept either. The former `import pyLDAvis.graphlab`
# is dropped: nothing here uses it, and the graphlab package it depends on is
# not imported anywhere in this notebook.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus_2, id2word)
# Last expression of the cell so the interactive visualization is rendered.
vis

**Topic Coherence** measures score a single topic by measuring the degree of semantic similarity between high scoring words in the topic. These measurements help distinguish between topics that are semantically interpretable topics and topics that are artifacts of statistical inference.

In [None]:
# Compute Coherence Score (c_v) of the baseline model against the tokenized documents.
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.571362409524663


### 6. Hyperparameter tuning

Now that we have the baseline coherence score for the default LDA model, let’s perform a series of sensitivity tests to help determine the following model hyperparameters:

1. Number of Topics (K)
2. Dirichlet hyperparameter alpha: Document-Topic Density
3. Dirichlet hyperparameter beta: Word-Topic Density

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train one LDA model and return its c_v coherence.

    Parameters
    ----------
    corpus : iterable of bag-of-words documents
        Training corpus for the model.
    dictionary : gensim Dictionary
        Token/id mapping used both for training and for the coherence model.
    k : int
        Number of topics.
    a : float or str
        Document-topic prior, passed to LdaMulticore as ``alpha``.
    b : float or str
        Topic-word prior, passed to LdaMulticore as ``eta``.
    texts : list of list of str, optional
        Tokenized documents used for the coherence computation. Defaults to
        the notebook-level ``tokenized_docs``.

    Returns
    -------
    float
        The c_v coherence of the trained model.
    """
    if texts is None:
        # Fall back to the notebook's tokenized documents (the name the
        # original code referenced, data_lemmatized, is never defined).
        texts = tokenized_docs

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    # Use the dictionary that was passed in, not the global id2word, so the
    # coherence model always matches the model that was just trained.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [None]:
import numpy as np
import tqdm

# Set to False to skip the grid search; it can take a long time to run.
RUN_GRID_SEARCH = True

grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 30
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets: the first 75% of the bag-of-words corpus, and all of it.
# ClippedCorpus needs the bag-of-words corpus (corpus_2, not the raw article
# strings in `corpus`) and an integer document count.
num_of_docs = len(corpus_2)
corpus_sets = [gensim.utils.ClippedCorpus(corpus_2, int(num_of_docs * 0.75)),
               corpus_2]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

if RUN_GRID_SEARCH:
    # Derive the progress-bar total from the grid so it stays correct when
    # any of the ranges above change.
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)
    pbar = tqdm.tqdm(total=total_runs)

    # iterate through validation corpuses
    for set_title, corpus_set in zip(corpus_title, corpus_sets):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(set_title)
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)

                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()

### 7. Final Model

In [None]:
# Build the final LDA model: same settings as the baseline run, but with 30 topics.
import gensim
from gensim import models
lda_model_v2 = gensim.models.LdaMulticore(corpus=corpus_2,
                                       id2word=id2word,
                                       num_topics=30, # number of topics
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [None]:
# Compute Coherence Score (c_v) of the final 30-topic model.
from gensim.models import CoherenceModel

# Note: these names are reused from the baseline coherence cell and are overwritten here.
coherence_model_lda = CoherenceModel(model=lda_model_v2, texts=tokenized_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.5759584637009776


In [None]:
from gensim.models.coherencemodel import CoherenceModel
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Note: this rebinds the name of the grid-search helper defined earlier in
    the notebook, which takes different arguments.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : Smallest number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train a fresh model for each topic count; reusing one pre-trained
        # model would give the same coherence for every point of the sweep.
        model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# `corpus` is a required argument of compute_coherence_values; pass the bag-of-words corpus.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus_2, texts=tokenized_docs, start=2, limit=30, step=6)


In [None]:

# Show graph
import matplotlib.pyplot as plt
# These must match the start/limit/step used when coherence_values was computed.
limit = 30; start = 2; step = 6
x = range(start, limit, step)
# Label the line itself: passing the bare string ("coherence_values") to
# legend() is treated as a sequence of characters and labels the line "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()
