 LDA, GSDMM, and NMF.

# Non-Negative Matrix Factorization (NMF)

In [None]:
!pip install gensim

import nltk
from nltk.stem import *

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# Prepare list of stopwords for removal
stopwords = set(nltk.corpus.stopwords.words('english'))

import pandas as pd
import numpy as np
# NMF implementation in sklearn
from sklearn.decomposition import NMF 

# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer


from scipy import linalg # For linear algebra operations
import matplotlib.pyplot as plt
%matplotlib inline
np.set_printoptions(suppress=True)

In [22]:
# Load the book, split it into sentence-like documents and lemmatize them.
# Produces:
#   docs     - one list of lemmatized tokens per document
#   docs_sen - the same documents, each joined back into a single string
file = 'The Little Prince.txt'

# Only the read needs the file handle; everything else works on the string.
with open(file, "r", encoding="utf-8") as file1:
    FileContent = file1.read().lower() # lowercase

docs = []
docs_sen = []
last = 0
# A document ends at every '.' that is followed by a space. Stopping one
# character early keeps the look-ahead FileContent[idx+1] inside the string
# even when the text ends with '.'.
for idx in range(len(FileContent) - 1):
    if FileContent[idx] == '.' and FileContent[idx+1] == ' ':
        docs.append(FileContent[last:idx+1])
        docs_sen.append(FileContent[last:idx+1])
        last = idx+1

# Keep whatever follows the last '. ' (otherwise the end of the text would be
# dropped), unless it is only whitespace.
tail = FileContent[last:]
if tail.strip():
    docs.append(tail)
    docs_sen.append(tail)

# Lemmatization
# Universal POS tag -> `pos` argument for the lemmatizer:
# 'a' adjective, 'v' verb, 'n' noun. Tags mapped to ' ' (and tags missing from
# the table) are lemmatized without an explicit `pos`.
dict_tag2cat = {'ADJ':'a', 'NOUN':'n', 'ADP':' ', 'VERB':'v', 'ADV':' ',
                'CONJ':' ', 'INTJ':' ', 'PRON':'n'}
for sen in range(len(docs)):
    words = nltk.tokenize.word_tokenize(docs[sen]) # get list of words
    after_lem_pos_tag = nltk.pos_tag(words, tagset='universal')
    after_lem = []
    for word, tag in after_lem_pos_tag:
        pos = dict_tag2cat.get(tag, ' ')
        if pos != ' ':
            after_lem.append(lemmatizer.lemmatize(word, pos=pos))
        else:
            after_lem.append(lemmatizer.lemmatize(word))
    docs_sen[sen] = ' '.join(str(e) for e in after_lem)
    docs[sen] = after_lem

In [32]:
# One row per lemmatized sentence, held in a single 'text' column.
df = pd.DataFrame({'text': docs_sen})
df.head()

Unnamed: 0,text
0,the little prince chapter 1 i be six year old ...
1,the picture show a boa constrictor in the act ...
2,this be what it look like : it say in the book...
3,"after this , it be unable to move and sleep th..."
4,"then , with care and a coloured pencil , i suc..."


In [39]:
# TF-IDF vectorizer over the lemmatized sentences
vect = TfidfVectorizer(
    stop_words='english',  # remove English stop words
    ngram_range=(1, 3),    # unigrams, bi-grams and tri-grams
    max_features=5000,
    max_df=0.85,
    min_df=10,
)

# Learn the vocabulary and build the TF-IDF matrix in one step
X = vect.fit_transform(df['text'])

# NMF model with 15 components, seeded so the run is repeatable
model = NMF(random_state=42, init='nndsvd', n_components=15)

# Fit the model (left as the last expression so the cell shows the estimator)
model.fit(X)

In [40]:
# Vocabulary terms learned by the vectorizer
terms = list(vect.get_feature_names_out())

# Components Matrix, with one column per vocabulary term
components_df = pd.DataFrame(model.components_, columns=terms)

# Features Matrix
nmf_features = model.transform(X)

In [63]:
# Top 20 words of importance in each of the topics
N_TOP_WORDS = 20

# Each row of components_df is one topic; print the terms with the largest
# values in that row, numbering topics from 1.
for topic in range(components_df.shape[0]):
    topic_df = components_df.iloc[topic]
    top_terms = topic_df.nlargest(N_TOP_WORDS).keys()
    print(f'For topic {topic+1} the words with the highest value are:')
    print(top_terms)
    print('\n')

For topic 1 the words with the highest value are:
Index(['little', 'prince', 'little prince', 'say little prince', 'say little',
       'reply', 'people', 'ask', 'chapter', 'oh', 'feel', 'day', 'hello',
       'ask little', 'ask little prince', 'saw', 'baobab', 'right', 'really',
       'question'],
      dtype='object')


For topic 2 the words with the highest value are:
Index(['like', 'felt', 'just', 'look', 'rise', 'fox', 'sand', 'thousand',
       'grown ups', 'ups', 'important', 'grown', 'lamplighter', 'away',
       'thing', 'leave', 'begin', 'number', 'old', 'order'],
      dtype='object')


For topic 3 the words with the highest value are:
Index(['make', 'try', 'feel', 'happy', 'beautiful', 'order', 'saw', 'old',
       'continue', 'love', 'draw', 'year', 'sand', 'repeat', 'day', 'just',
       'question', 'snake', 'right', 'mean'],
      dtype='object')


For topic 4 the words with the highest value are:
Index(['flower', 'eat', 'love', 'volcano', 'important', 'sheep', 'chapter

# BERTopic

In [None]:
!pip install bertopic
from bertopic import BERTopic
from bertopic import BERTopic

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from scipy import linalg
import gensim
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline
# np.set_printoptions(suppress=True)
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # downloading wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

In [21]:
# Load the book, split it into sentence-like documents, lemmatize them and fit
# a BERTopic model that uses 1- to 3-gram topic representations.
# Produces: docs (lemmatized document strings), topic_model, topics, probs.
file = 'The Little Prince.txt'

# Only the read needs the file handle; everything else works on the string.
with open(file, "r", encoding="utf-8") as file1:
    FileContent = file1.read().lower() # lowercase

docs = []
last = 0
# A document ends at every '.' that is followed by a space. Stopping one
# character early keeps the look-ahead FileContent[idx+1] inside the string
# even when the text ends with '.'.
for idx in range(len(FileContent) - 1):
    if FileContent[idx] == '.' and FileContent[idx+1] == ' ':
        docs.append(FileContent[last:idx+1])
        last = idx+1

# Keep whatever follows the last '. ' (otherwise the end of the text would be
# dropped), unless it is only whitespace.
tail = FileContent[last:]
if tail.strip():
    docs.append(tail)

# Lemmatization
# Universal POS tag -> `pos` argument for the lemmatizer:
# 'a' adjective, 'v' verb, 'n' noun. Tags mapped to ' ' (and tags missing from
# the table) are lemmatized without an explicit `pos`.
dict_tag2cat = {'ADJ':'a', 'NOUN':'n', 'ADP':' ', 'VERB':'v', 'ADV':' ',
                'CONJ':' ', 'INTJ':' ', 'PRON':'n'}
for sen in range(len(docs)):
    words = nltk.tokenize.word_tokenize(docs[sen]) # get list of words
    after_lem_pos_tag = nltk.pos_tag(words, tagset='universal')
    after_lem = []
    for word, tag in after_lem_pos_tag:
        pos = dict_tag2cat.get(tag, ' ')
        if pos != ' ':
            after_lem.append(lemmatizer.lemmatize(word, pos=pos))
        else:
            after_lem.append(lemmatizer.lemmatize(word))
    docs[sen] = ' '.join(str(e) for e in after_lem)

# use BERTopic to extract topic
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model)

# Fit and Transform
topics, probs = topic_model.fit_transform(docs)

# Optional: reduce the number of topics after training the model.
# NOTE(review): `model` below presumably should be `topic_model` - confirm
# before enabling.
# new_topics, new_probs = model.reduce_topics(docs, topics, probs, nr_topics=5)

In [22]:
# Summary table of the fitted topics, with Topic, Count and Name columns.
# NOTE(review): the trailing "13" presumably records the number of topics
# found in one particular run - confirm or drop.
freq = topic_model.get_topic_info() # 13
freq

Unnamed: 0,Topic,Count,Name
0,-1,198,-1_little_say_prince_little prince
1,0,91,0_little prince_prince_little_say
2,1,74,1_come_sad_clear_leave
3,2,50,2_friend_hand_know_afraid
4,3,29,3_sheep_flower_eat_draw sheep
5,4,29,4_star_businessman_make_million
6,5,28,5_baobab_seed_little_planet
7,6,28,6_king_yawn_say king_command
8,7,28,7_flower_caterpillar_flower flower_butterfly
9,8,26,8_boa_draw_constrictor_boa constrictor


In [46]:
# Row 0 of `freq` is topic -1, so row 1 holds topic 0; show the terms and
# scores that describe that topic.
topic_model.get_topic(freq.iloc[1]["Topic"])

[('little prince', 0.04867027178980886),
 ('prince', 0.04843643643184922),
 ('little', 0.04688853691206062),
 ('say', 0.032556400232163246),
 ('say little prince', 0.021027499682709633),
 ('say little', 0.02089637387639448),
 ('snake', 0.015012810862130656),
 ('hello', 0.013640135665695738),
 ('chapter', 0.012470770742387338),
 ('flower', 0.01228362201264217)]

In [40]:
# Topic Word Scores in Bar Chart (uses the method's default arguments)
topic_model.visualize_barchart()

In [44]:
# Intertopic distance map for the fitted model
topic_model.visualize_topics()

In [45]:
# Topic similarity matrix drawn as a heatmap
topic_model.visualize_heatmap()

In [47]:
# Hierarchical Topics
# Build the topic hierarchy from the same documents the model was fitted on,
# then pass it to the hierarchy plot.
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 14/14 [00:00<00:00, 183.35it/s]


In [51]:
# Load the book, split it into sentence-like documents, lemmatize them and fit
# a BERTopic model whose topic representations use single words only.
# Produces: docs (lemmatized document strings), topic_model, topics, probs.
file = 'The Little Prince.txt'

# Only the read needs the file handle; everything else works on the string.
with open(file, "r", encoding="utf-8") as file1:
    FileContent = file1.read().lower() # lowercase

docs = []
last = 0
# A document ends at every '.' that is followed by a space. Stopping one
# character early keeps the look-ahead FileContent[idx+1] inside the string
# even when the text ends with '.'.
for idx in range(len(FileContent) - 1):
    if FileContent[idx] == '.' and FileContent[idx+1] == ' ':
        docs.append(FileContent[last:idx+1])
        last = idx+1

# Keep whatever follows the last '. ' (otherwise the end of the text would be
# dropped), unless it is only whitespace.
tail = FileContent[last:]
if tail.strip():
    docs.append(tail)

# Lemmatization
# Universal POS tag -> `pos` argument for the lemmatizer:
# 'a' adjective, 'v' verb, 'n' noun. Tags mapped to ' ' (and tags missing from
# the table) are lemmatized without an explicit `pos`.
dict_tag2cat = {'ADJ':'a', 'NOUN':'n', 'ADP':' ', 'VERB':'v', 'ADV':' ',
                'CONJ':' ', 'INTJ':' ', 'PRON':'n'}
for sen in range(len(docs)):
    words = nltk.tokenize.word_tokenize(docs[sen]) # get list of words
    after_lem_pos_tag = nltk.pos_tag(words, tagset='universal')
    after_lem = []
    for word, tag in after_lem_pos_tag:
        pos = dict_tag2cat.get(tag, ' ')
        if pos != ' ':
            after_lem.append(lemmatizer.lemmatize(word, pos=pos))
        else:
            after_lem.append(lemmatizer.lemmatize(word))
    docs[sen] = ' '.join(str(e) for e in after_lem)

# use BERTopic to extract topic
# No ngram_range is passed here, unlike the earlier run with ngram_range=(1, 3).
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model)

# Fit and Transform
topics, probs = topic_model.fit_transform(docs)

# Optional: reduce the number of topics after training the model.
# NOTE(review): `model` below presumably should be `topic_model` - confirm
# before enabling.
# new_topics, new_probs = model.reduce_topics(docs, topics, probs, nr_topics=5)

In [53]:
# `topic_model` was refit in the previous cell, so rebuild the topic table
# instead of reading topic ids from the table of the earlier model.
freq = topic_model.get_topic_info()

# Show the terms and scores of the topic in row 2 of the table.
# NOTE(review): in the table displayed earlier row 0 was topic -1, which would
# make row 2 topic 1 - confirm for this run.
topic_model.get_topic(freq.iloc[2]["Topic"])

[('sad', 0.0712661029061958),
 ('bite', 0.06870627874853207),
 ('leave', 0.06649180834431676),
 ('fault', 0.0653174463041281),
 ('hard', 0.0653174463041281),
 ('come', 0.06518646025462484),
 ('time', 0.06461725627330991),
 ('don', 0.05857179314582704),
 ('skin', 0.052854272764079786),
 ('comfort', 0.052854272764079786)]