# 8. Topic Modeling

Topic modeling is a technique in natural language processing (NLP) that helps uncover the underlying themes or topics in a collection of documents. This can be incredibly useful for summarizing, organizing, and understanding large volumes of text.


In [11]:
# Prepare Your Environment

!pip install nltk scikit-learn gensim matplotlib pyLDAvis


Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [12]:
#  Load and Preprocess the Data
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import string

# Fetch the NLTK 'punkt' and 'stopwords' data packages
# (the stop-word list is read inside preprocess_text below).
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Toy corpus: each string is treated as one document
documents = [
    "Machine learning is fascinating.",
    "Natural language processing enables computers to understand human language.",
    "Topic modeling is a technique used to discover abstract topics.",
    # Add more documents
]

# Text normalisation pipeline: tokenize -> drop stop words -> stem
def preprocess_text(text):
    """Convert a raw string into a list of stemmed, stop-word-free tokens.

    The text is lower-cased and split into runs of word characters, tokens
    found in NLTK's English stop-word list are discarded, and every remaining
    token is reduced with the Porter stemmer.

    Args:
        text: The raw document string.

    Returns:
        A list of stemmed tokens, in the order they appeared in ``text``.
    """
    # Splitting on runs of word characters also strips punctuation
    lowered = text.lower()
    words = RegexpTokenizer(r'\w+').tokenize(lowered)

    # A set gives constant-time membership checks while filtering
    english_stops = set(stopwords.words('english'))
    porter = PorterStemmer()

    # Filter and stem in a single pass; stop words are never stemmed
    return [porter.stem(w) for w in words if w not in english_stops]

# Run every document through the preprocessing pipeline (one token list per document)
preprocessed_docs = list(map(preprocess_text, documents))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
#  Create a Document-Term Matrix

from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer works on strings, so glue each token list back together
# with single spaces before vectorizing.
processed_docs_text = list(map(' '.join, preprocessed_docs))

# Learn the vocabulary and build the document-term count matrix in one step.
# NOTE: with default settings CountVectorizer re-tokenizes the text and only
# keeps tokens of two or more characters.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(processed_docs_text)

# Vocabulary terms learned by the vectorizer (used later to label topic words)
feature_names = vectorizer.get_feature_names_out()


In [14]:
# Apply Topic Modeling

from sklearn.decomposition import LatentDirichletAllocation

# How many latent topics LDA should extract
num_topics = 3

# Build the model with a fixed seed (repeatable runs) and fit it on the
# document-term matrix; fit() returns the fitted estimator itself.
lda = LatentDirichletAllocation(n_components=num_topics, random_state=0).fit(X)

# Per-topic word weights: iterated row by row (one row per topic) below
topic_word_dist = lda.components_


In [18]:
# Examine the Topics

# Print the highest-weighted vocabulary terms of each topic
num_words = 10
for topic_idx, topic in enumerate(topic_word_dist):
    # argsort is ascending: reverse it, then keep the leading num_words positions
    top_words_idx = topic.argsort()[::-1][:num_words]
    top_words = [feature_names[pos] for pos in top_words_idx]
    print(f"Topic {topic_idx}: {', '.join(top_words)}")


Topic 0: languag, understand, process, natur, human, enabl, comput, fascin, machin, learn
Topic 1: topic, use, techniqu, abstract, model, discov, machin, learn, fascin, understand
Topic 2: machin, learn, fascin, understand, process, natur, human, enabl, comput, languag


  and should_run_async(code)
