In [1]:
documents = [
    "Artificial intelligence and machine learning are transforming industries.",
    "Machine learning algorithms can analyze massive amounts of data quickly.",
    "Deep learning allows machines to mimic human brain functions.",
    "Natural language processing enables computers to understand human speech.",
    "Data science combines domain expertise, programming, and statistics.",
    "Big data analytics helps companies make better business decisions.",
    "Neural networks are a foundation of modern AI applications.",
    "Computer vision allows machines to interpret and understand images."
]


In [None]:
# 📚 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# 📦 Download stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

# 🔥 Step 1: Preprocessing (simple and robust)
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = text.lower().split()
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

processed_docs = [preprocess(doc) for doc in documents]

# 🔥 Step 2: Create Document-Term Matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(processed_docs)

# 🔥 Step 3: Apply LDA
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X)

# 🔥 Step 4: Display Topics
def display_topics(model, feature_names, no_top_words=5):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx + 1}:")
        print(", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

print("\n=== LDA Topics ===")
display_topics(lda, vectorizer.get_feature_names_out())




=== LDA Topics ===

Topic 1:
data, big, business, helps, analytics

Topic 2:
learning, data, machine, massive, algorithms

Topic 3:
machines, allows, learning, human, understand


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Topic 1:
Top Words:
data, analytics, big, companies, helps

📖 Interpretation:
This topic is about Big Data and Analytics.

Words like "big", "data", "analytics", "companies", "helps" suggest:

Companies using big data and analytics

How analytics helps businesses make better decisions

Theme: Business Analytics / Big Data for Decision Support

Topic 2:
Top Words:
learning, data, machine, massive, amounts

📖 Interpretation:
This topic focuses on Machine Learning processing large data.

Words like "learning", "machine", "massive", "amounts" suggest:

Machine Learning algorithms analyzing huge datasets

Handling massive amounts of data through ML techniques

Theme: Machine Learning and Big Data Processing

Topic 3:
Top Words:
allows, machines, learning, human, understand

📖 Interpretation:
This topic talks about Machine Learning + Human-like Intelligence.

Words like "machines", "learning", "human", "understand" suggest:

Machines learning to understand humans

NLP, Computer Vision — areas where machines mimic human behavior

Theme: AI Applications — Understanding Human Behavior

Visualization of Topics

pip install pyLDAvis

In [4]:
# 📚 Imports
import pyLDAvis
from pyLDAvis import prepare

# 🔥 Step 5: pyLDAvis Visualization (after LDA fit)

# Prepare the required inputs
doc_topic_distr = lda.transform(X)
topic_term_distr = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]
doc_lengths = X.sum(axis=1).A1
vocab = vectorizer.get_feature_names_out()
term_frequency = np.asarray(X.sum(axis=0)).flatten()

# Create pyLDAvis Panel
panel = prepare(
    topic_term_dists=topic_term_distr,
    doc_topic_dists=doc_topic_distr,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency
)

# Display inside Jupyter Notebook (if using Jupyter)
pyLDAvis.enable_notebook()
pyLDAvis.display(panel)

# Or if running outside Notebook (save to HTML and open manually)
pyLDAvis.save_html(panel, 'lda_topics_visualization.html')


LSA

In [5]:
# 📚 Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


# 🔥 Step 1: Preprocessing (simple lowercase split, consistent with previous)
processed_docs = [doc.lower() for doc in documents]

# 🔥 Step 2: Create TF-IDF Matrix
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(processed_docs)

# 🔥 Step 3: Apply LSA using TruncatedSVD
lsa = TruncatedSVD(n_components=3, random_state=42)  # 3 topics
lsa.fit(X_tfidf)

# 🔥 Step 4: Display Topics
def display_lsa_topics(model, feature_names, no_top_words=5):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx + 1}:")
        print(", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

print("\n=== LSA Topics ===")
display_lsa_topics(lsa, vectorizer.get_feature_names_out())



=== LSA Topics ===

Topic 1:
learning, allows, machines, human, machine

Topic 2:
machine, data, learning, algorithms, quickly

Topic 3:
data, combines, science, domain, programming


Topic 1:
Top Words:
learning, allows, machines, human, machine

📖 Interpretation:
This topic is about Artificial Intelligence mimicking human capabilities.

Words like "machines", "human", "allows", and "learning" suggest:

Enabling machines to learn like humans.

Deep Learning, Computer Vision, Natural Language Processing concepts.

✅ Theme:
Machines learning to understand humans (AI/Deep Learning)

Topic 2:
Top Words:
machine, data, learning, algorithms, massive

📖 Interpretation:
This topic talks about Machine Learning applied to Big Data.

Words like "machine", "data", "massive", "algorithms" suggest:

ML models analyzing large-scale datasets.

Topics like Scalability, Big Data ML, Data Mining.

✅ Theme:
Machine Learning for Big Data Analytics

In [None]:
Topic 3:
Top Words:
data, science, programming, domain, expertise

📖 Interpretation:
This topic is focused on Data Science as a discipline.

Words like "science", "programming", "domain", "expertise" suggest:

Importance of Domain Knowledge + Coding Skills in data science.

✅ Theme:
Data Science: Programming, Domain Expertise, and Application