# Create Dummy Textual Dataset

In [2]:
# Toy corpus: 20 short English sentences mixing sports (football, cricket)
# and programming / data-science themes. Used as input for the topic model below.
documents = [
    "I love football and cricket. They are the most exciting sports.",
    "Messi is a great football player, and he has won many trophies.",
    "Python and Java are powerful programming languages used in software development.",
    "Data science is fun with Python. You can do machine learning, deep learning, and more.",
    "Cricket is a popular sport in Asia, especially in India, Pakistan, and Bangladesh.",
    "Football is more popular in Europe, with clubs like Barcelona and Manchester United.",
    "Artificial Intelligence and Machine Learning are changing the world.",
    "Programming in Python is simple and beginner-friendly.",
    "Cricket matches in the IPL are watched by millions of fans every year.",
    "Messi and Ronaldo are legendary footballers with amazing careers.",
    "Java is used in Android app development and enterprise-level systems.",
    "Many data scientists prefer Python over other languages due to its simplicity.",
    "T20 cricket is very popular because of its short and exciting format.",
    "Deep learning and neural networks are subfields of machine learning.",
    "The FIFA World Cup is the most famous football tournament in the world.",
    "Libraries like Pandas, NumPy, and Scikit-learn make data analysis easier.",
    "India vs Pakistan cricket matches are always intense and full of emotion.",
    "Barcelona is one of the greatest football clubs in the world.",
    "Using Python, one can build web apps, automate tasks, and analyze data.",
    "Computer programming can be both a career and a creative hobby."
]

# Echo the corpus as the cell output.
documents

['I love football and cricket. They are the most exciting sports.',
 'Messi is a great football player, and he has won many trophies.',
 'Python and Java are powerful programming languages used in software development.',
 'Data science is fun with Python. You can do machine learning, deep learning, and more.',
 'Cricket is a popular sport in Asia, especially in India, Pakistan, and Bangladesh.',
 'Football is more popular in Europe, with clubs like Barcelona and Manchester United.',
 'Artificial Intelligence and Machine Learning are changing the world.',
 'Programming in Python is simple and beginner-friendly.',
 'Cricket matches in the IPL are watched by millions of fans every year.',
 'Messi and Ronaldo are legendary footballers with amazing careers.',
 'Java is used in Android app development and enterprise-level systems.',
 'Many data scientists prefer Python over other languages due to its simplicity.',
 'T20 cricket is very popular because of its short and exciting format.',
 'De

# Preprocessing (cleaning Text)


In [None]:
# Preprocessing steps
# Lowercasing: convert all characters to lowercase
# Removing punctuation: remove punctuation marks
# Removing stopwords: remove common stopwords like 'and', 'the', etc.
# Tokenization: split text into individual words
# Stemming/Lemmatization: reduce words to their root form (optional)



In [17]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_text(text, stop_words=None):
    """Normalize a raw document into a space-separated string of content words.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, split on whitespace, and drop
    stop words.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : collection of str, optional
        Words to discard (compared against the lowercased tokens). When
        omitted, scikit-learn's ``ENGLISH_STOP_WORDS`` is used.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; an empty string when
        no token survives.
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS

    text = text.lower()

    # Punctuation is deleted, not replaced by a space, so hyphenated words
    # collapse into one token ("beginner-friendly" -> "beginnerfriendly").
    text = re.sub(r"[^\w\s]", '', text)

    # Tokenization: split on any run of whitespace
    tokens = text.split()

    # Stop-word removal
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)


# Clean every document in the corpus. The misspelled variable name is kept
# as-is because later cells reference it.
preprocessed_documnets = list(map(preprocess_text, documents))

In [18]:
preprocessed_documnets

['love football cricket exciting sports',
 'messi great football player won trophies',
 'python java powerful programming languages used software development',
 'data science fun python machine learning deep learning',
 'cricket popular sport asia especially india pakistan bangladesh',
 'football popular europe clubs like barcelona manchester united',
 'artificial intelligence machine learning changing world',
 'programming python simple beginnerfriendly',
 'cricket matches ipl watched millions fans year',
 'messi ronaldo legendary footballers amazing careers',
 'java used android app development enterpriselevel systems',
 'data scientists prefer python languages simplicity',
 't20 cricket popular short exciting format',
 'deep learning neural networks subfields machine learning',
 'fifa world cup famous football tournament world',
 'libraries like pandas numpy scikitlearn make data analysis easier',
 'india vs pakistan cricket matches intense emotion',
 'barcelona greatest football cl

# CountVectorizer (Text to Numeric)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder with scikit-learn's default settings.
vectorizer = CountVectorizer()

# Learn the vocabulary from the cleaned documents and encode them in one step.
x = vectorizer.fit_transform(preprocessed_documnets)

In [21]:
x.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

# apply LDA

In [23]:
from sklearn.decomposition import LatentDirichletAllocation
# Two-topic LDA model; random_state is pinned to a fixed seed.
lda = LatentDirichletAllocation(n_components=2 , random_state=0)
# Fit on the document-term counts produced by the vectorizer.
lda.fit(x)

# Display topics

In [25]:
lda.components_

array([[1.4963401 , 0.50348532, 1.49738385, 0.50289415, 0.50289415,
        1.49738385, 1.49645968, 0.50261262, 1.49738385, 0.50261262,
        2.49582873, 0.50723709, 1.49738385, 0.5041122 , 1.4963401 ,
        1.49645968, 2.49582873, 0.5041122 , 0.5041122 , 0.50649706,
        1.49692356, 3.54873916, 2.49698631, 0.50314717, 0.50348532,
        0.50285897, 0.50289415, 0.50261262, 1.49614865, 0.5120184 ,
        1.49692356, 0.50290202, 1.49692356, 5.00524504, 1.4963401 ,
        0.50354787, 1.49715153, 1.4963455 , 1.49573263, 0.5041122 ,
        0.50281058, 1.49645968, 0.50285897, 0.50290202, 0.50314717,
        1.4881754 , 5.49683498, 1.4963401 , 0.50348532, 1.49726625,
        0.52050722, 3.49676072, 0.50348532, 1.49614865, 0.5029594 ,
        2.49624205, 0.50290202, 1.49698179, 1.49698179, 0.50348532,
        0.50281058, 0.50348532, 1.4963455 , 1.46782851, 0.50323246,
        1.49518586, 0.50500821, 3.56418279, 1.4963401 , 1.49715153,
        1.49518586, 0.50348532, 0.50354787, 0.50

In [27]:
vectorizer.get_feature_names_out()

array(['amazing', 'analysis', 'analyze', 'android', 'app', 'apps',
       'artificial', 'asia', 'automate', 'bangladesh', 'barcelona',
       'beginnerfriendly', 'build', 'career', 'careers', 'changing',
       'clubs', 'computer', 'creative', 'cricket', 'cup', 'data', 'deep',
       'development', 'easier', 'emotion', 'enterpriselevel',
       'especially', 'europe', 'exciting', 'famous', 'fans', 'fifa',
       'football', 'footballers', 'format', 'fun', 'great', 'greatest',
       'hobby', 'india', 'intelligence', 'intense', 'ipl', 'java',
       'languages', 'learning', 'legendary', 'libraries', 'like', 'love',
       'machine', 'make', 'manchester', 'matches', 'messi', 'millions',
       'networks', 'neural', 'numpy', 'pakistan', 'pandas', 'player',
       'popular', 'powerful', 'prefer', 'programming', 'python',
       'ronaldo', 'science', 'scientists', 'scikitlearn', 'short',
       'simple', 'simplicity', 'software', 'sport', 'sports', 'subfields',
       'systems', 't20', 'tas

In [28]:
# Print each topic's index followed by its raw per-term weight vector.
for topic_no, weights in enumerate(lda.components_):
    print(topic_no, " ", weights)

0   [1.4963401  0.50348532 1.49738385 0.50289415 0.50289415 1.49738385
 1.49645968 0.50261262 1.49738385 0.50261262 2.49582873 0.50723709
 1.49738385 0.5041122  1.4963401  1.49645968 2.49582873 0.5041122
 0.5041122  0.50649706 1.49692356 3.54873916 2.49698631 0.50314717
 0.50348532 0.50285897 0.50289415 0.50261262 1.49614865 0.5120184
 1.49692356 0.50290202 1.49692356 5.00524504 1.4963401  0.50354787
 1.49715153 1.4963455  1.49573263 0.5041122  0.50281058 1.49645968
 0.50285897 0.50290202 0.50314717 1.4881754  5.49683498 1.4963401
 0.50348532 1.49726625 0.52050722 3.49676072 0.50348532 1.49614865
 0.5029594  2.49624205 0.50290202 1.49698179 1.49698179 0.50348532
 0.50281058 0.50348532 1.4963455  1.46782851 0.50323246 1.49518586
 0.50500821 3.56418279 1.4963401  1.49715153 1.49518586 0.50348532
 0.50354787 0.50723709 1.49518586 0.50323246 0.50261262 0.52050722
 1.49698179 0.50289415 0.50354787 1.49738385 1.49692356 1.4963455
 1.49614865 0.50314717 1.49738385 0.50285897 0.50290202 1.4973

In [34]:
# Print the highest-weighted vocabulary terms of every topic.
n_top_words = 10
# Fetch the vocabulary once instead of once per printed word.
feature_names = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    print(f'Topic {idx}')
    # argsort is ascending, so walk it backwards to get the largest weights first.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    print([feature_names[i] for i in top_indices])


Topic 0
['learning', 'football', 'world', 'python', 'data', 'machine', 'deep', 'messi', 'barcelona', 'clubs']
Topic 1
['cricket', 'programming', 'popular', 'india', 'pakistan', 'matches', 'development', 'java', 'used', 'exciting']


# Let's make the topics easier to read

In [40]:
import pandas as pd

def display_topics(model, feature_names, no_top_words):
    """Collect the top-weighted terms of each topic into a dictionary.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``, an iterable of
        per-topic weight arrays that support ``argsort()``.
    feature_names : sequence
        Vocabulary terms, indexable by the positions found in each weight array.
    no_top_words : int
        How many terms to keep per topic.

    Returns
    -------
    dict
        Maps ``"Topic 1"``, ``"Topic 2"``, ... (1-based) to the list of that
        topic's terms, ordered from highest to lowest weight.
    """
    def top_terms(weights):
        # Ascending argsort read backwards yields the heaviest terms first.
        ranked = weights.argsort()[:-no_top_words - 1:-1]
        return [feature_names[pos] for pos in ranked]

    return {
        f"Topic {number}": top_terms(weights)
        for number, weights in enumerate(model.components_, start=1)
    }


In [45]:
# Number of highest-weighted words to list per topic.
no_top_words = 5
# Build the {topic label: top words} dictionary from the fitted LDA model.
topics = display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)
topics

{'Topic 1': ['learning', 'football', 'world', 'python', 'data'],
 'Topic 2': ['cricket', 'programming', 'popular', 'india', 'pakistan']}

In [46]:
# Convert the topics dictionary to a DataFrame for better visualization
# (one column per topic, one row per rank).
topics_df = pd.DataFrame(topics)

In [47]:
topics_df

Unnamed: 0,Topic 1,Topic 2
0,learning,cricket
1,football,programming
2,world,popular
3,python,india
4,data,pakistan
