In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [3]:
from sklearn.datasets import fetch_20newsgroups
# Load the 20 Newsgroups corpus. Shuffling uses a fixed seed so the document
# order is reproducible; headers, footers and quoted replies are removed from
# each post before it is returned.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

In [4]:
# Keep a direct handle on the post texts; they become the 'document' column
# of the working DataFrame.
documents = dataset.data

In [5]:
# Number of newsgroup categories in the dataset.
len(dataset.target_names)

20

In [6]:
# Show the category names themselves.
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [7]:
# Total number of documents loaded.
len(documents)

11314

So, there are 11314 documents belonging to 20 classes (newsgroups).

### Data Preprocessing

In [8]:
# One row per post, with the raw text in a single 'document' column.
news_df = pd.DataFrame(documents, columns=['document'])

Replace every character that is not a letter (or `#`) with a space

In [9]:
# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, in which case nothing would be replaced.
news_df['clean_doc'] = news_df['document'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

Remove short words (three characters or fewer)

In [10]:
# Keep only words longer than three characters; splitting on whitespace and
# re-joining also collapses runs of spaces left by the previous step.
news_df['clean_doc'] = news_df['clean_doc'].apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)

Convert everything to lowercase

In [11]:
# Lowercase each word so that differently-cased spellings map to one token.
news_df['clean_doc'] = news_df['clean_doc'].apply(
    lambda text: ' '.join(map(str.lower, text.split()))
)

Remove stopwords

In [12]:
from nltk.corpus import stopwords

In [13]:
# The NLTK stopword corpus is not bundled with the package; fetch it once if
# it is missing so the notebook runs on a fresh environment instead of
# failing with LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# English stopword list used to filter the tokenised documents.
stop_words = stopwords.words('english')

Tokenisation

In [14]:
# Split each cleaned document on whitespace into a list of tokens.
tokenised_doc = news_df['clean_doc'].apply(str.split)

In [15]:
# Drop stopwords from every token list. The list of stopwords is turned into
# a set first so each membership test is a hash lookup rather than a linear
# scan; the filtered result is the same.
stop_word_set = set(stop_words)
tokenised_doc = tokenised_doc.apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)

In [16]:
# Will hold one space-joined string per document after stopword removal.
detokenised_doc = []

In [17]:
# Re-join each token list into a single space-separated string.
# The list is rebuilt from scratch rather than appended to, so re-running
# this cell cannot duplicate entries, and iterating over the Series values
# avoids depending on the index labels being 0..n-1.
detokenised_doc = [' '.join(tokens) for tokens in tokenised_doc]

In [18]:
# Overwrite clean_doc with the stopword-free, re-joined text.
news_df['clean_doc'] = detokenised_doc

### Document Term Matrix

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# TF-IDF settings:
#   - stop_words='english': apply scikit-learn's built-in English stopword list
#   - max_df=0.5: ignore terms that occur in more than half of the documents
#   - max_features=1000: cap the vocabulary at 1000 terms
#   - smooth_idf=True: use smoothed IDF weights
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    max_features=1000,
    smooth_idf=True,
)

In [21]:
# Learn the vocabulary from the cleaned text and build the TF-IDF
# document-term matrix in one step.
X = vectorizer.fit_transform(news_df['clean_doc'])

### Topic Modeling

In [22]:
from sklearn.decomposition import TruncatedSVD

SVD will be used to represent documents and terms as vectors

In [23]:
# Truncated SVD with 20 components, one per newsgroup category in the
# dataset. The randomized solver is seeded so the decomposition is
# reproducible.
svdModel = TruncatedSVD(
    n_components=20,
    algorithm='randomized',
    n_iter=100,
    random_state=1,
)

In [24]:
# Fit the truncated SVD on the TF-IDF matrix.
svdModel.fit(X)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=100,
       random_state=1, tol=0.0)

In [25]:
# Sanity check: one row in components_ per requested component.
len(svdModel.components_)

20

### View the most important words in the topics

In [26]:
# Vocabulary terms learned by the vectorizer; they are paired positionally
# with each component's weights when the topics are listed.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

In [27]:
# For every SVD component ("topic"), print the seven terms with the largest
# weights, highest first.
for topic_idx, weights in enumerate(svdModel.components_):
    ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
    print("Topic "+str(topic_idx)+": ")
    for term, _weight in ranked[:7]:
        print(term)

Topic 0: 
like
know
people
think
good
time
thanks
Topic 1: 
thanks
windows
card
drive
mail
file
advance
Topic 2: 
game
team
year
games
season
players
good
Topic 3: 
drive
scsi
disk
hard
card
drives
problem
Topic 4: 
windows
file
window
files
program
using
problem
Topic 5: 
government
chip
mail
space
information
encryption
data
Topic 6: 
like
bike
know
chip
sounds
looks
look
Topic 7: 
card
sale
video
offer
monitor
price
jesus
Topic 8: 
know
card
chip
video
government
people
clipper
Topic 9: 
good
know
time
bike
jesus
problem
work
Topic 10: 
think
chip
good
thanks
clipper
need
encryption
Topic 11: 
thanks
right
problem
good
bike
time
window
Topic 12: 
good
people
windows
know
file
sale
files
Topic 13: 
space
think
know
nasa
problem
year
israel
Topic 14: 
space
good
card
people
time
nasa
thanks
Topic 15: 
people
problem
window
time
game
want
bike
Topic 16: 
time
bike
right
windows
file
need
really
Topic 17: 
time
problem
file
think
israel
long
mail
Topic 18: 
file
need
card
files
problem


### Topic Visualization using Uniform Manifold Approximation and Projection (UMAP)

In [None]:
import umap

In [None]:
# Project every document onto the SVD components. Note that fit_transform
# re-fits the model on X before transforming; the model's random_state is
# fixed, so the run is seeded.
X_topics = svdModel.fit_transform(X)

# Reduce the topic-space vectors to two dimensions for plotting.
embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics)

# Explicit figure/axes so the plot can be titled and labelled and reads
# correctly on its own.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=dataset.target,   # colour each point by its newsgroup label
    s=10,               # marker size
    edgecolor='none',
)
ax.set_title('UMAP projection of LSA document vectors')
ax.set_xlabel('UMAP dimension 1')
ax.set_ylabel('UMAP dimension 2')
plt.show()