#####  Topic Modeling 

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the corpus into a DataFrame. Because names= is supplied, the first row
# of the CSV is read as data (not as a header) and the column is labelled
# 'text'. Presumably the file holds one document per row -- verify against
# the data file.
# NOTE(review): latin-1 accepts any byte sequence, so a wrongly encoded file
# would load without an error; confirm this is the file's real encoding.
documents_df = pd.read_csv('sample_news1.csv',names=['text'],encoding="latin-1" )

In [None]:
# Let pandas show up to 200 characters per cell so the document text is not
# cut short when DataFrames are displayed.
pd.set_option("display.max_colwidth",200)

In [None]:
documents_df

In [None]:
documents_df.shape

In [None]:
# Pull the 'text' column out of the DataFrame as a plain Python list.
documents = documents_df['text'].tolist()

In [None]:
documents[0:3]

In [None]:
from nltk.corpus import stopwords
# English stop-word list from NLTK.
# NOTE: this needs the NLTK 'stopwords' corpus to be available locally
# (e.g. via nltk.download('stopwords')); otherwise the call raises LookupError.
stoplist = stopwords.words('english')  

stoplist

In [None]:
documents

In [None]:
# Tokenise every document: lowercase it, split on whitespace and drop
# stop words.
# The stop words are copied into a set first because `word not in stoplist`
# on the list scans the whole list for every token, while set membership is
# a constant-time lookup. `stoplist` itself is left unchanged.
stop_words = set(stoplist)
texts = [[word for word in record.lower().split() if word not in stop_words]
         for record in documents]

texts

In [None]:
# remove junk words using regex
import re

# Keep only the tokens that begin with at least three ASCII letters.
# The pattern is applied with match(), i.e. anchored at the start of the
# token only, so whatever follows those letters does not affect the result.
_starts_with_letters = re.compile('[a-zA-Z][a-zA-Z]{2,}').match
texts = [list(filter(_starts_with_letters, tokens)) for tokens in texts]

texts

In [None]:
documents[0:3]

####  scikit-learn based Bag-of-Words (BOW) and LSI

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer settings:
#   min_df=2        ignore terms that occur in fewer than 2 documents
#   max_df=.95      ignore terms that occur in more than 95% of the documents
#   stop_words      drop scikit-learn's built-in English stop words
#   lowercase=True  lowercase the text before tokenising
#   token_pattern   a token is a run of at least three ASCII letters
vectorizer = CountVectorizer(min_df=2, max_df=.95, stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z][a-zA-Z]{2,}')

# Learn the vocabulary and build the document-term count matrix in one step.
# Note that the raw `documents` strings are vectorised here, not the
# hand-tokenised `texts` list.
x_bow = vectorizer.fit_transform(documents)
 

In [None]:
# Show the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the terms as an array.
vectorizer.get_feature_names_out()

In [None]:
# Densify the sparse count matrix into a DataFrame with one row per document
# and one column per vocabulary term, so the counts can be inspected by name.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
bow_tf_df = pd.DataFrame(x_bow.toarray(), columns=vectorizer.get_feature_names_out())
print(bow_tf_df.shape)
bow_tf_df

In [None]:
from sklearn.decomposition import TruncatedSVD
# LSI = truncated SVD of the document-term matrix, reduced to 2 components.
# TruncatedSVD's default solver is randomised, so the seed is fixed to keep
# the components (and everything printed from them) the same on every run.
lsi_model = TruncatedSVD(n_components=2, random_state=0)
# One row per document, one column per LSI component.
lsi_transf = lsi_model.fit_transform(x_bow)
print(lsi_transf.shape)

In [None]:
lsi_transf

In [None]:
lsi_model.components_  

In [None]:
def print_topics(model, vectorizer, top_n=5):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Args:
        model: Fitted model exposing ``components_``, an array with one row
            of term weights per topic.
        vectorizer: Fitted vectorizer whose feature names line up with the
            columns of ``model.components_``.
        top_n: Number of terms to print per topic, in descending weight order.
    """
    # Fetch the vocabulary once instead of rebuilding it for every printed
    # term. get_feature_names() was removed in scikit-learn 1.2, so prefer
    # get_feature_names_out() and fall back only on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort() is ascending; the reversed slice takes the last top_n
        # indices, i.e. the largest weights first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])
        
# Show the strongest terms (top_n defaults to 5) for each LSI component.
print("LSI Model:")
print_topics(lsi_model, vectorizer)

##### LDA Model Building (Latent Dirichlet Allocation)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Fit a 2-topic LDA model on the bag-of-words counts.
# LDA starts from a random initialisation, so without a fixed seed the topics
# (and which one is "Topic 0") can differ between runs; random_state pins them.
lda_model = LatentDirichletAllocation(n_components=2, random_state=0)
# One row per document, one column per topic.
lda_transf = lda_model.fit_transform(x_bow)
print(lda_transf.shape)

In [None]:
def print_topics(model, vectorizer, top_n=5):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Args:
        model: Fitted model exposing ``components_``, an array with one row
            of term weights per topic.
        vectorizer: Fitted vectorizer whose feature names line up with the
            columns of ``model.components_``.
        top_n: Number of terms to print per topic, in descending weight order.
    """
    # Fetch the vocabulary once instead of rebuilding it for every printed
    # term. get_feature_names() was removed in scikit-learn 1.2, so prefer
    # get_feature_names_out() and fall back only on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort() is ascending; the reversed slice takes the last top_n
        # indices, i.e. the largest weights first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])
        
# Show the strongest terms (top_n defaults to 5) for each LDA topic.
print("LDA Model:")
print_topics(lda_model, vectorizer)

##### Transform new documents into the LDA and LSI topic spaces

In [None]:
# Two headlines (a list, despite the singular name) to run through the
# fitted vectorizer and topic model.
new_doc = ["Data Scientist is the hottest profession of 2019 according to job listing data",
           "PH volleyball sport chief confident of national team’s medal chances in 2019 SEA Games"]

In [None]:
# Encode the new headlines with the already-fitted vectorizer: transform()
# (not fit_transform()) keeps the existing vocabulary, so the columns line up
# with the matrix the topic models were trained on.
new_doc_bow = vectorizer.transform(new_doc)

In [None]:
# Topic weights of the new headlines under the fitted LDA model: one row per
# headline, one column per topic.
# NOTE(review): presumably each row is a normalised topic distribution --
# confirm against the installed scikit-learn version.
lda_model.transform(new_doc_bow) 