# PLSA
Probabilistic Latent Semantic Analysis

In [10]:
import sys
import matplotlib.pyplot as plt
from plsa import Corpus, Pipeline, Visualize
from plsa.pipeline import DEFAULT_PIPELINE
from plsa.algorithms import PLSA

In [11]:
# Folder that holds the dataset files used by this notebook.
directory = 'datasets'
# CSV that is read first; only its "content" column is used further down.
csv_file = 'datasets/20news.csv'

In [12]:
import pandas as pd

# Read the CSV named by `csv_file`, keep only its "content" column and write
# that column (together with the default row index) to a new CSV file.
raw_news = pd.read_csv(csv_file)
content_only = raw_news["content"]
content_only.to_csv("datasets/news_20_cleaned.csv")

In [13]:
# Rebind `csv_file` to the content-only CSV written by the previous cell.
# NOTE(review): this reuses the name set earlier for the raw file, so the
# previous cell must not be re-run after this one without resetting it.
csv_file = 'datasets/news_20_cleaned.csv'

In [14]:
# Build the preprocessing pipeline from the steps in DEFAULT_PIPELINE.
pipeline = Pipeline(*DEFAULT_PIPELINE)
# Bare expression so the notebook displays the pipeline's steps.
pipeline

Pipeline:
0: remove_non_ascii
1: to_lower
2: remove_numbers
3: tag_remover
4: punctuation_remover
5: tokenize
6: LemmatizeWords
7: RemoveStopwords
8: short_word_remover

In [15]:
# Build the corpus from the CSV in `csv_file`, passing the preprocessing pipeline.
corpus = Corpus.from_csv(csv_file, pipeline)
# Bare expression so the notebook displays the corpus summary.
corpus

Corpus:
Number of documents: 971
Number of words:     9865

In [16]:
# Number of topics to extract; also used later to name the topic columns.
n_topics = 7
plsa = PLSA(corpus, n_topics, True)
# NOTE(review): the positional `True` above is presumably the tf-idf switch of PLSA — confirm against the plsa API.
result = plsa.fit()


array([0.18460655, 0.15286448, 0.15064762, 0.14235353, 0.13374865,
       0.12660758, 0.1091716 ])

<b>Find the best PLSA model of many</b> <br>
As with any iterative algorithm, the probabilities in PLSA need to be (randomly) initialized prior to the first iteration step. Therefore, calling the fit method of two different PLSA instances operating on the same corpus with the same number of topics can lead to (slightly) different results, corresponding to different local minima of the Kullback-Leibler divergence between the true document-word probability and its approximate factorization. To mitigate this effect, perform multiple runs and pick the best model.

In [18]:
result = plsa.best_of(5) # Do 5 different runs and keep the best of the 5
# Bare expression so the notebook displays the topic array of the chosen result.
result.topic

array([0.18472799, 0.15843785, 0.1475734 , 0.13900792, 0.13825103,
       0.11663721, 0.1153646 ])

In [19]:
import pickle

# NOTE(review): the recorded output of this cell is `KeyError: '__getstate__'`,
# i.e. `result` could not be pickled when the notebook was last run.
# Serialize in memory first: opening the target with "wb" creates/truncates it
# immediately, so dumping straight into the file would leave an empty or
# partial .pkl behind whenever serialization fails. The exception still
# propagates; only the stray file is avoided.
serialized_result = pickle.dumps(result)
with open("trained_models/plsa_trained_news20_7.pkl","wb") as f:
    f.write(serialized_result)

KeyError: '__getstate__'

In [20]:
new_doc = 'Hello! This is the federal humpty dumpty agency for state funding.'

# predict() yields three values: the relative topic weights of the document,
# the count of words not seen before, and those words themselves.
prediction = result.predict(new_doc)
topic_components, number_of_new_words, new_words = prediction

# Print each value next to its label, one line per value.
for label, value in (
    ('Relative topic importance in new document:', topic_components),
    ('Number of previously unseen words in new document:', number_of_new_words),
    ('Previously unseen words in new document:', new_words),
):
    print(label, value)

Relative topic importance in new document: [0.32763389 0.070425   0.1115168  0.35108152 0.09038121 0.00567461
 0.04328697]
Number of previously unseen words in new document: 3
Previously unseen words in new document: ('humpty', 'dumpty', 'funding')


# Getting the topic distribution for News20

In [21]:
# Load the "content" column of the content-only CSV as a Series of documents.
news_20=pd.read_csv("datasets/news_20_cleaned.csv")["content"]

In [85]:
# Build one row per document: the document's index in `news_20` followed by
# the relative topic weights, i.e. the first element of `result.predict(...)`.
col_names = ["index"] + [f"topic_{i}" for i in range(n_topics)]

rows = []            # rows for documents that were predicted successfully
row_labels = []      # position of each kept document; gaps mark failures
missing_idx = []     # indices of documents whose prediction failed
missing_errors = {}  # idx -> repr of the exception, so failures can be inspected

for position, (idx, news) in enumerate(news_20.items()):
    try:
        row = [idx, *result.predict(news)[0]]
        if len(row) != len(col_names):
            raise ValueError(
                f"expected {len(col_names)} values, got {len(row)}"
            )
    except Exception as err:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still stop the loop; the reason is kept for review.
        missing_idx.append(idx)
        missing_errors[idx] = repr(err)
        continue
    rows.append(row)
    row_labels.append(position)

# Create the frame once instead of enlarging it row by row with `.loc`.
# Row labels are the document positions, as before. The "index" column now
# holds the original indices as integers when they are integers, rather
# than the floats that row-wise assignment produced.
plsa_distribution = pd.DataFrame(rows, columns=col_names, index=row_labels)
plsa_distribution

Unnamed: 0,index,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6
0,0.0,0.167606,0.125942,0.293242,0.245064,0.072665,0.041921,0.053560
1,1.0,0.277264,0.055091,0.089403,0.127531,0.113333,0.314278,0.023100
2,2.0,0.534913,0.073154,0.097343,0.133144,0.066949,0.051206,0.043290
3,3.0,0.494809,0.043122,0.114357,0.204409,0.034848,0.053847,0.054608
4,4.0,0.136714,0.550428,0.058364,0.073511,0.106030,0.055934,0.019018
...,...,...,...,...,...,...,...,...
18841,18841.0,0.260232,0.147339,0.055843,0.222390,0.171843,0.094712,0.047640
18842,18842.0,0.421209,0.056763,0.113550,0.210476,0.119379,0.022824,0.055800
18843,18843.0,0.213742,0.294760,0.117598,0.139233,0.126057,0.064816,0.043793
18844,18844.0,0.422876,0.059320,0.097315,0.309534,0.027330,0.038007,0.045618


In [86]:
# Number of documents for which the prediction step raised and was skipped.
len(missing_idx)

700

In [87]:
# Inspect the 13th row by position; its label can differ from 12 because
# skipped documents leave gaps in the row labels.
plsa_distribution.iloc[12]

index      13.000000
topic_0     0.240907
topic_1     0.069340
topic_2     0.048070
topic_3     0.069163
topic_4     0.505741
topic_5     0.033697
topic_6     0.033083
Name: 13, dtype: float64

In [25]:
# Write the per-document topic distribution to CSV.
# NOTE(review): this cell's execution count (In [25]) is lower than that of the
# cell which builds `plsa_distribution` (In [85]); re-run it so the saved file
# matches the frame shown above.
plsa_distribution.to_csv("plsa_outputs/news_20/7_topics.csv")