## Usage example

Here is an example of an LSA pipeline that:
1. Ingests a collection of texts
2. Makes the corresponding document-term matrix using stemming and removing stop words
3. Extracts 40 topics
4. Shows a table with the extracted topics
5. Shows a table with statistical thesaurus entries for selected words  

In [15]:
import random
from LatentSemanticAnalyzer.LatentSemanticAnalyzer import *
from LatentSemanticAnalyzer.DataLoaders import *
from OutlierIdentifiers import *
import snowballstemmer

In [2]:
# Load the abstracts and map each record's ID to its abstract text
dfAbstracts = load_abstracts_data_frame()
docs = {
    doc_id: abstract
    for doc_id, abstract in zip(dfAbstracts.ID, dfAbstracts.Abstract)
}
len(docs)

578

In [3]:
# English Snowball stemmer. Its stemWords() method is applied to the query
# words in the pipeline below, so the thesaurus is queried with stemmed terms.
stemmerObj = snowballstemmer.stemmer("english")

In [4]:
# Query words for the statistical thesaurus table (stemmed before lookup)
words = [
    "notebook",
    "computational",
    "function",
    "neural",
    "talk",
    "programming",
]

In [5]:
# Seed Python's built-in `random` module for reproducible results.
# NOTE(review): this does not seed NumPy's generator; whether the NNMF topic
# extraction draws from `random` or from NumPy is not visible here -- confirm.
random.seed(12)

In [6]:
# Keep only the entries whose abstract is an actual string
docs2 = dict(filter(lambda item: isinstance(item[1], str), docs.items()))
len(docs2)

567

In [7]:
# LSA pipeline, written one stage per statement; each stage's return value
# is the object the next stage is called on.
lsaObj = LatentSemanticAnalyzer()

# Document-term matrix with stop-word removal and stemming switched on
lsaObj = lsaObj.make_document_term_matrix(
    docs=docs2,
    stop_words=True,
    stemming_rules=True,
    min_length=3,
)

# Term weighting: IDF global weight, no local weight, cosine normalization
lsaObj = lsaObj.apply_term_weight_functions(
    global_weight_func="IDF",
    local_weight_func="None",
    normalizer_func="Cosine",
)

# Extract 40 topics with NNMF
lsaObj = lsaObj.extract_topics(
    number_of_topics=40,
    min_number_of_documents_per_term=10,
    method="NNMF",
)

# Echo the topics table (12 terms per topic, wide form)
lsaObj = lsaObj.echo_topics_interpretation(number_of_terms=12, wide_form=True)

# Echo the statistical thesaurus for the stemmed query words
lsaObj = lsaObj.echo_statistical_thesaurus(
    terms=stemmerObj.stemWords(words),
    wide_form=True,
    number_of_nearest_neighbors=12,
    method="cosine",
    echo_function=lambda x: print(x.to_string()),
)

                                            0           1           2          3         4           5           6            7         8          9         10         11
tpc.000.new-featur-graphic                 new      featur     graphic      cover   complex        look     version         plot      will    present    improv   function
tpc.001.scienc-data-life                scienc        data        life      engin    social     organiz      analyt         year   sophist   workflow     field       good
tpc.002.visual-look-complex             visual        look     complex   scientif      plot        main        best        creat      valu      world   display       make
tpc.003.analyt-inform-data              analyt      inform        data    dataset   patient     extract       queri        manag    health    analysi    compar      engag
tpc.004.rule-space-defin                  rule       space       defin     explor   cluster      design  particular      contain     order      c



---

## Find outliers in the topics interpretation data frame

In [41]:
# Long-form topics interpretation (number_of_terms=120), fetched as a data
# frame with echoing turned off, then pulled out of the pipeline object.
dfTopicsLongForm = (
    lsaObj
    .get_topics_interpretation(
        number_of_terms=120,
        as_data_frame=True,
        wide_form=False,
        echo=False,
    )
    .take_value()
)
dfTopicsLongForm

Unnamed: 0,Topic,Term,Score
0,tpc.000.new-featur-graphic,new,0.267317
1,tpc.000.new-featur-graphic,featur,0.233532
2,tpc.000.new-featur-graphic,graphic,0.151198
3,tpc.000.new-featur-graphic,cover,0.142437
4,tpc.000.new-featur-graphic,complex,0.137673
...,...,...,...
115,tpc.039.cours-grade-assess,context,0.014703
116,tpc.039.cours-grade-assess,physic,0.014659
117,tpc.039.cours-grade-assess,second,0.014506
118,tpc.039.cours-grade-assess,program,0.014430


In [42]:
# Per topic, keep only the rows whose Score is flagged as a top outlier, using
# thresholds derived from the Hampel identifier parameters.
# The columns (including the grouping column "Topic") are selected explicitly
# after groupby: this keeps "Topic" in the result and avoids the pandas
# deprecation warning about apply() operating on the grouping columns.
dfTopicsOfOutliersLongForm = (
    dfTopicsLongForm.groupby("Topic", group_keys=False)[list(dfTopicsLongForm.columns)]
    .apply(lambda g: g[outlier_identifier(g["Score"].tolist(), identifier = lambda v: top_outliers(hampel_identifier_parameters(v)))])
)

  .apply(lambda g: g[outlier_identifier(g["Score"].tolist(), identifier = lambda v: top_outliers(hampel_identifier_parameters(v)))])


In [43]:
# Optional: discard the current index and renumber the rows from 0
# (drop=True means the old index is not kept as a column).
dfTopicsOfOutliersLongForm = dfTopicsOfOutliersLongForm.reset_index(drop=True)

In [44]:
# Display the per-topic outlier rows
dfTopicsOfOutliersLongForm

Unnamed: 0,Topic,Term,Score
0,tpc.000.new-featur-graphic,new,0.267317
1,tpc.000.new-featur-graphic,featur,0.233532
2,tpc.000.new-featur-graphic,graphic,0.151198
3,tpc.000.new-featur-graphic,cover,0.142437
4,tpc.000.new-featur-graphic,complex,0.137673
...,...,...,...
1354,tpc.039.cours-grade-assess,classroom,0.083246
1355,tpc.039.cours-grade-assess,showcas,0.077567
1356,tpc.039.cours-grade-assess,mathematica,0.077406
1357,tpc.039.cours-grade-assess,abl,0.076793


In [45]:
# Number of outlier terms per topic.
# size() returns the per-group row counts directly, without routing every
# group through apply() (which also triggers the grouping-columns warning).
dfTopicsOfOutliersLongForm.groupby("Topic").size()

  dfTopicsOfOutliersLongForm.groupby("Topic").apply(lambda x: len(x))


Topic
tpc.000.new-featur-graphic             37
tpc.001.scienc-data-life               32
tpc.002.visual-look-complex            36
tpc.003.analyt-inform-data             35
tpc.004.rule-space-defin               30
tpc.005.model-system-engin             32
tpc.006.softwar-develop-cloud          26
tpc.007.optim-solver-problem           35
tpc.008.alpha-introduct-notebook       35
tpc.009.player-app-cdf                 28
tpc.010.imag-process-classif           31
tpc.011.link-librari-excel             25
tpc.012.student-school-materi          30
tpc.013.math-scienc-interpret          44
tpc.014.latest-gain-insight            39
tpc.015.financi-data-market            36
tpc.016.precis-numer-method            38
tpc.017.geograph-coordin-way           37
tpc.018.educ-classroom-problem         29
tpc.019.time-seri-date                 29
tpc.020.project-review-deploy          25
tpc.021.dynam-interfac-manipul         34
tpc.022.network-neural-train           37
tpc.023.geometr-step-region 