# Latent Semantic Indexing and Latent Dirichlet Allocation

### Imports

In [None]:
from gensim.models import LsiModel, LdaModel, CoherenceModel, TfidfModel
from gensim.corpora import Dictionary, MmCorpus

from gensim.matutils import kullback_leibler
from gensim import similarities

from collections import defaultdict
from tqdm import tqdm
import numpy as np
import json

import pytrec

import os 
import pickle as pkl
import time

import read_ap
import download_ap
from utils import evaluate

#### Set up directories to store models, corpora and dictionary

In [None]:
# Root directory under which models, corpora and the dictionary are stored.
save_dir = "LSI_LDA"

# Create each sub-directory on its own (parents included) so that a
# pre-existing but incomplete save_dir still ends up with both folders.
for subdir in ("models", "corpora"):
    os.makedirs(os.path.join(save_dir, subdir), exist_ok=True)

## Load ... (Skip if you don't want to load anything)

### Load corpora

You can load your saved dictionary and corpora here. Set the paths below and run the cells for whatever you want to load.

In [None]:
# Default locations of the saved dictionary and corpora, relative to save_dir
# (feel free to change). These match the paths the construction cells write to.
path_dictionary = save_dir + '/corpora/dictionary.dict'
path_corpus_bow = save_dir + '/corpora/corpus_bow.mm'
path_corpus_tfidf = save_dir + '/corpora/corpus_tfidf.mm'

In [None]:
# Restore the gensim Dictionary stored at path_dictionary.
dictionary = Dictionary.load(path_dictionary)

In [None]:
# Open the serialized bag-of-words corpus stored at path_corpus_bow.
corpus_bow_mm = MmCorpus(path_corpus_bow)

In [None]:
# Open the serialized tf-idf corpus stored at path_corpus_tfidf.
corpus_tfidf_mm = MmCorpus(path_corpus_tfidf)

### Load models

You can load your saved models here. Set the paths below and run the cells for the models you want to load. <br>
*Note that for the LSI tf-idf you also need to load the corresponding tf-idf model. This is needed during evaluation time.*

In [None]:
# Default locations of the saved model files, relative to save_dir
# (feel free to change). These match the paths the training cells save to.
path_lsi_bow = save_dir + '/models/LSI_BOW_model.mm'
path_lsi_tfidf = save_dir + '/models/LSI_tfidf_model.mm'
path_lda_bow = save_dir + '/models/LDA_bow_model.mm'
path_tfidf_model = save_dir + '/models/tfidf_model.mm'

In [None]:
# Restore the LSI model trained on the bag-of-words corpus.
LSI_BOW_model = LsiModel.load(path_lsi_bow)

In [None]:
# Restore the LSI model trained on the tf-idf corpus.
# The topic-display and evaluation cells refer to `LSI_tfidf_model`;
# `LSI_TFIDF_model` is kept as an alias for the same object.
LSI_tfidf_model = LSI_TFIDF_model = LsiModel.load(path_lsi_tfidf)

In [None]:
# Restore the saved LDA model.
LDA_BOW_model = LdaModel.load(path_lda_bow)

In [None]:
# Restore the tf-idf weighting model; it is passed to the LSI tf-idf
# evaluation so queries get the same weighting as the corpus.
tfidf_model = TfidfModel.load(path_tfidf_model)

## Preprocess data (Run this!)

In [None]:
# Fetch the dataset via the project helper, then load the preprocessed documents.
download_ap.download_dataset()
# docs_by_id is used below as a mapping: its values are fed to Dictionary /
# doc2bow and its keys serve as document ids.
# Presumably doc id -> list of tokens; confirm against read_ap.
docs_by_id = read_ap.get_processed_docs()

## Construct dictionary (Skip if already loaded)

In [None]:
# Build the vocabulary from every preprocessed document.
dictionary = Dictionary(documents=docs_by_id.values())

# Prune the vocabulary: drop tokens appearing in fewer than 25 documents
# or in more than half of all documents.
dictionary.filter_extremes(no_above=0.5, no_below=25)

# Persist the pruned dictionary so later sessions can load it.
dictionary_file = save_dir + '/corpora/dictionary.dict'
dictionary.save(dictionary_file)

vocabulary_size = len(dictionary)
print('#Unique tokens in corpus: %i' % vocabulary_size)

## Construct corpora (Skip if already loaded)

For LSI BoW and LDA BoW only run the first cell. For LSI tf-idf run all cells

In [None]:
# Turn every document into its bag-of-words vector.
corpus_bow = list(map(dictionary.doc2bow, docs_by_id.values()))

# Write the corpus to disk and reopen it from there, so later cells work
# with the on-disk corpus instead of the in-memory list.
bow_file = save_dir + '/corpora/corpus_bow.mm'
MmCorpus.serialize(bow_file, corpus_bow)
corpus_bow_mm = MmCorpus(bow_file)

bow_doc_count = len(corpus_bow)
print('#Documents in BOW corpus: %i' % bow_doc_count)

In [None]:
# Fit the tf-idf weighting on the on-disk bag-of-words corpus ...
tfidf_model = TfidfModel(corpus=corpus_bow_mm)
# ... and apply it to that same corpus.
corpus_tfidf = tfidf_model[corpus_bow_mm]

# Write the weighted corpus to disk and reopen it from there, so later
# cells work with the on-disk version.
tfidf_file = save_dir + '/corpora/corpus_tfidf.mm'
MmCorpus.serialize(tfidf_file, corpus_tfidf)
corpus_tfidf_mm = MmCorpus(tfidf_file)

In [None]:
# Persist the tf-idf model; evaluation of the LSI tf-idf model needs it
# to weight the queries.
tfidf_model.save(save_dir + '/models/tfidf_model.mm')

## Train models

Set the number of topics you want to train on 

In [None]:
# params
# Number of topics passed to the LSI and LDA training cells below.
num_topics = 500

### LSI BoW (Skip if already loaded)

In [None]:
# Train LSI on the bag-of-words corpus and report the wall-clock training time.
tic = time.perf_counter()

LSI_BOW_model = LsiModel(corpus_bow_mm, id2word=dictionary, num_topics=num_topics)

toc = time.perf_counter() 
print(f"Trained LSI BOW in {toc - tic:0.4f} seconds")

In [None]:
# Persist the trained model to the default LSI BoW model path.
LSI_BOW_model.save(save_dir + '/models/LSI_BOW_model.mm')

### LSI tf-idf (Skip if already loaded)

In [None]:
# Train LSI on the tf-idf corpus and report the wall-clock training time.
tic = time.perf_counter()

LSI_tfidf_model = LsiModel(corpus_tfidf_mm, id2word=dictionary, num_topics=num_topics)

toc = time.perf_counter() 
print(f"Trained LSI tf-idf in {toc - tic:0.4f} seconds")

In [None]:
# Persist the trained model to the default LSI tf-idf model path.
LSI_tfidf_model.save(save_dir + '/models/LSI_tfidf_model.mm')

### LDA BoW (Skip if already loaded)

In [None]:
# Train LDA and report the wall-clock training time.
tic = time.perf_counter()

# This is the bag-of-words variant: it is trained on the BoW corpus, the
# same corpus the LDA evaluation cell projects into topic space.
LDA_BOW_model = LdaModel(corpus=corpus_bow_mm, id2word=dictionary, num_topics=num_topics)

toc = time.perf_counter()
print(f"Trained LDA BoW in {toc - tic:0.4f} seconds")

In [None]:
# Persist the trained model to the default LDA BoW model path.
LDA_BOW_model.save(save_dir + '/models/LDA_bow_model.mm')

## Show topics 

Only run the cells that apply to the model(s) you are evaluating

In [None]:
# LSI BoW: show 5 topics with 20 words each.
LSI_BOW_model.print_topics(num_topics=5, num_words=20)

In [None]:
# LSI tf-idf: show 5 topics with 20 words each.
LSI_tfidf_model.print_topics(num_topics=5, num_words=20)

In [None]:
# LDA BoW: show 5 topics with 20 words each.
LDA_BOW_model.print_topics(num_topics=5, num_words=20)

## Retrieval and Evaluation

In [None]:
def run_evaluation(model, corpus, doc_ids, tfidf):
    """Project a corpus with ``model``, rank all queries and report MAP / nDCG.

    The corpus is transformed by the model, a dense similarity index is built
    over it, and ``evaluate_queries`` produces per-query metrics. The mean
    ``map`` and ``ndcg`` are printed twice: over all queries, and over the
    queries whose integer id lies in 76..100 (the ``*_val`` figures).

    Args:
        model: Trained topic model, applied as ``model[corpus]``.
        corpus: Corpus to project into the model's space.
        doc_ids: Document identifiers, forwarded to ``evaluate_queries``.
        tfidf: Optional tf-idf model forwarded to ``evaluate_queries``;
            ``None`` when queries should stay plain bag-of-words.

    Returns:
        The per-query metrics mapping returned by ``evaluate_queries``.

    Note:
        Reads the module-level ``dictionary``.
    """
    projected = model[corpus]
    sim_index = similarities.MatrixSimilarity(projected, dtype=float)  # ~3min
    metrics = evaluate_queries(model, doc_ids, dictionary, projected, tfidf, sim_index)

    def mean_of(measure, rows):
        # Average a single measure over the given per-query result rows.
        return np.average([row[measure] for row in rows])

    every_query = list(metrics.values())
    map_all = mean_of('map', every_query)
    ndcg_all = mean_of('ndcg', every_query)

    id_subset = [row for qid, row in metrics.items() if 76 <= int(qid) <= 100]
    map_val = mean_of('map', id_subset)
    ndcg_val = mean_of('ndcg', id_subset)

    print((map_all, ndcg_all), (map_val, ndcg_val))

    return metrics

In [None]:
def evaluate_queries(model, doc_ids, dictionary, corpus_modelspace, tfidf, index, save_path='LSI_LDA'):
    """Rank the documents for every query and score the rankings.

    Args:
        model: Trained topic model, forwarded to ``rank_docs``.
        doc_ids: Document identifiers, forwarded to ``rank_docs``.
        dictionary: Dictionary used by ``rank_docs`` to vectorize queries.
        corpus_modelspace: Corpus projected into the model's space.
        tfidf: Optional tf-idf model applied to queries (``None`` to skip).
        index: Similarity index over ``corpus_modelspace``.
        save_path: Destination handed to ``evaluate.write_trec_results``.

    Returns:
        The mapping produced by ``pytrec_eval``'s evaluator: per query id,
        the ``map`` and ``ndcg`` measures.
    """
    # Imported locally: the notebook's import cell binds `pytrec`, which does
    # not provide the `pytrec_eval` name used below.
    import pytrec_eval

    qrels, queries = read_ap.read_qrels()

    # Queries with ids 76..100 are scored but not written to the TREC file.
    held_out_ids = range(76, 101)

    overall_result = {}

    for query_id, query in tqdm(queries.items()):
        results = rank_docs(query, model, doc_ids, dictionary, corpus_modelspace, tfidf_model=tfidf, index=index)
        overall_result[query_id] = dict(results)

        if int(query_id) not in held_out_ids:
            evaluate.write_trec_results(query_id, results, save_path)

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    return evaluator.evaluate(overall_result)

In [None]:
def rank_docs(query, model, doc_ids, dictionary, corpus_modelspace, tfidf_model=None, index=None):
    """Score every document against one query in the model's topic space.

    Args:
        query: Raw query text; preprocessed with ``read_ap.process_text``.
        model: An ``LsiModel`` or ``LdaModel``.
        doc_ids: Iterable of document identifiers. Assumed to be in the same
            order as the documents behind ``index`` / ``corpus_modelspace``
            (TODO confirm at the call sites).
        dictionary: Dictionary used to turn the query into a bag-of-words.
        corpus_modelspace: Corpus projected by ``model``; iterated only in
            the LDA branch.
        tfidf_model: Optional tf-idf model applied to the query vector.
        index: Similarity index queried in the LSI branch (required there).

    Returns:
        LSI: a list of ``(doc_id, score)`` tuples, highest score first.
        LDA: a dict ``doc_id -> score`` inserted highest score first, where
        the score is the negated Kullback-Leibler divergence.

    Raises:
        ValueError: If ``model`` is an ``LsiModel`` and ``index`` is ``None``.
        TypeError: If ``model`` is neither an ``LsiModel`` nor an ``LdaModel``.
    """
    query_prepro = read_ap.process_text(query)

    # transform query to bow vector space
    q_cspace = dictionary.doc2bow(query_prepro)

    if tfidf_model is not None:
        # transform query to tfidf vector space
        q_cspace = tfidf_model[q_cspace]

    q_modelspace = model[q_cspace]

    if isinstance(model, LsiModel):
        if index is None:
            raise ValueError("rank_docs needs a similarity index for LSI models")

        scores = index[q_modelspace]

        # Pair ids with scores, then order by descending score
        # (stable sort on the negated score).
        score_by_doc = dict(zip(doc_ids, scores))
        return sorted(score_by_doc.items(), key=lambda pair: -pair[1])

    if isinstance(model, LdaModel):
        doc_ids = list(doc_ids)
        # kullback_leibler is applied one document at a time.
        scores = [float(-kullback_leibler(q_modelspace, d)) for d in corpus_modelspace]

        # Descending order via numpy, which this file already imports.
        order = np.argsort(-np.asarray(scores, dtype=float), kind="stable")

        # NOTE(review): this branch returns a dict while the LSI branch
        # returns a list of tuples; kept as-is for existing callers.
        return {doc_ids[i]: scores[i] for i in order}

    raise TypeError(f"rank_docs supports LsiModel and LdaModel, got {type(model).__name__}")

The cells below evaluate the models using MAP and nDCG as metrics. Only run the cells that apply to the models you want to evaluate.

In [None]:
# LSI BoW: queries stay plain bag-of-words (no tf-idf model).
# docs_by_id is passed as doc_ids; iterating it yields its keys.
run_evaluation(LSI_BOW_model, corpus_bow_mm, docs_by_id, tfidf=None)

In [None]:
# LSI tf-idf: queries are weighted with the same tf-idf model as the corpus.
run_evaluation(LSI_tfidf_model, corpus_tfidf_mm, docs_by_id, tfidf=tfidf_model)

In [None]:
# LDA BoW: evaluated on the bag-of-words corpus, no tf-idf model.
run_evaluation(LDA_BOW_model, corpus_bow_mm, docs_by_id, tfidf=None)

## Grid Search

In [None]:
def train_LSI(corpus, name, num_topics=500):
    """Train an LSI model, report the training time and save the model.

    Args:
        corpus: Corpus to train on.
        name: Label used in the progress message and in the saved file name.
        num_topics: Number of topics for the model.

    Returns:
        The trained ``LsiModel``.

    Note:
        Reads the module-level ``dictionary`` and ``save_dir``. The model is
        written to ``<save_dir>/models/LSI_<name>_model_<num_topics>.mm``.
    """
    tic = time.perf_counter()

    LSI_model = LsiModel(corpus, id2word=dictionary, num_topics=num_topics)

    toc = time.perf_counter()
    print(f"Trained LSI {name} in {toc - tic:0.4f} seconds")  # ~4min

    # Save next to the other models instead of at the filesystem root.
    models_dir = os.path.join(save_dir, "models")
    os.makedirs(models_dir, exist_ok=True)
    LSI_model.save(os.path.join(models_dir, f"LSI_{name}_model_{num_topics}.mm"))

    return LSI_model

In [None]:
def grid_search_lsi(corpus, tfidf, name, topic_counts=(10, 50, 100, 500, 1000, 2000)):
    """Train and evaluate one LSI model per topic count, dumping metrics to JSON.

    Args:
        corpus: Corpus used both for training and for evaluation.
        tfidf: tf-idf model forwarded to ``run_evaluation`` (``None`` for BoW).
        name: Either ``'bow'`` or ``'tfidf'``; used in messages and file names.
        topic_counts: Topic counts to try, in order.

    Returns:
        None. For each topic count the per-query metrics are written to the
        file ``LSI_<name>_<num_topics>`` in the current working directory.

    Raises:
        ValueError: If ``name`` is not ``'bow'`` or ``'tfidf'``.

    Note:
        Reads the module-level ``docs_by_id`` for the document ids.
    """
    # Explicit check rather than `assert`, which is stripped under -O.
    if name not in ('bow', 'tfidf'):
        raise ValueError(f"name must be 'bow' or 'tfidf', got {name!r}")

    for num_topics in topic_counts:

        print("--training")
        lsi_model = train_LSI(corpus, name, num_topics=num_topics)

        print("--evaluating")
        lsi_metrics = run_evaluation(model=lsi_model,
                                     corpus=corpus,
                                     doc_ids=docs_by_id.keys(),
                                     tfidf=tfidf)

        with open(f'LSI_{name}_{num_topics}', "w") as writer:
            json.dump(lsi_metrics, writer, indent=1)

These cells perform the grid search on the number of topics for LSI BoW and LSI Tf-idf. Only run the cells that apply to the models you want to perform the grid search on.

In [None]:
# Grid search LSI BoW: trains and evaluates on the bag-of-words corpus,
# without tf-idf weighting of the queries.
grid_search_lsi(corpus_bow_mm, tfidf=None, name="bow")

In [None]:
# Grid search LSI TF-IDF: trains and evaluates on the tf-idf corpus,
# weighting queries with the same tf-idf model.
grid_search_lsi(corpus_tfidf_mm, tfidf=tfidf_model, name="tfidf")