# Latent Semantic Indexing and Latent Dirichlet Allocation

### Imports

In [None]:
import sys
sys.path.append('C:/Users/vanes/Miniconda3/envs/ir1-hw2/Lib/site-packages')

from gensim.models import LsiModel, LdaModel, CoherenceModel, TfidfModel
from gensim.corpora import Dictionary, MmCorpus

from gensim.matutils import kullback_leibler
from gensim import similarities

from collections import defaultdict
from tqdm import tqdm
import numpy as np

# import pytrec

import os 
import pickle as pkl
import time

import read_ap
import download_ap
# from utils import evaluate

#### Set up directories to store models, corpora and dictionary

In [21]:
save_dir = "LSI_LDA"

if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        os.makedirs(os.path.join(save_dir, "models"))
        os.makedirs(os.path.join(save_dir, "corpora"))

## Load ... (Skip if you don't want to load anything)

### Load corpora

You can load you saved models here. Set the path below and run the cells you want to load

In [22]:
# default path (feel free to change)
path_dictionary = save_dir + '/corpora/dictionary.dict'
path_corpus_bow = save_dir + '/corpora/corpus_bow.mm'
path_corpus_tfidf = save_dir + '/corpora/corpus_tfidf.mm'

In [23]:
dictionary = Dictionary.load(path_dictionary)

In [24]:
corpus_bow_mm = MmCorpus(path_corpus_bow)

In [25]:
corpus_tfidf_mm = MmCorpus(path_corpus_tfidf)

### Load models

You can load you saved models here. Set the path below and run the cells you want to load. <br>
*Note that for the LSI tf-idf you also need to load the corresponding tf-idf model. This is needed during evaluation time.*

In [26]:
# default path (feel free to change)
path_lsi_bow = save_dir + '/models/LSI_BOW_model.mm'
path_lsi_tfidf = save_dir + '/models/LSI_tfidf_model.mm'
path_lda_bow = save_dir + '/models/LDA_bow_model.mm'
path_tfidf_model = save_dir + '/models/tfidf_model.mm'

In [27]:
lsi_bow_model = LsiModel.load(path_lsi_bow)

In [28]:
lsi_tfidf_model = LsiModel.load(path_lsi_tfidf)

In [29]:
lda_bow_model = LdaModel.load(path_lda_bow)

In [30]:
tfidf_model = TfidfModel.load(path_tfidf_model)

## Preprocess data (Run this!)

In [None]:
# load preprocessed data 
download_ap.download_dataset()
docs_by_id = read_ap.get_processed_docs()

## subset (REMOVE THIS BEFORE HANDING IN)

In [None]:
docs_by_id = dict(list(docs_by_id.items())[:10000])

## Construct dictionary (Skip if already loaded)

In [None]:
dictionary = Dictionary(docs_by_id.values())

dictionary.filter_extremes(no_below=25, no_above=0.5)

# save dictionary to disk 
dictionary.save(save_dir + '/corpora/dictionary.dict')  

print('#Unique tokens in corpus: %i' % len(dictionary))

## Construct corpora (Skip if already loaded)

For LSI BoW and LDA BoW only run the first cell. For LSI tf-idf run all cells

In [None]:
# construct BOW corpus
corpus_bow = [dictionary.doc2bow(doc) for doc in docs_by_id.values()]

# serialize corpus to disk to prevent memory problems if corpus gets too large
MmCorpus.serialize(save_dir + '/corpora/corpus_bow.mm', corpus_bow)  
corpus_bow_mm = MmCorpus(save_dir + '/corpora/corpus_bow.mm')
    
print('#Documents in BOW corpus: %i' % len(corpus_bow))

In [None]:
# construct tfidf corpus
tfidf_model = TfidfModel(corpus_bow_mm)
corpus_tfidf = tfidf_model[corpus_bow_mm]

# serialize corpus to disk to prevent memory problems if corpus gets too large
MmCorpus.serialize(save_dir + '/corpora/corpus_tfidf.mm', corpus_tfidf)  
corpus_tfidf_mm = MmCorpus(save_dir + '/corpora/corpus_tfidf.mm')

In [None]:
tfidf_model.save(save_dir + '/models/tfidf_model.mm')

## Train models

Set the number of topics you want to train on 

In [11]:
# params 
num_topics = 500

### LSI BoW (Skip if already loaded)

In [12]:
tic = time.perf_counter()

LSI_BOW_model = LsiModel(corpus_bow_mm, id2word=dictionary, num_topics=num_topics)

toc = time.perf_counter() 
print(f"Trained LSI BOW in {toc - tic:0.4f} seconds")

Trained LSI BOW in 7.2385 seconds


In [13]:
LSI_BOW_model.save(save_dir + '/models/LSI_BOW_model.mm')

### LSI tf-idf (Skip if already loaded)

In [14]:
tic = time.perf_counter()

LSI_tfidf_model = LsiModel(corpus_tfidf_mm, id2word=dictionary, num_topics=num_topics)

toc = time.perf_counter() 
print(f"Trained LSI tf-idf in {toc - tic:0.4f} seconds")

Trained LSI tf-idf in 7.4356 seconds


In [15]:
LSI_tfidf_model.save(save_dir + '/models/LSI_tfidf_model.mm')

### LDA BoW (Skip if already loaded)

In [16]:
tic = time.perf_counter()

LDA_BOW_model = LdaModel(corpus=corpus_tfidf_mm,id2word=dictionary, num_topics=num_topics)

toc = time.perf_counter() 
print(f"Trained LDA BoW in {toc - tic:0.4f} seconds")

Trained LDA tf-idf in 33.4586 seconds


In [17]:
LDA_BOW_model.save(save_dir + '/models/LDA_bow_model.mm')

## Show topics 

Only run the cells that apply to the model(s) you are evaluating

In [18]:
# LSI BoW
LSI_BOW_model.print_topics(num_topics=5, num_words=20)

[(0,
  '-0.513*"percent" + -0.224*"12" + -0.186*"state" + -0.146*"new" + -0.136*"report" + -0.112*"10" + -0.102*"presid" + -0.101*"n\'t" + -0.100*"bush" + -0.100*"say" + -0.097*"one" + -0.096*"democrat" + -0.093*"nation" + -0.092*"last" + -0.091*"million" + -0.090*"also" + -0.090*"govern" + -0.088*"peopl" + -0.088*"u.s." + -0.086*"two"'),
 (1,
  '0.642*"percent" + 0.414*"12" + -0.135*"state" + 0.108*"10" + -0.102*"new" + -0.089*"say" + -0.086*"one" + -0.086*"govern" + -0.085*"n\'t" + -0.084*"presid" + -0.083*"u.s." + -0.080*"peopl" + -0.078*"two" + -0.076*"offici" + -0.075*"also" + 0.074*"precinct" + -0.072*"nation" + -0.066*"last" + -0.066*"million" + -0.063*"unit"'),
 (2,
  '0.896*"10" + -0.177*"12" + -0.168*"percent" + 0.107*"stake" + 0.105*"total" + 0.103*"26" + 0.098*"dukaki" + 0.093*"gephardt" + 0.089*"jackson" + 0.088*"gore" + 0.084*"precinct" + 0.078*"simon" + 0.067*"report" + 0.067*"hart" + 0.067*"babbitt" + 0.062*"uncommit" + 0.054*"bush" + 0.047*"dole" + 0.041*"democrat" + 0

In [19]:
# LSI tf-idf
LSI_tfidf_model.print_topics(num_topics=5, num_words=20)

[(0,
  '0.138*"percent" + 0.108*"bush" + 0.092*"democrat" + 0.086*"dole" + 0.086*"dukaki" + 0.085*"state" + 0.083*"campaign" + 0.079*"stock" + 0.077*"vote" + 0.075*"million" + 0.072*"deleg" + 0.072*"soviet" + 0.072*"jackson" + 0.071*"govern" + 0.071*"polic" + 0.069*"presid" + 0.068*"compani" + 0.067*"u.s." + 0.067*"primari" + 0.067*"gephardt"'),
 (1,
  '-0.288*"bush" + -0.235*"dole" + -0.226*"dukaki" + -0.187*"deleg" + -0.182*"democrat" + -0.172*"jackson" + -0.168*"gore" + -0.164*"gephardt" + -0.164*"primari" + -0.157*"campaign" + -0.156*"robertson" + 0.154*"stock" + -0.126*"republican" + -0.126*"vote" + -0.115*"simon" + -0.112*"candid" + -0.109*"percent" + -0.106*"super" + -0.099*"illinoi" + 0.098*"market"'),
 (2,
  '-0.314*"stock" + -0.196*"percent" + -0.192*"index" + -0.187*"market" + -0.155*"trade" + -0.141*"price" + -0.140*"cent" + -0.129*"share" + -0.126*"rose" + -0.121*"exchang" + -0.113*"averag" + -0.107*"dow" + 0.105*"polic" + -0.096*"billion" + -0.089*"jone" + -0.089*"point" 

In [20]:
# LDA BoW
LDA_BOW_model.print_topics(num_topics=5, num_words=20)

[(174,
  '0.000*"imbal" + 0.000*"mute" + 0.000*"lent" + 0.000*"exchange-list" + 0.000*"outnumb" + 0.000*"nyse" + 0.000*"humphrey" + 0.000*"10.5" + 0.000*"kan." + 0.000*"toyota" + 0.000*"usa" + 0.000*"0.1" + 0.000*"0.2" + 0.000*"0.3" + 0.000*"0.4" + 0.000*"0.8" + 0.000*"1.4" + 0.000*"dynam" + 0.000*"2.2" + 0.000*"volum"'),
 (329,
  '0.000*"imbal" + 0.000*"mute" + 0.000*"lent" + 0.000*"exchange-list" + 0.000*"outnumb" + 0.000*"nyse" + 0.000*"humphrey" + 0.000*"10.5" + 0.000*"kan." + 0.000*"toyota" + 0.000*"usa" + 0.000*"0.1" + 0.000*"0.2" + 0.000*"0.3" + 0.000*"0.4" + 0.000*"0.8" + 0.000*"1.4" + 0.000*"dynam" + 0.000*"2.2" + 0.000*"volum"'),
 (444,
  '0.000*"imbal" + 0.000*"mute" + 0.000*"lent" + 0.000*"exchange-list" + 0.000*"outnumb" + 0.000*"nyse" + 0.000*"humphrey" + 0.000*"10.5" + 0.000*"kan." + 0.000*"toyota" + 0.000*"usa" + 0.000*"0.1" + 0.000*"0.2" + 0.000*"0.3" + 0.000*"0.4" + 0.000*"0.8" + 0.000*"1.4" + 0.000*"dynam" + 0.000*"2.2" + 0.000*"volum"'),
 (152,
  '0.000*"imbal" + 0.

## Retrieval and Evaluation

In [None]:
def run_evaluation(model, corpus, doc_ids, tfidf):
    corpus_modelspace = model[corpus]
    index = similarities.MatrixSimilarity(corpus_modelspace, dtype=float)  # ~3min
    metrics = evaluate_queries(model, doc_ids, dictionary, corpus_modelspace, tfidf, index)

    map_all = np.average([m['map'] for m in metrics.values()])
    ndcg_all = np.average([m['ndcg'] for m in metrics.values()])

    map_val = np.average([m['map'] for did, m in metrics.items() if int(did) in range(76, 101)])
    ndcg_val = np.average([m['ndcg'] for did, m in metrics.items() if int(did) in range(76, 101)])

    print((map_all, ndcg_all), (map_val, ndcg_val))

    return metrics

In [None]:
def evaluate_queries(model, doc_ids, dictionary, corpus_modelspace, tfidf, index, save_path='LSI'):
    qrels, queries = read_ap.read_qrels()

    overall_result = {}

    for query_id, query in tqdm(queries.items()):
        results = rank_docs(query, model, doc_ids, dictionary, corpus_modelspace, tfidf_model=tfidf, index=index)
        overall_result[query_id] = dict(results)

        if int(query_id) not in np.arange(76, 101):
            evaluate.write_trec_results(query_id, results, save_path)

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_result)

    return metrics

In [None]:
def rank_docs(query, model, doc_ids, dictionary, corpus_modelspace, tfidf_model=None, index=None):
    query_prepro = read_ap.process_text(query)

    # transform query to bow vector space
    q_cspace = dictionary.doc2bow(query_prepro)

    if not tfidf_model == None:
        # transform query to tfidf vector space
        q_cspace = tfidf_model[q_cspace]

    q_modelspace = model[q_cspace]
    
    if isinstance(model, LsiModel):
        ## LSI
        scores = index[q_modelspace]

        results = defaultdict(float)
        for doc_id, score in zip(doc_ids, scores):
          results[doc_id] = score

        results = list(results.items())
        results.sort(key=lambda _: -_[1])

    elif isinstance(model, LdaModel):
        ## LDA
        doc_ids = list(doc_ids)
        scores = []
        # have to use the for loop, otherwise kullback_leibler has problems
        for d in corpus_modelspace:
            scores.append(float(-kullback_leibler(q_modelspace, d)))

        # have to use torch here to do this more efficiently
        order = torch.Tensor(scores).argsort(descending=True).numpy()
        ordered_results = [(doc_ids[i], scores[i]) for i in order]
        results = dict(ordered_results)

    return results

The cells below evaluate the models using MAP and nDCG as a metric. Only run the cells that apply to the models you want to evaluate

In [None]:
# LSI BoW
run_evaluation(LSI_BOW_model, corpus_bow_mm, docs_by_id, tfidf=None)

In [None]:
# LSI tf-idf
run_evaluation(LSI_tfidf_model, corpus_tfidf_mm, docs_by_id, tfidf=tfidf_model)

In [None]:
# LDA BoW
run_evaluation(LDA_BOW_model, corpus_bow_mm, docs_by_id, tfidf=None)

## Grid Search

In [None]:
def train_LSI(corpus, name, num_topics=500):
    tic = time.perf_counter()

    LSI_model = LsiModel(corpus, id2word=dictionary, num_topics=num_topics)

    toc = time.perf_counter()
    print(f"Trained LSI {name} in {toc - tic:0.4f} seconds")  # ~4min

    LSI_model.save(f'/LSI_{name}_model_{num_topics}.mm')

    return LSI_model

In [None]:
def grid_search_lsi(corpus, tfidf, name):

    assert name in ['bow', 'tfidf']

    for num_topics in [10, 50, 100, 500, 1000, 2000]:

        print("--training")
        lsi_model = train_LSI(corpus, name, num_topics=num_topics)

        print("--evaluating")
        # Run this if you want to evaluate LSI tfidf model
        lsi_metrics = run_evaluation(model=lsi_model,
                                     corpus=corpus,
                                     doc_ids=docs_by_id.keys(),
                                     tfidf=tfidf)

        with open(f'LSI_{name}_{num_topics}', "w") as writer:
            json.dump(lsi_metrics, writer, indent=1)