# LDA evaluation notebook

We use an interactive notebook to evaluate our summarization models using our LDA model.

The notebook uses custom modules defined in other files; we use a notebook rather than a plain script so that we do not have to re-load the data during training.

### Resource paths

In [None]:
import os
# All resources are looked up relative to the current working directory.
cwd = os.getcwd()

# Per-model output files. Each path is later passed to read_output, which
# expands it with glob, so a wildcard pattern works here as well.
bart_path = f'{cwd}/bart_output_.json'
t5_path = f'{cwd}/t5_output_.json'
pegasus_path = f'{cwd}/pegasus_output_.json'

# Locations of the saved LDA model and TF-IDF model that are loaded below.
# NOTE(review): 'grid-xxx' looks like a placeholder name; confirm before running.
model_path = f'{cwd}/model/grid-xxx'
tf_idf_path = f'{cwd}/tf_idf'

### Load pre-computed resources

In [None]:
from gensim.models import TfidfModel
from lda_model import LdaModel

# Restore the saved TF-IDF model and the LdaModel from the local lda_model
# module. Later cells read the LDA object's .dictionary and .model attributes.
tf_idf = TfidfModel.load(tf_idf_path)
lda = LdaModel.load(model_path)

### Load data

We use our self-made JSON files, which store the original article and abstract (both part of the dataset) together with the summary generated by the model (BART, T5 or Pegasus).

In [None]:
import json
from glob import glob

def read_output(description, data_path, summary_key):
    """Load model output records from every JSON file matching a glob pattern.

    Args:
        description: Label for this collection; returned unchanged.
        data_path: Glob pattern (or plain path) of the JSON file(s) to read.
            Each file is expected to hold a JSON list of objects with the
            keys 'article', 'abstract' and ``summary_key``.
        summary_key: Key under which each record stores the model summary.

    Returns:
        A dict with the 'description' and three parallel lists: 'articles',
        'abstracts' and 'summaries'. The lists are empty when no file
        matches ``data_path``.

    Raises:
        KeyError: If a record lacks one of the expected keys.
    """
    data = []
    # glob yields matches in arbitrary order; sorting keeps the document
    # indices reproducible between runs and machines.
    for file in sorted(glob(data_path)):
        # JSON is UTF-8 by specification, so do not rely on the locale default.
        with open(file, encoding='utf-8') as fin:
            data += json.load(fin)

    return {
        'description': description,
        'articles': [doc['article'] for doc in data],
        'abstracts': [doc['abstract'] for doc in data],
        'summaries': [doc[summary_key] for doc in data],
    }

In [None]:
print('Loading output...')

# BART records keep the generated text under the 'summary' key.
bart_output = read_output('BART', bart_path, 'summary')
# Report how many documents were loaded for this model.
print('BART: ', len(bart_output['articles']))

# T5 records keep the generated text under the 't5_abstract' key.
t5_output = read_output('T5', t5_path, 't5_abstract')
print('T5: ', len(t5_output['articles']))

# Pegasus records keep the generated text under the 'pegasus_abstract' key.
pegasus_output = read_output('Pegasus', pegasus_path, 'pegasus_abstract')
print('Pegasus: ', len(pegasus_output['articles']))

### Tokenize and pre-process test data

The LDA model expects a BOW input (in our case TF-IDF), not strings. Hence we need to convert each of the texts into the expected format.

In [None]:
from generate_preprocessed import PreProcessor
from generate_bow import BowProcessor
from generate_tf_idf import TfIdfProcessor

# The three stages that process_output chains together, in this order:
# pre-processing, bag-of-words conversion, TF-IDF weighting.
pp_processor = PreProcessor()
# Built from the LDA model's dictionary (presumably so that token ids match
# the vocabulary the LDA model was trained on; confirm in generate_bow).
bow_processor = BowProcessor(lda.dictionary)
# Wraps the TF-IDF model loaded earlier.
tf_idf_processor = TfIdfProcessor(tf_idf)

def process_output(output):
    """Convert the texts of a loaded collection into TF-IDF form.

    Args:
        output: dict with the keys 'description', 'articles', 'abstracts'
            and 'summaries', as returned by read_output.

    Returns:
        A dict with the same keys. 'description' is copied over; every text
        field is passed through pp_processor, bow_processor and
        tf_idf_processor, in that order.
    """
    def to_tf_idf(texts):
        # Apply the three module-level processors one after another.
        preprocessed = pp_processor(texts)
        bag_of_words = bow_processor(preprocessed)
        return tf_idf_processor(bag_of_words)

    processed = {'description': output['description']}
    for field in ('articles', 'abstracts', 'summaries'):
        processed[field] = to_tf_idf(output[field])
    return processed

In [None]:
# Convert the raw texts of every model's collection to TF-IDF form.
# Kept as separate statements so that results already computed stay
# available in the notebook if a later conversion fails.
bart_tf_idf = process_output(bart_output)
t5_tf_idf = process_output(t5_output)
pegasus_tf_idf = process_output(pegasus_output)

### Evaluate topic retention for each doc and calculate distances

For every original article, we have two gists: one human-made (abstract) and one computer-made (summary).  
We calculate the distance for each of the two pairs, (original, abstract) and (original, summary), and examine which gist retains the topics better.

In [None]:
from lda_eval import LdaEvaluator

# Single evaluator instance built on the loaded LDA model. lda_compare calls
# it as evaluator(article, gist) and reads the 'divergence' entry of the result.
evaluator = LdaEvaluator(lda)

In [None]:
from statistics import mean

def lda_compare(collection):
    """Compare how well human abstracts and model summaries retain topics.

    Every article is evaluated against its abstract and against its summary
    with the module-level ``evaluator``. The mean 'divergence' of both groups
    and the number of documents on which each side wins are printed.

    The divergence is treated as a distance between the article and its
    gist, so the side with the LOWER value is counted as better for that
    document. Ties count for neither side.

    Args:
        collection: dict with the keys 'description', 'articles', 'abstracts'
            and 'summaries', as returned by process_output.

    Returns:
        A tuple ``(human_stats, comp_stats)``: the evaluator results for the
        (article, abstract) and (article, summary) pairs, one per document.

    Raises:
        statistics.StatisticsError: If the collection contains no documents.
    """
    description = collection['description']
    articles_tf_idf = collection['articles']
    abstracts_tf_idf = collection['abstracts']
    summaries_tf_idf = collection['summaries']

    human_stats = []
    comp_stats = []
    for article, abstract, summary in zip(articles_tf_idf, abstracts_tf_idf, summaries_tf_idf):
        human_stats.append(evaluator(article, abstract))
        comp_stats.append(evaluator(article, summary))

    human_distances = [stat['divergence'] for stat in human_stats]
    comp_distances = [stat['divergence'] for stat in comp_stats]

    human_avg = mean(human_distances)
    comp_avg = mean(comp_distances)

    # A smaller distance to the article means better topic retention, so a
    # side wins a document when its divergence is strictly lower.
    human_better = sum(h < c for h, c in zip(human_distances, comp_distances))
    comp_better = sum(c < h for h, c in zip(human_distances, comp_distances))

    print(f'-- {description} ---------------------------------------')
    print(f'Model average: {comp_avg}')
    print(f'Model is better: {comp_better}')
    print(f'Human average: {human_avg}')
    print(f'Human is better: {human_better}')

    return human_stats, comp_stats


In [None]:
# Print the topic-retention comparison for each model in turn.
for model_collection in (bart_tf_idf, t5_tf_idf, pegasus_tf_idf):
    lda_compare(model_collection)

print('DONE')

### Evaluate ROUGE metric

For every output produced by a model, calculate the industry-standard ROUGE metrics.

In [None]:
from rouge import Rouge

# One shared scorer instance, used by rouge_compare and by the per-sample
# analysis loop at the end of the notebook.
rouge = Rouge()

def rouge_compare(input):
    """Print the averaged ROUGE-1, ROUGE-2 and ROUGE-L scores of a collection.

    Args:
        input: dict with the keys 'description', 'abstracts' and 'summaries'.
            The callers in this notebook pass the raw-text collections
            returned by read_output.
    """
    description = input['description']
    references = input['abstracts']
    candidates = input['summaries']

    # avg=True requests a single averaged result for the whole collection.
    averaged = rouge.get_scores(candidates, references, avg=True)

    print(f'-- {description} ---------------------------------------')
    for metric in ('rouge-1', 'rouge-2', 'rouge-l'):
        print(json.dumps(averaged[metric]))

In [None]:
# Print the ROUGE scores for each model's raw-text output in turn.
for model_texts in (bart_output, t5_output, pegasus_output):
    rouge_compare(model_texts)

### Evaluate discrete ROUGE vs. TRRE samples

In [None]:
# Model to examine: choose the raw-text collection and its TF-IDF counterpart.
# Both must belong to the same model, because the cells below pair their
# entries up by position.
model_output = bart_output
model_tf_idf = bart_tf_idf

In [None]:
# Re-run the comparison for the selected model and keep the per-document results.
human_stats, comp_stats = lda_compare(model_tf_idf)

# Raw texts of the selected model, used when printing individual samples.
articles = model_output['articles']
abstracts = model_output['abstracts']
summaries = model_output['summaries']

In [None]:
# TF-IDF representations of the selected model's texts.
articles_tf_idf, abstracts_tf_idf, summaries_tf_idf = (
    model_tf_idf['articles'],
    model_tf_idf['abstracts'],
    model_tf_idf['summaries'],
)

In [None]:
from math import floor

# Estimate what is a "high" divergence, so that we can try and analyze those
# Per-document absolute gap between the human and the model divergence.
div = [abs(h['divergence'] - c['divergence']) for h,c in zip(human_stats, comp_stats)]
# Threshold: the largest gap, rounded down to an integer.
# NOTE(review): if every gap is below 1 this evaluates to 0, and the
# `abs(...) < high_div` filter in the next cell then skips nothing. Confirm
# that the divergence scale makes an integer threshold meaningful.
high_div = floor(max(div))
# Bare expression so that the notebook displays the threshold.
high_div

In [None]:
# Print a detailed report for every document whose human/model divergence gap
# reaches the high_div threshold computed above.
for i, (human, comp, article, abstract, summary) in enumerate(zip(human_stats, comp_stats, articles, abstracts, summaries)):
    # Both stats entries were computed against the same article, so the
    # article's topics are taken from the human entry.
    article_topics = human['original_topics']

    h_topics = human['summary_topics']
    h_div = human['divergence']

    c_topics = comp['summary_topics']
    c_div = comp['divergence']

    # Skip documents where human and model divergence are close to each other.
    if abs(h_div - c_div) < high_div:
        continue

    # With per_word_topics=True the call yields three values; the first
    # (document-level topics) is discarded here. Assuming lda.model is a gensim
    # LdaModel, the other two are per-word topic lists and per-word phi values.
    _, h_topics_per_word, h_phi = lda.model.get_document_topics(abstracts_tf_idf[i], per_word_topics=True)
    # Keep at most the first three topics per word.
    h_topics_per_word = [(x, y[:3]) for x,y in h_topics_per_word]
    # Keep the three entries with the largest second element (the phi value) per word.
    h_phi = [(x, sorted(y, reverse=True, key=lambda tup: tup[1])[:3]) for x,y in h_phi]

    # Same breakdown for the model summary.
    _, c_topics_per_word, c_phi = lda.model.get_document_topics(summaries_tf_idf[i], per_word_topics=True)
    c_topics_per_word = [(x, y[:3]) for x,y in c_topics_per_word]
    c_phi = [(x, sorted(y, reverse=True, key=lambda tup: tup[1])[:3]) for x,y in c_phi]

    # ROUGE of this single summary against its abstract; the result is indexed
    # with [0] when printed below.
    score = rouge.get_scores(summary, abstract)

    print(f'#{i}')
    print('>>> ROUGE:\n', score[0])
    print('>>> Article topics:\n', article_topics[:3])
    print('>>> Human divergence:\n', h_div)
    print('>>> Human topics:\n', h_topics[:3])
    print('>>> Human per-word topics:\n', h_topics_per_word)
    print('>>> Human phi:\n', h_phi)
    print('>>> Comp divergence:\n', c_div)
    print('>>> Comp topics:\n', c_topics[:3])
    print('>>> Comp per-word topics:\n', c_topics_per_word)
    print('>>> Comp phi:\n', c_phi)
    print('>>> Article:\n', article)
    print('>>> Abstract:\n', abstract)
    print('>>> Summary:\n', summary)

    # First element of the first topic entry is used as the topic id.
    # NOTE(review): presumably the topic lists are ordered by relevance, so
    # entry 0 is the dominant topic; confirm in lda_eval. An empty topic list
    # would raise IndexError here.
    h_top_topic = h_topics[0][0]
    c_top_topic = c_topics[0][0]
    print('Topic #', h_top_topic, '\n', lda.model.print_topic(h_top_topic))
    # Only print the model's top topic when it differs from the human one.
    if h_top_topic != c_top_topic:
        print('Topic #', c_top_topic, '\n', lda.model.print_topic(c_top_topic))

    print('========================================')