# Week 7 - Summarizing Text
## Keyphrase Extraction - Starting on Page 352
This code just follows the text, with corrections from the book where necessary.

In [3]:
# There are several ways to get folders visible in Python. This way isn't the most elegant
# but it works consistently. Replace my path with yours. The path you append to should be the
# folder where your tokenizer Python class is located.
import sys
sys.path.append(r'C:\Users\neugg\OneDrive\Documents\GitHub\dsc360-instructor\12 Week\week_4\assignment')
from text_normalizer import TextNormalizer
from nltk.corpus import gutenberg
import nltk
import pandas as pd
from operator import itemgetter

tn = TextNormalizer()

# my code is a bit different than the author's but works with our
# TextNormalizer.
alice_txt = gutenberg.sents(fileids='carroll-alice.txt')
alice_list = list([' '.join(ts) for ts in alice_txt])
alice = pd.Series(alice_list)
norm_alice = tn.normalize_corpus(corpus=alice, text_lemmatization=False)

# print first line
print('\nAlice - before and after')
print(alice[0], '\n', norm_alice[0], '\n')

# page 353
def flatten_corpus(corpus):
    return ' '.join([document.strip()
                     for document in corpus])

# page 352
def compute_ngrams(sequence, n):
    return zip(*[sequence[index:]
                 for index in range(n)])

# page 353
def get_top_ngrams(corpus, ngram_val=1, limit=5):

    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)

    ngrams = compute_ngrams(tokens, ngram_val)
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(),
                              key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq)
                     for text, freq in sorted_ngrams]

    return sorted_ngrams

# page 353
print('Bigrams:\n', get_top_ngrams(corpus=norm_alice, ngram_val=2, limit=10), '\n')

# page 354
print('Trigrams:\n', get_top_ngrams(corpus=norm_alice, ngram_val=3, limit=10))

# page 355
print('Collocation Finder:\n')
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures # updated package

finder = BigramCollocationFinder.from_documents([item.split()
                                                for item
                                                in norm_alice])
bigram_measures = BigramAssocMeasures()
print('Bigram Association Measures:')
print(finder.nbest(bigram_measures.raw_freq, 10))
print(finder.nbest(bigram_measures.pmi, 10), '\n')

# page 356
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures # updated package

finder = TrigramCollocationFinder.from_documents([item.split()
                                                for item
                                                in norm_alice])
trigram_measures = TrigramAssocMeasures()
print('Trigram Association Measures:')
print(finder.nbest(trigram_measures.raw_freq, 10))
print(finder.nbest(trigram_measures.pmi, 10), '\n')

# page 357
sentences = """
Elephants are large mammals of the family Elephantidae 
and the order Proboscidea. Two species are traditionally recognised, 
the African elephant and the Asian elephant. Elephants are scattered 
throughout sub-Saharan Africa, South Asia, and Southeast Asia. Male 
African elephants are the largest extant terrestrial animals. All 
elephants have a long trunk used for many purposes, 
particularly breathing, lifting water and grasping objects. Their 
incisors grow into tusks, which can serve as weapons and as tools 
for moving objects and digging. Elephants' large ear flaps help 
to control their body temperature. Their pillar-like legs can 
carry their great weight. African elephants have larger ears 
and concave backs while Asian elephants have smaller ears 
and convex or level backs.  
"""
sent_tokens = nltk.sent_tokenize(sentences)
print('Sentence tokenization:')
print(len(sent_tokens))
print(sent_tokens[:3], '\n')

print('Normalize text:')
sentences_series = pd.Series(sent_tokens)
norm_sentences = tn.normalize_corpus(corpus=sentences_series, text_lower_case=False,
                                     text_lemmatization=False, stopword_removal=False)
print(norm_sentences[:3], '\n')

# starting on page 358
import itertools
stopwords = nltk.corpus.stopwords.words('english')

def get_chunks(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
               stopword_list=stopwords):
    all_chunks = []
    chunker = nltk.chunk.regexp.RegexpParser(grammar)

    for sentence in sentences:
        tagged_sents = nltk.pos_tag_sents(
            [nltk.word_tokenize(sentence)])

        chunks = [chunker.parse(tagged_sent)
                  for tagged_sent in tagged_sents]

        wtc_sents = [nltk.chunk.tree2conlltags(chunk)
                     for chunk in chunks]

        flattened_chunks = list(itertools.chain.from_iterable(
            wtc_sent for wtc_sent in wtc_sents)
        )

        valid_chunks_tagged = [(status, [wtc for wtc in chunk]) for status, chunk in
                               itertools.groupby(flattened_chunks, lambda word_pos_chunk:
                                word_pos_chunk[2] != 'O')]

        valid_chunks = [' '.join(word.lower()
                                 for word, tag, chunk in wtc_group
                                    if word.lower() not in stopword_list)
                                        for status, wtc_group in valid_chunks_tagged
                                            if status]

        all_chunks.append(valid_chunks)

    return all_chunks

# page 360
chunks = get_chunks(norm_sentences)
print('Chunks:\n', chunks, '\n')

# page 361
from gensim import corpora, models

def get_tfidf_weighted_keyphrases(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):
    valid_chunks = get_chunks(sentences, grammar=grammar)

    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    weighted_phrases = {dictionary.get(idx): value for doc in corpus_tfidf for idx, value in doc}
    weighted_phrases = sorted(weighted_phrases.items(), key=itemgetter(1), reverse=True)
    weighted_phrases = [(term, round(wt, 3)) for term, wt in weighted_phrases]

    return weighted_phrases[:top_n]

# top 30 tf-idf weighted keyphrases
print('Top 30 TF-IDF keyphrases:]n', get_tfidf_weighted_keyphrases(sentences=norm_sentences, top_n=30), '\n')

Starting TextNormalizer
Done strip
Done lower
Done stopword
Done char remove
Done contract exp
Done spec char remove

Alice - before and after
[ Alice ' s Adventures in Wonderland by Lewis Carroll 1865 ] 
  alice adventures wonderland lewis carroll  

Bigrams:
 [('said alice', 123), ('mock turtle', 56), ('march hare', 31), ('said king', 29), ('thought alice', 26), ('white rabbit', 22), ('said hatter', 22), ('said mock', 20), ('said caterpillar', 18), ('said gryphon', 18)] 

Trigrams:
 [('said mock turtle', 20), ('said march hare', 10), ('poor little thing', 6), ('little golden key', 5), ('certainly said alice', 5), ('white kid gloves', 5), ('march hare said', 5), ('mock turtle said', 5), ('know said alice', 4), ('might well say', 4)]
Collocation Finder:

Bigram Association Measures:
[('said', 'alice'), ('mock', 'turtle'), ('march', 'hare'), ('said', 'king'), ('thought', 'alice'), ('said', 'hatter'), ('white', 'rabbit'), ('said', 'mock'), ('said', 'caterpillar'), ('said', 'gryphon')]
[(

In [4]:
# page 362
from gensim.summarization import keywords

# NOTE this code doesn't run in Python 3.7 - one of the sub-packages needs to be updated
# by the maintainers. You can switch to 3.6 or skip these 2 lines.
key_words = keywords(sentences, ratio=1.0, scores=True, lemmatize=False)
print('Gensim\'s summarization model results:\n', [(item, round(score, 3)) for item, score in key_words][:25])

RuntimeError: generator raised StopIteration

## Topic Modeling - Starting on Page 365

In [6]:
import os
import numpy as np
import pandas as pd

# page 365
DATA_PATH = 'nipstxt/'
print(os.listdir(DATA_PATH))

# page 366
folders = ['nips{0:02}'.format(i) for i in range(0, 13)]
# Read all texts into a list.
papers = []
for folder in folders:
    file_names = os.listdir(DATA_PATH + folder)
    for file_name in file_names:
        with open(DATA_PATH + folder + '/' + file_name, encoding='utf-8',
                  errors='ignore', mode='r+') as f:
            data = f.read()
        papers.append(data)
# save the papers list, you'll need this a bit later on

print('Length of papers:\n', len(papers), '\n')
print('Paper fragment:\n', papers[0][:1000], '\n')

['idx', 'MATLAB_NOTES', 'nips00', 'nips01', 'nips02', 'nips03', 'nips04', 'nips05', 'nips06', 'nips07', 'nips08', 'nips09', 'nips10', 'nips11', 'nips12', 'nips16', 'orig', 'RAW_DATA_NOTES', 'README_yann']
Length of papers:
 1740 

Paper fragment:
 1 
CONNECTIVITY VERSUS ENTROPY 
Yaser S. Abu-Mostafa 
California Institute of Technology 
Pasadena, CA 91125 
ABSTRACT 
How does the connectivity of a neural network (number of synapses per 
neuron) relate to the complexity of the problems it can handle (measured by 
the entropy)? Switching theory would suggest no relation at all, since all Boolean 
functions can be implemented using a circuit with very low connectivity (e.g., 
using two-input NAND gates). However, for a network that learns a problem 
from examples using a local learning rule, we prove that the entropy of the 
problem becomes a lower bound for the connectivity of the network. 
INTRODUCTION 
The most distinguishing feature of neural networks is their ability to spon- 
taneousl

## Text Wrangling - starting on page 367

In [8]:
import nltk

stop_words = nltk.corpus.stopwords.words('english')
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalize_corpus(papers):
    norm_papers = []
    for paper in papers:
        paper = paper.lower()
        paper_tokens = [token.strip() for token in wtk.tokenize(paper)]
        paper_tokens = [wnl.lemmatize(token) for token in paper_tokens
                        if not token.isnumeric()]
        paper_tokens = [token for token in paper_tokens if len(token) > 1]
        paper_tokens = [token for token in paper_tokens if token not in stop_words]
        paper_tokens = list(filter(None, paper_tokens))
        if paper_tokens:
            norm_papers.append(paper_tokens)
    return norm_papers

norm_papers = normalize_corpus(papers)
print('Length of normalized papers:', len(norm_papers), '\n')

Length of normalized papers: 1740 



## Text Representation with Feature Engineering - Starting on Page 369

In [9]:
import gensim

bigram = gensim.models.Phrases(norm_papers, min_count=20, threshold=20, delimiter=b'_')
bigram_model = gensim.models.phrases.Phraser(bigram)

# sample demonstration
print('Bigram model: \n', bigram_model[norm_papers[0]][:50], '\n')

# page 370
norm_corpus_bigrams = [bigram_model[doc] for doc in norm_papers]

# Create a dictionary representation of the documents.
dictionary = gensim.corpora.Dictionary(norm_corpus_bigrams)
print('Sample word to number mappings:\n', list(dictionary.items())[:15], '\n')
print('Total vocabulary size:', len(dictionary), '\n')

# Filter out words that occur in fewer than 20 documents, or more than 50%
# of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.6)
print('Total vocabulary size:', len(dictionary), '\n')

# Transforming corpus into bag of words vectors
bow_corpus = [dictionary.doc2bow(text) for text in norm_corpus_bigrams]
print('Bag of words:\n', bow_corpus[1][:50], '\n')

# viewing actual terms and their counts
print('Terms and counts:\n', [(dictionary[idx], freq) for idx, freq in bow_corpus[1][:50]], '\n')

# total papers in the corpus
print('Total number of papers:', len(bow_corpus), '\n')

Bigram model: 
 ['connectivity', 'versus', 'entropy', 'yaser', 'abu_mostafa', 'california_institute', 'technology_pasadena', 'ca_abstract', 'doe', 'connectivity', 'neural_network', 'number', 'synapsis', 'per', 'neuron', 'relate', 'complexity', 'problem', 'handle', 'measured', 'entropy', 'switching', 'theory', 'would', 'suggest', 'relation', 'since', 'boolean_function', 'implemented', 'using', 'circuit', 'low', 'connectivity', 'using', 'two', 'input', 'nand', 'gate', 'however', 'network', 'learns', 'problem', 'example', 'using', 'local', 'learning', 'rule', 'prove', 'entropy', 'problem'] 

Sample word to number mappings:
 [(0, '0a'), (1, '2h'), (2, '2h2'), (3, '2he'), (4, '2n'), (5, '__c'), (6, '_c'), (7, '_k'), (8, 'a2'), (9, 'ability'), (10, 'abu_mostafa'), (11, 'access'), (12, 'accommodate'), (13, 'according'), (14, 'accumulated')] 

Total vocabulary size: 78892 

Total vocabulary size: 7756 

Bag of words:
 [(3, 1), (12, 3), (14, 1), (15, 1), (16, 1), (17, 16), (20, 1), (24, 1), (26

## Latent Semantic Indexing - page 372
This takes a while to run.

In [10]:
TOTAL_TOPICS = 10
from gensim.models import LsiModel
lsi_bow = LsiModel(bow_corpus, id2word=dictionary, num_topics=TOTAL_TOPICS,
                   onepass=True, chunksize=1740, power_iters=1000)

for topic_id, topic in lsi_bow.print_topics(num_topics=10, num_words=20):
    print('Topic #' + str(topic_id+1)+':')
    print(topic, '\n')

for n in range(TOTAL_TOPICS):
    print('Topic #' + str(n+1)+':')
    print('='*50)
    d1 = []
    d2 = []
    for term, wt in lsi_bow.show_topic(n, topn=20):
        if wt >= 0:
            d1.append((term, round(wt, 3)))
        else:
            d2.append((term, round(wt, 3)))

    print('Direction 1:', d1)
    print('-'*50)
    print('Direction 2:', d2)
    print('-'*50, '\n')

# page 379
term_topic = lsi_bow.projection.u
singular_values = lsi_bow.projection.s
topic_document = (gensim.matutils.corpus2dense(lsi_bow[bow_corpus],
                                               len(singular_values)).T / singular_values).T
print(term_topic.shape, singular_values.shape, topic_document.shape)

document_topics = pd.DataFrame(np.round(topic_document.T, 3),
                               columns=['T' + str(i) for i in range(1, TOTAL_TOPICS+1)])
print(document_topics.head(5))

# page 380
document_numbers = [13, 250, 500]

for document_number in document_numbers:
    top_topics = list(document_topics.columns[np.argsort(
        -np.absolute(document_topics.iloc[document_number].values))[:3]])
    print('Document #' + str(document_number)+':')
    print('Dominant Topics (top 3):', top_topics)
    print('Paper Summary:')
    print(papers[document_number][:500], '\n')

Topic #1:
0.215*"unit" + 0.212*"state" + 0.187*"training" + 0.177*"neuron" + 0.162*"pattern" + 0.145*"image" + 0.140*"vector" + 0.125*"feature" + 0.122*"cell" + 0.110*"layer" + 0.101*"task" + 0.097*"class" + 0.091*"probability" + 0.089*"signal" + 0.087*"step" + 0.086*"response" + 0.085*"representation" + 0.083*"noise" + 0.082*"rule" + 0.081*"distribution" 

Topic #2:
0.487*"neuron" + 0.396*"cell" + -0.257*"state" + 0.191*"response" + -0.187*"training" + 0.170*"stimulus" + 0.117*"activity" + -0.109*"class" + 0.099*"spike" + 0.097*"pattern" + 0.096*"circuit" + 0.096*"synaptic" + -0.095*"vector" + 0.090*"signal" + 0.090*"firing" + 0.088*"visual" + -0.084*"classifier" + -0.083*"action" + -0.078*"word" + 0.078*"cortical" 

Topic #3:
-0.627*"state" + 0.395*"image" + -0.219*"neuron" + 0.209*"feature" + -0.188*"action" + 0.137*"unit" + 0.131*"object" + -0.130*"control" + 0.129*"training" + -0.109*"policy" + 0.103*"classifier" + 0.090*"class" + -0.081*"step" + -0.081*"dynamic" + 0.080*"classifi

## Implementing LIS Topic Models from Scratch - starting on page 382

In [11]:
td_matrix = gensim.matutils.corpus2dense(corpus=bow_corpus,
                                         num_terms=len(dictionary))
print(td_matrix.shape)
print(td_matrix, '\n')

vocabulary = np.array(list(dictionary.values()))
print('Total vocabulary size:', len(vocabulary))
print(vocabulary, '\n')

from scipy.sparse.linalg import svds

u, s, vt = svds(td_matrix, k=TOTAL_TOPICS, maxiter=10000)
term_topic = u
topic_document = vt
print(term_topic.shape, singular_values.shape, topic_document.shape, '\n')

tt_weights = term_topic.transpose() * singular_values[:, None]
print(tt_weights.shape, '\n')

top_terms = 20
topic_key_term_idxs = np.argsort(-np.absolute(tt_weights), axis=1)[:, :top_terms]
topic_keyterm_weights = np.array([tt_weights[row, columns]
                                  for row, columns in list(zip(np.arange(TOTAL_TOPICS),
                                                               topic_key_term_idxs))])
topic_keyterms = vocabulary[topic_key_term_idxs]
topic_keyterms_weights = list(zip(topic_keyterms, topic_keyterm_weights))
for n in range(TOTAL_TOPICS):
    print('Topic #' + str(n+1) + ':')
    print('=' * 50)
    d1 = []
    d2 = []
    terms, weights = topic_keyterms_weights[n]
    term_weights = sorted([(t, w)
                           for t, w in zip(terms, weights)],
                          key = lambda row: -abs(row[1]))
    for term, wt in term_weights:
        if wt >= 0:
            d1.append((term, round(wt, 3)))
        else:
            d2.append((term, round(wt, 3)))

    print('Direction 1:', d1)
    print('-' * 50)
    print('Direction 2:', d2)
    print('-' * 50, '\n')

# page 387
document_topics = pd.DataFrame(np.round(topic_document.T, 3),
                               columns=['T' + str(i) for i in
                                        range(1, TOTAL_TOPICS+1)])
document_numbers = [13, 250, 500]

for document_number in document_numbers:
    top_topics = list(document_topics.columns[np.argsort(
        -np.absolute(document_topics.iloc[document_number].values))[:3]])

    print('Document #' + str(document_number) + ':')
    print('Dominant Topics (top 3):', top_topics)
    print('Paper Summary:')
    print(papers[document_number][:500], '\n')

(7756, 1740)
[[4. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] 

Total vocabulary size: 7756
['2n' '_c' 'a2' ... 'support_vector' 'mozer_jordan' 'kearns_solla'] 

(7756, 10) (10,) (10, 1740) 

(10, 7756) 

Topic #1:
Direction 1: [('word', 629.091), ('vector', 286.945), ('node', 181.498), ('recognition', 177.665), ('sequence', 168.05), ('circuit', 151.514), ('cell', 149.562), ('hmm', 113.762), ('character', 113.551), ('chip', 107.343), ('matrix', 107.113), ('structure', 103.444)]
--------------------------------------------------
Direction 2: [('training', -309.125), ('task', -269.453), ('pattern', -235.696), ('classifier', -190.201), ('control', -169.136), ('rule', -153.281), ('action', -137.514), ('neuron', -127.48)]
-------------------------------------------------- 

Topic #2:
Direction 1: [('node', 297.007), ('circuit', 159.406), ('chip', 126.143), ('classifier', 100.647), ('cu

This next block is not required but shows how to save the embeddings and models.

In [12]:
# save data for use in the next file
import pickle
with open('data/papers.txt', 'wb') as fp:
    pickle.dump(papers, fp)

with open('data/bigram_model.txt', 'wb') as fp:
    pickle.dump(bigram_model, fp)

with open('data/norm_corpus_bigrams.txt', 'wb') as fp:
    pickle.dump(norm_corpus_bigrams, fp)

with open('data/bow_corpus.txt', 'wb') as fp:
    pickle.dump(bow_corpus, fp)

with open('data/dictionary.txt', 'wb') as fp:
    pickle.dump(dictionary, fp)

## Latent Dirichlet Allocation - Starting on Page 391
This takes a while to run (and won't run by itself as a `.py` file unless you enclose it in a `main()` function).

In [13]:
TOTAL_TOPICS = 10
import gensim
# page 391
lda_model = gensim.models.LdaModel(corpus=bow_corpus, id2word=dictionary,
                                    chunksize=1740, alpha='auto', eta='auto',
                                    random_state=42, iterations=500,
                                    num_topics=TOTAL_TOPICS, passes=20,
                                    eval_every=None)
print('LDA Topics with Weights:')
for topic_id, topic in lda_model.show_topics(num_topics=TOTAL_TOPICS, num_words=20):
    print('Topic #' + str(topic_id+1) + ':')
    print(topic, '\n')

# page 393
topics_coherences = lda_model.top_topics(bow_corpus, topn=20)
avg_coherence_score = np.mean([item[1] for item in topics_coherences])
print('Avg. Coherence Score:', avg_coherence_score, '\n')

# page 396
topics_with_wts = [item[0] for item in topics_coherences]
print('LDA Topics with Weights')
print('=' * 50)
for idx, topic in enumerate(topics_with_wts):
    print('Topic #' + str(idx+1) + ':')
    print([(term, round(wt, 3)) for wt, term in topic], '\n')

# page 397
print('LDA Topics without Weights')
print('=' * 50)
for idx, topic in enumerate(topics_with_wts):
    print('Topic #' + str(idx+1) + ':')
    print([term for wt, term in topic], '\n')

# page 399
cv_coherence_model_lda = \
    gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus,
                                    texts=norm_corpus_bigrams, dictionary=dictionary,
                                    coherence='c_v')
avg_coherence_cv = cv_coherence_model_lda.get_coherence()

umass_coherence_model_lda = \
    gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus,
                                    texts=norm_corpus_bigrams, dictionary=dictionary,
                                    coherence='u_mass')
avg_coherence_umass = umass_coherence_model_lda.get_coherence()

perplexity = lda_model.log_perplexity(bow_corpus)

print('Avg. Coherence Score (cv):', avg_coherence_cv)
print('Avg. Coherence Score (umass):', avg_coherence_umass)
print('Model Perpelxity:', perplexity, '\n')

LDA Topics with Weights:
Topic #1:
0.013*"circuit" + 0.012*"chip" + 0.008*"neuron" + 0.008*"analog" + 0.007*"current" + 0.007*"bit" + 0.006*"voltage" + 0.005*"node" + 0.005*"word" + 0.005*"vector" + 0.005*"processor" + 0.004*"implementation" + 0.004*"threshold" + 0.004*"computation" + 0.004*"element" + 0.004*"signal" + 0.004*"pattern" + 0.004*"design" + 0.004*"memory" + 0.004*"parallel" 

Topic #2:
0.030*"image" + 0.012*"object" + 0.011*"feature" + 0.006*"pixel" + 0.006*"visual" + 0.005*"representation" + 0.005*"recognition" + 0.005*"unit" + 0.005*"motion" + 0.005*"face" + 0.005*"task" + 0.004*"view" + 0.004*"layer" + 0.004*"human" + 0.004*"training" + 0.004*"position" + 0.004*"location" + 0.004*"region" + 0.004*"character" + 0.003*"vector" 

Topic #3:
0.020*"neuron" + 0.017*"cell" + 0.012*"response" + 0.010*"stimulus" + 0.007*"spike" + 0.007*"signal" + 0.006*"activity" + 0.006*"synaptic" + 0.005*"firing" + 0.005*"frequency" + 0.005*"pattern" + 0.004*"current" + 0.004*"effect" + 0.004*

I skipped the LDA Models with MALLET since it was basically a repeat of the last section with minimal new information.

## Latent Dirichlet Allocation Tuning - Starting on Page 402
WARNING: this file takes a very long time to run - well over 90 minutes. <BR>

Note that this code will also not work in a `.py` file without being enclosed in a `main()` function.

In [14]:
TOTAL_TOPICS = 10
# NOTE the use of multicore to speed things up.
from tqdm import tqdm
def topic_model_coherence_generator(corpus, texts, dictionary, start_topic_count=2,
                                    end_topic_count=10, step=1, workers=2):
    models = []
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count + 1, step)):
        lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=dictionary,
                                               chunksize=1740,
                                               random_state=42, iterations=500,
                                               num_topics=topic_nums, passes=20,
                                               eval_every=None, workers=workers)
        cv_coherence_model_lda = \
            gensim.models.CoherenceModel(model=lda_model, corpus=corpus,
                                         texts=texts, dictionary=dictionary,
                                         coherence='c_v')
        coherence_score = cv_coherence_model_lda.get_coherence()
        coherence_scores.append(coherence_score)
        models.append(lda_model)
    return models, coherence_scores

# changed end_topic_count to 20 from 30 to speed things up
lda_models, coherence_scores = topic_model_coherence_generator(bow_corpus, norm_corpus_bigrams,
                                                                dictionary, start_topic_count=2,
                                                                end_topic_count=30, step=1)

100%|██████████| 19/19 [1:53:17<00:00, 381.50s/it]


ValueError: arrays must all be same length

In [23]:
print(range(2, 31, 1))
print(lda_models)
print(type(coherence_scores))
rounded_scores = np.round(coherence_scores, 4)
print(rounded_scores)
for this_score in coherence_scores:
    print(np.round(this_score, 4))
coherence_df = pd.DataFrame({'Number of Topics': range(2, 31, 1), 'Coherence Score': np.round(coherence_scores, 4)})
print(coherence_df.sort_values(by=['Coherence Score'], ascending=False).head(10))

range(2, 31)
[<gensim.models.ldamulticore.LdaMulticore object at 0x0000025994E68EB8>, <gensim.models.ldamulticore.LdaMulticore object at 0x0000025986978A58>, <gensim.models.ldamulticore.LdaMulticore object at 0x00000259E75439E8>, <gensim.models.ldamulticore.LdaMulticore object at 0x00000259F4F0D5C0>, <gensim.models.ldamulticore.LdaMulticore object at 0x0000025995E44438>, <gensim.models.ldamulticore.LdaMulticore object at 0x00000259E7D0A0B8>, <gensim.models.ldamulticore.LdaMulticore object at 0x00000259E7D0A198>, <gensim.models.ldamulticore.LdaMulticore object at 0x00000259903E32B0>, <gensim.models.ldamulticore.LdaMulticore object at 0x00000259903E31D0>, <gensim.models.ldamulticore.LdaMulticore object at 0x0000025994D7AC50>, <gensim.models.ldamulticore.LdaMulticore object at 0x00000259952F07B8>, <gensim.models.ldamulticore.LdaMulticore object at 0x00000259E7D8C400>, <gensim.models.ldamulticore.LdaMulticore object at 0x00000259955B5E10>, <gensim.models.ldamulticore.LdaMulticore object at

ValueError: arrays must all be same length

## Plot and Outputs - Starting on Page 404

In [None]:

import matplotlib.pyplot as plt
%matplotlib inline

plt.style.use('fivethirtyeight')

x_ax = range(2, 31, 1)
y_ax = coherence_scores
plt.figure(figsize=(12, 6))
plt.plot(x_ax, y_ax, c='r')
plt.axhline(y=0.535, c='k', linestyle='--', linewidth=2)
plt.rcParams['figure.facecolor'] = 'white'
xl = plt.xlabel('Number of Topics')
y1 = plt.ylabel('Coherence Score')
plt.show()

# page 405
best_model_idx = coherence_df[coherence_df['Number of Topics'] == 20].index[0]
best_lda_model = lda_models[best_model_idx]
print('Number of topics for best LDA model:', best_lda_model.num_topics, '\n')

topics = [[(term, round(wt, 3)) for term, wt in best_lda_model.show_topic(n, topn=20)]
            for n in range(0, best_lda_model.num_topics)]

for idx, topic in enumerate(topics):
    print('Topic #' + str(idx+1) + ':')
    print([term for term, wt in topic])
print('\n')

# page 407
topics_df = pd.DataFrame([[term for term in topic] for topic in topics],
                            columns = ['Term' + str(i) for i in range(1, 21)],
                            index=['Topic ' + str(t) for t in range(1, best_lda_model.num_topics+1)]).T
print(topics_df, '\n')

# page 408
pd.set_option('display.max_colwidth', -1)
topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                            for topic in topics], columns = ['Terms per Topic'],
                            index=['Topic' + str(t)
                                for t in range(1, best_lda_model.num_topics+1)])
print(topics_df, '\n')

## Interpreting model results - starting on page 409

In [None]:
tm_results = best_lda_model[bow_corpus]
corpus_topics = [sorted(topics, key=lambda record: -record[1])[0]
                    for topics in tm_results]
print('First five topics:', corpus_topics[:5],'\n')

corpus_topic_df = pd.DataFrame()
corpus_topic_df['Document'] = range(0, len(papers))
corpus_topic_df['Dominant Topic'] = [item[0]+1 for item in corpus_topics]
corpus_topic_df['Contribution %'] = [round(item[1]*100, 2) for item in corpus_topics]
corpus_topic_df['Topic Desc'] = [topics_df.iloc[t[0]]['Terms per Topic']
                                    for t in corpus_topics]
corpus_topic_df['Paper'] = papers

## Dominant Topics

### Distribution Across Corpus - starting on page 410

In [None]:
pd.set_option('display.max_colwidth', 200)
topics_stats_df = corpus_topic_df.groupby('Dominant Topic').agg({'Dominant Topic': {
    'Doc Count': np.size, '% Total Docs': np.size}})
topics_stats_df = topics_stats_df['Dominant Topic'].reset_index()
topics_stats_df['% Total Docs'] = topics_stats_df['% Total Docs'].apply(
    lambda row: round((row*100) / len(papers), 2))
topics_stats_df['Topic Desc'] = [topics_df.iloc[t]['Terms per Topic']
                                    for t in range(len(topics_stats_df))]
print('Topic Status DF:\n', topics_stats_df)

# Dominant Topics in Specific Research Papers - page 412
pd.set_option('display.max_colwidth', 200)
print(corpus_topic_df[corpus_topic_df['Document'].isin([681, 9, 392, 1622, 17, 906,
                                                    996, 503, 13, 733])], '\n')

# Relevant Research Papers per Topic Based on Dominance - page 413
print(corpus_topic_df.groupby('Dominant Topic').apply(
    lambda topic_set: (topic_set.sort_values(by=['Contribution %'],
                                                ascending=False).iloc[0])))

## Predicting Topics - Starting on Page 415

In [6]:
import pandas as pd
import pickle # only required if you start here
# This is the author's normalization file - best to use for this one script
from normalization import normalize_corpus\

ModuleNotFoundError: No module named 'HTMLParser'

### Import Models (if you want to start here)

In [7]:
# Load from earlier work
with open("data/bigram_model.txt", "rb") as fp:
    bigram_model = pickle.load(fp)
with open("data/dictionary.txt", "rb") as fp:
    dictionary = pickle.load(fp)
with open("data/best_lda_model.txt", "rb") as fp:
    best_lda_model = pickle.load(fp)
with open("data/topics_df.txt", "rb") as fp:
    topics_df = pickle.load(fp)

FileNotFoundError: [Errno 2] No such file or directory: 'data/best_lda_model.txt'

### Predicting Topics for New Research Papers

In [8]:
import glob
new_paper_files = glob.glob('nipstxt/nips16/nips16*.txt')
new_papers = []

for fn in new_paper_files:
    with open(fn, encoding='utf-8', errors='ignore', mode='r+') as f:
        data = f.read()
        new_papers.append(data)

print('Total New Papers:', len(new_papers), '\n')

def text_preprocessing_pipeline(documents, normalizer_fn, bigram_model):
    norm_docs = normalizer_fn(documents)
    norm_docs_bigrams = bigram_model[norm_docs]
    return norm_docs_bigrams

def bow_features_pipeline(tokenized_docs, dictionary):
    paper_bow_features = [dictionary.doc2bow(text) for text in
                          tokenized_docs]
    return paper_bow_features

norm_new_papers = text_preprocessing_pipeline(documents=new_papers, normalizer_fn=normalize_corpus,
                                              bigram_model=bigram_model)
norm_bow_features = bow_features_pipeline(tokenized_docs=norm_new_papers, dictionary=dictionary)

print(norm_new_papers[0][:30], '\n')
print(norm_bow_features[0][:30], '\n')

# page 416
def get_topic_predictions(topic_model, corpus, topn=3):
    topic_predictions = topic_model[corpus]
    best_topics = [[(topic, round(wt, 3))
                    for topic, wt in sorted(topic_predictions[i],
                                            key=lambda row: -row[i])[:topn]]
                    for i in range(len(topic_predictions))]
    return best_topics

# putting the function in action
topic_preds = get_topic_predictions(topic_model=best_lda_model,
                                    corpus=norm_bow_features, topn=2)
print('Topic Predictions\n', topic_preds)

# page 417
results_df = pd.DataFrame()
results_df['Papers'] = range(1, len(new_papers)+1)
results_df['Dominant Topics'] = [[top_num+1 for top_num, wt in item]
                                 for item in topic_preds]
res = results_df.set_index(['Papers'])['Dominant Topics']\
    .apply(pd.Series).stack().reset_index(level=1, drop=True)
results_df = pd.DataFrame({'Dominant Topics': res.values}, index=res.index)
results_df['Contribution %'] = [topic_wt for topic_list in
                                [[round(wt*100, 2) for topic_num, wt in item]
                                 for item in topic_preds]
                                for topic_wt in topic_list]
results_df['Topic Desc'] = [topics_df.iloc[t-1]['Terms per Topic']
                            for t in results_df['Dominant Topics'].values]
results_df['Paper Desc'] = [new_papers[i-1][:200]
                            for i in results_df.index_values]
pd.set_option('display.max_colwidth', 300)
print('Results for each paper\n', results_df, '\n')

Total New Papers: 4 



NameError: name 'normalize_corpus' is not defined

## Automated Document Summarization - starting on page 436

In [9]:
# I took the description from:
# Wikipedia. (2020). The Elder Scross V: Skyrim. https://en.wikipedia.org/wiki/The_Elder_Scrolls_V:_Skyrim
DOCUMENT = """
The Elder Scrolls V: Skyrim is an action role-playing video game developed by Bethesda Game Studios and published by 
ethesda Softworks. It is the fifth main installment in The Elder Scrolls series, following The Elder Scrolls IV: 
Oblivion, and was released worldwide for Microsoft Windows, PlayStation 3, and Xbox 360 on November 11, 2011.

The game's main story revolves around the player's character, the Dragonborn, on their quest to defeat Alduin the 
World-Eater, a dragon who is prophesied to destroy the world. The game is set 200 years after the events of Oblivion 
and takes place in Skyrim, the northernmost province of Tamriel. Over the course of the game, the player completes 
quests and develops the character by improving skills. The game continues the open-world tradition of its 
predecessors by allowing the player to travel anywhere in the game world at any time, and to ignore or postpone the 
main storyline indefinitely.

Skyrim was developed using the Creation Engine, rebuilt specifically for the game. The team opted for a unique and 
more diverse open world than Oblivion's Imperial Province of Cyrodiil, which game director and executive producer 
Todd Howard considered less interesting by comparison. The game was released to critical acclaim, with reviewers 
particularly mentioning the character advancement and setting, and is considered to be one of the greatest video 
games of all time. Nonetheless it received some criticism, predominantly for its melee combat and numerous 
technical issues present at launch. The game shipped over seven million copies to retailers within the first week 
of its release, and over 30 million copies on all platforms as of November 2016, making it one of the best selling 
video games in history.

Three downloadable content (DLC) add-ons were released—Dawnguard, Hearthfire, and Dragonborn—which were repackaged 
into The Elder Scrolls V: Skyrim – Legendary Edition and released in June 2013. The Elder Scrolls V: Skyrim – 
Special Edition is a remastered version of the game released for Windows, Xbox One, and PlayStation 4 in October 
2016. It includes all three DLC expansions and a graphical upgrade, along with additional features such as modding 
capabilities on consoles. Versions were released in November 2017 for the Nintendo Switch and PlayStation VR, and 
a stand-alone virtual reality (VR) version for Windows was released in April 2018. These versions were based on 
the remastered release, but the Switch version's graphics upgrade was relative to its hardware capabilities, and 
it did not include the modding features.
"""

# page 438
import re
DOCUMENT = re.sub(r'\n|\r', ' ', DOCUMENT)
DOCUMENT = re.sub(r' +', ' ', DOCUMENT)
DOCUMENT = DOCUMENT.strip()

from gensim.summarization import summarize
print('Summarized document:\n', summarize(DOCUMENT, ration=0.2, split=False), '\n')
print('Limited document summary\n', summarize(DOCUMENT, word_count=75, split=False), '\n')

# Text Wrangling - page 439
import nltk
import numpy as np

stop_words = nltk.corpus.stopwords.words('english')

TypeError: summarize() got an unexpected keyword argument 'ration'