# Text Summarization and Topic Models
* Text Summarization and Information Extraction
* Important Concepts
* Keyphrase Extraction
    1. Collocations
    2. Weighted Tag-Based Phrase Extraction
* Topic Modeling on Research Papers
    1. The Main Objective
    2. Data Retrieval
    3. Load and View Dataset
    4. Basic Text Wrangling
* Topic Models with Gensim
    1. Text Representation with Feature Engineering
    2. Latent Semantic Indexing
    3. Implementing LSI Topic Models from Scratch
    4. Latent Dirichlet Allocation
    5. LDA Models with MALLET
    6. LDA Tuning: Finding the Optimal Number of Topics
    7. Interpreting Topic Model Results
    8. Predicting Topics for New Research Papers
* Topic Models with Scikit-Learn
    1. Text Representation with Feature Engineering
    2. Latent Semantic Indexing
    3. Latent Dirichlet Allocation
    4. Non-Negative Matrix Factorization
    5. Predicting Topics for New Research Papers
    6. Visualizing Topic Models
* Automated Document Summarization
    1. Text Wrangling
    2. Text Representation with Feature Engineering
    3. Latent Semantic Analysis
    4. TextRank

In [12]:
# if spacy doesn't run
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [13]:
# if nltk error
#import nltk
#nltk.download('all')

In [14]:
# import gensim
#!pip install gensim

In [99]:
# find current working directory
import os
os.getcwd()

# for azure ml
path_to_users = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/bellepracticevm/code/Users'

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/bellepracticevm/code/Users/LearningCode/NLP_Learning'

## Important Concepts

In [16]:
# extract the top k singular values and return the corresponding U, S, and V^T matrices
from scipy.sparse.linalg import svds

def low_rank_svd(matrix, singular_count=2):
    """Compute a truncated SVD of ``matrix`` with ``scipy.sparse.linalg.svds``.

    Args:
        matrix: Matrix to decompose; handed to ``svds`` unchanged.
        singular_count: Number of singular values/vectors requested
            (forwarded as ``k``).

    Returns:
        The ``(u, s, vt)`` triple produced by ``svds``.
    """
    left_vectors, singular_values, right_vectors_t = svds(matrix, k=singular_count)
    return left_vectors, singular_values, right_vectors_t

## Keyphrase Extraction

In [17]:
## Collocations
from nltk.corpus import gutenberg
import text_normalizer as tn
import nltk
from operator import itemgetter

# load corpus
alice = gutenberg.sents(fileids='carroll-alice.txt')
alice = [' '.join(ts) for ts in alice]
norm_alice = list(filter(None,
                         tn.normalize_corpus(alice, text_lemmatization=False)))

# print and compare first line
print(alice[0], '\n', norm_alice[0])

[ Alice ' s Adventures in Wonderland by Lewis Carroll 1865 ] 
 alice adventures wonderland lewis carroll


In [18]:
def compute_ngrams(sequence, n):
    """Return every run of ``n`` consecutive items of ``sequence`` as a tuple.

    Args:
        sequence: Sliceable sequence of items (e.g. a list of tokens).
        n: Size of each n-gram.

    Returns:
        A list of ``n``-tuples in order of appearance; empty when the
        sequence is shorter than ``n`` or when ``n`` is not positive.
    """
    # One copy of the sequence per n-gram position, each shifted one further.
    shifted_views = [sequence[offset:] for offset in range(n)]
    # zip stops at the shortest view, which leaves exactly the complete n-grams.
    return list(zip(*shifted_views))

# test function
compute_ngrams([1,2,3,4], 2) # bi-grams
compute_ngrams([1,2,3,4], 3) # tri-grams

[(1, 2, 3), (2, 3, 4)]

In [19]:
# function to flatten corpus into one big string of text
def flatten_corpus(corpus):
    """Merge a corpus into one string, documents separated by a single space.

    Args:
        corpus: Iterable of document strings.

    Returns:
        The documents, each stripped of leading/trailing whitespace, joined
        with ``' '``.
    """
    stripped_documents = (text.strip() for text in corpus)
    return ' '.join(stripped_documents)

# get top n-grams for corpus of text
def get_top_ngrams(corpus, ngram_val=1, limit=5):
    """Return the most frequent n-grams of a corpus.

    The corpus is flattened into one string, tokenized with
    ``nltk.word_tokenize`` and counted with ``nltk.FreqDist``.

    Args:
        corpus: Iterable of document strings.
        ngram_val: Size of the n-grams to count.
        limit: Maximum number of n-grams returned.

    Returns:
        A list of ``(ngram_text, frequency)`` tuples sorted by descending
        frequency, where ``ngram_text`` is the n-gram's tokens joined by a
        space.
    """
    tokens = nltk.word_tokenize(flatten_corpus(corpus))
    frequency_table = nltk.FreqDist(compute_ngrams(tokens, ngram_val))

    # Stable sort on the count: n-grams with equal counts keep the order in
    # which the frequency table yields them.
    ranked = sorted(frequency_table.items(), key=itemgetter(1), reverse=True)

    top_ngrams = []
    for gram, count in ranked[:limit]:
        top_ngrams.append((' '.join(gram), count))
    return top_ngrams

In [20]:
# top 10 bigrams
get_top_ngrams(corpus=norm_alice, ngram_val=2, limit=10)

[('said alice', 123),
 ('mock turtle', 56),
 ('march hare', 31),
 ('said king', 29),
 ('thought alice', 26),
 ('white rabbit', 22),
 ('said hatter', 22),
 ('said mock', 20),
 ('said caterpillar', 18),
 ('said gryphon', 18)]

In [21]:
# top 10 trigrams
get_top_ngrams(corpus=norm_alice, ngram_val=3, limit=10)

[('said mock turtle', 20),
 ('said march hare', 10),
 ('poor little thing', 6),
 ('little golden key', 5),
 ('certainly said alice', 5),
 ('white kid gloves', 5),
 ('march hare said', 5),
 ('mock turtle said', 5),
 ('know said alice', 4),
 ('might well say', 4)]

In [22]:
# use NLTK's collocation finders
# bigrams
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents([item.split() for item in norm_alice])
finder

<nltk.collocations.BigramCollocationFinder at 0x7f351d3db6d8>

In [23]:
bigram_measures = BigramAssocMeasures()

# raw frequencies
finder.nbest(bigram_measures.raw_freq, 10)

[('said', 'alice'),
 ('mock', 'turtle'),
 ('march', 'hare'),
 ('said', 'king'),
 ('thought', 'alice'),
 ('said', 'hatter'),
 ('white', 'rabbit'),
 ('said', 'mock'),
 ('said', 'caterpillar'),
 ('said', 'gryphon')]

In [24]:
# pointwise mutual information
finder.nbest(bigram_measures.pmi, 10)

[('abide', 'figures'),
 ('acceptance', 'elegant'),
 ('accounting', 'tastes'),
 ('accustomed', 'usurpation'),
 ('act', 'crawling'),
 ('adjourn', 'immediate'),
 ('adoption', 'energetic'),
 ('affair', 'trusts'),
 ('agony', 'terror'),
 ('alarmed', 'proposal')]

In [25]:
# trigrams
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents([item.split() for item in norm_alice])

trigram_measures = TrigramAssocMeasures()

In [26]:
# raw frequencies
finder.nbest(trigram_measures.raw_freq, 10)

[('said', 'mock', 'turtle'),
 ('said', 'march', 'hare'),
 ('poor', 'little', 'thing'),
 ('little', 'golden', 'key'),
 ('march', 'hare', 'said'),
 ('mock', 'turtle', 'said'),
 ('white', 'kid', 'gloves'),
 ('beau', 'ootiful', 'soo'),
 ('certainly', 'said', 'alice'),
 ('might', 'well', 'say')]

In [27]:
# pointwise mutual information
finder.nbest(trigram_measures.pmi, 10)

[('accustomed', 'usurpation', 'conquest'),
 ('adjourn', 'immediate', 'adoption'),
 ('adoption', 'energetic', 'remedies'),
 ('ancient', 'modern', 'seaography'),
 ('apple', 'roast', 'turkey'),
 ('arithmetic', 'ambition', 'distraction'),
 ('brother', 'latin', 'grammar'),
 ('canvas', 'bag', 'tied'),
 ('cherry', 'tart', 'custard'),
 ('circle', 'exact', 'shape')]

In [28]:
## Weighted Tag-Based Phrase Extraction
# The file is only read, so open it read-only (the original 'r+' also demanded
# write permission) and let the context manager close the handle right away.
with open('data/elephants.txt', 'r') as elephants_file:
    data = elephants_file.readlines()
# Only the first line of the file is split into sentences.
sentences = nltk.sent_tokenize(data[0])
len(sentences)

29

In [29]:
# view the first three sentences
sentences[:3]

['Elephants are large mammals of the family Elephantidae and the order Proboscidea.',
 'Three species are currently recognised: the African bush elephant (Loxodonta africana), the African forest elephant (L. cyclotis), and the Asian elephant (Elephas maximus).',
 'Elephants are scattered throughout sub-Saharan Africa, South Asia, and Southeast Asia.']

In [30]:
norm_sentences = tn.normalize_corpus(sentences, text_lower_case=False, text_stemming=False,
                                     text_lemmatization=False, stopword_removal=False)
norm_sentences[:3]

['Elephants are large mammals of the family Elephantidae and the order Proboscidea',
 'Three species are currently recognised the African bush elephant Loxodonta africana the African forest elephant L cyclotis and the Asian elephant Elephas maximus',
 'Elephants are scattered throughout subSaharan Africa South Asia and Southeast Asia']

In [31]:
import itertools
stopwords = nltk.corpus.stopwords.words('english')

def get_chunks(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}', stopword_list=stopwords):
    """Extract stop-word-free chunk phrases from each sentence.

    Every sentence is tokenized, POS-tagged and parsed with a
    ``RegexpParser`` built from ``grammar``. The parse is flattened into
    (word, POS tag, chunk tag) triples, and each run of consecutive tokens
    whose chunk tag is not ``'O'`` becomes one phrase.

    Args:
        sentences: Iterable of sentence strings.
        grammar: Chunk grammar handed to ``nltk.chunk.regexp.RegexpParser``.
        stopword_list: Words removed from the phrases; compared against the
            lower-cased token.

    Returns:
        A list with one entry per sentence. Each entry is the list of
        lower-cased phrases found in that sentence, in order of appearance.
        A phrase made up solely of stop words is kept as an empty string.
    """
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    # Build the lookup once: set membership is O(1), whereas the list was
    # scanned again for every token.
    stopword_set = set(stopword_list)

    all_chunks = []
    for sentence in sentences:
        tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))
        wtc_triples = nltk.chunk.tree2conlltags(chunker.parse(tagged_sentence))

        phrases = []
        # Group on "is part of a chunk"; chunks that directly follow each
        # other (no 'O' token in between) therefore end up in one phrase.
        for in_chunk, wtc_group in itertools.groupby(
                wtc_triples, key=lambda word_pos_chunk: word_pos_chunk[2] != 'O'):
            if not in_chunk:
                continue
            phrases.append(' '.join(word.lower()
                                    for word, _pos, _chunk_tag in wtc_group
                                    if word.lower() not in stopword_set))
        all_chunks.append(phrases)
    return all_chunks

In [32]:
chunks = get_chunks(norm_sentences)
chunks

[['elephants', 'large mammals', 'family elephantidae', 'order proboscidea'],
 ['species',
  'african bush elephant loxodonta',
  'african forest elephant l cyclotis',
  'asian elephant elephas maximus'],
 ['elephants', 'subsaharan africa south asia', 'southeast asia'],
 ['elephantidae',
  'family',
  'order proboscidea',
  'extinct members',
  'order',
  'deinotheres gomphotheres mammoths',
  'mastodons'],
 ['elephants',
  'several distinctive features',
  'long trunk',
  'proboscis',
  'many purposes',
  'water',
  'grasping objects'],
 ['incisors', 'tusks', 'weapons', 'tools', 'objects'],
 ['elephants', 'flaps', 'body temperature'],
 ['pillarlike legs', 'great weight'],
 ['african elephants',
  'ears',
  'backs',
  'asian elephants',
  'ears',
  'convex',
  'level backs'],
 ['elephants', 'different habitats', 'savannahs forests deserts', 'marshes'],
 ['water'],
 ['keystone species', 'impact', 'environments'],
 ['animals',
  'distance',
  'elephants',
  'predators',
  'lions tigers hy

In [33]:
from gensim import corpora, models

def get_tfidf_weighted_keyphrases(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}', top_n=10):
    """Rank the chunk phrases of ``sentences`` by TF-IDF weight.

    Phrases come from ``get_chunks``; each sentence's phrase list is treated
    as one document for a gensim ``Dictionary`` / ``TfidfModel``.

    Args:
        sentences: Iterable of sentence strings.
        grammar: Chunk grammar forwarded to ``get_chunks``.
        top_n: Number of phrases to return.

    Returns:
        A list of at most ``top_n`` ``(phrase, weight)`` tuples, weights
        rounded to three decimals, sorted by descending weight.
    """
    phrase_docs = get_chunks(sentences, grammar=grammar)

    phrase_vocab = corpora.Dictionary(phrase_docs)
    bow_docs = [phrase_vocab.doc2bow(phrases) for phrases in phrase_docs]
    tfidf_docs = models.TfidfModel(bow_docs)[bow_docs]

    # One weight per phrase: if a phrase occurs in several sentences, the
    # weight from the last of those sentences is the one that is kept.
    phrase_weights = {}
    for tfidf_doc in tfidf_docs:
        for phrase_id, weight in tfidf_doc:
            phrase_weights[phrase_vocab.get(phrase_id)] = weight

    ranked = sorted(phrase_weights.items(), key=itemgetter(1), reverse=True)
    rounded = [(phrase, round(weight, 3)) for phrase, weight in ranked]
    return rounded[:top_n]

In [34]:
# top 30 tf-idf weighted keyphrases
get_tfidf_weighted_keyphrases(sentences=norm_sentences, top_n=30)

[('water', 1.0),
 ('asia', 0.807),
 ('wild', 0.764),
 ('great weight', 0.707),
 ('pillarlike legs', 0.707),
 ('southeast asia', 0.693),
 ('subsaharan africa south asia', 0.693),
 ('body temperature', 0.693),
 ('flaps', 0.693),
 ('fissionfusion society', 0.693),
 ('multiple family groups', 0.693),
 ('art folklore religion literature', 0.693),
 ('popular culture', 0.693),
 ('ears', 0.681),
 ('males', 0.653),
 ('males bulls', 0.653),
 ('family elephantidae', 0.607),
 ('large mammals', 0.607),
 ('years', 0.607),
 ('environments', 0.577),
 ('impact', 0.577),
 ('keystone species', 0.577),
 ('cetaceans', 0.577),
 ('elephant intelligence', 0.577),
 ('primates', 0.577),
 ('dead individuals', 0.577),
 ('kind', 0.577),
 ('selfawareness', 0.577),
 ('different habitats', 0.57),
 ('marshes', 0.57)]

In [35]:
# NOTE(review): the gensim.summarization package appears to have been removed
# in gensim 4.x — confirm the pinned gensim version before re-running this cell.
from gensim.summarization import keywords

# Extract keywords from the first element of `data`; with scores=True the
# result is unpacked below as (keyword, score) pairs.
key_words = keywords(data[0], ratio=1.0, scores=True, lemmatize=True)
# round the scores to three decimals and show the first 25 entries
[(item, round(score,3)) for item, score in key_words][:25]

[('african bush elephant', 0.261),
 ('including', 0.141),
 ('family', 0.137),
 ('cow', 0.124),
 ('forests', 0.108),
 ('female', 0.103),
 ('asia', 0.102),
 ('objects', 0.098),
 ('sight', 0.098),
 ('ivory', 0.098),
 ('tigers', 0.098),
 ('males', 0.088),
 ('religion', 0.087),
 ('folklore', 0.087),
 ('known', 0.087),
 ('larger ears', 0.085),
 ('water', 0.075),
 ('highly recognisable', 0.075),
 ('breathing lifting', 0.074),
 ('flaps', 0.073),
 ('africa', 0.072),
 ('gomphotheres', 0.072),
 ('animals tend', 0.071),
 ('success', 0.071),
 ('south', 0.07)]

## Topic Modeling on Research Papers

In [36]:
## Data Retrieval
#!wget https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz

In [37]:
# extract dataset
#!tar -xzf nips12raw_str602.tgz

In [40]:
import os
import numpy as np
import pandas as pd

DATA_PATH = path_to_users + '/nipstxt/'
print(os.listdir(DATA_PATH))

['nips00', 'nips01', 'nips02', 'nips03', 'nips04', 'nips05', 'nips06', 'nips07', 'nips08', 'nips09', 'nips10', 'nips11', 'nips12']


In [41]:
## Load and View Dataset
folders = ["nips{0:02}".format(i) for i in range(0,13)]
# read all texts into a list
papers = []
for folder in folders:
    folder_path = os.path.join(DATA_PATH, folder)
    # NOTE(review): os.listdir returns names in arbitrary order, so the paper
    # indices used in later cells depend on the filesystem — consider sorted().
    for file_name in os.listdir(folder_path):
        # Plain read mode: the papers are only read, so the write permission
        # that 'r+' demanded is not needed.
        with open(os.path.join(folder_path, file_name), mode='r',
                  encoding='utf-8', errors='ignore') as f:
            data = f.read()
        papers.append(data)
len(papers)

1719

In [42]:
print(papers[0][:1000])

1 
CONNECTIVITY VERSUS ENTROPY 
Yaser S. Abu-Mostafa 
California Institute of Technology 
Pasadena, CA 91125 
ABSTRACT 
How does the connectivity of a neural network (number of synapses per 
neuron) relate to the complexity of the problems it can handle (measured by 
the entropy)? Switching theory would suggest no relation at all, since all Boolean 
functions can be implemented using a circuit with very low connectivity (e.g., 
using two-input NAND gates). However, for a network that learns a problem 
from examples using a local learning rule, we prove that the entropy of the 
problem becomes a lower bound for the connectivity of the network. 
INTRODUCTION 
The most distinguishing feature of neural networks is their ability to spon- 
taneously learn the desired function from 'training' samples, i.e., their ability 
to program themselves. Clearly, a given neural network cannot just learn any 
function, there must be some restrictions on which networks can learn which 
functions. One obv

In [43]:
%%time
## Basic Text Wrangling
import nltk

stop_words = nltk.corpus.stopwords.words('english')
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalize_corpus(papers):
    """Tokenize, lemmatize and filter every paper into a list of terms.

    Each paper is lower-cased and split with the module-level ``wtk``
    tokenizer. Purely numeric tokens are dropped, the remaining tokens are
    lemmatized with ``wnl``, and lemmas that are a single character long or
    listed in ``stop_words`` are discarded.

    Args:
        papers: Iterable of raw paper texts (strings).

    Returns:
        A list of token lists, one per paper that still has at least one
        token after filtering; papers that end up empty are omitted.
    """
    # Set membership is O(1), instead of scanning the stop-word list for
    # every token of every paper.
    stop_word_set = set(stop_words)

    norm_papers = []
    for paper in papers:
        paper_tokens = []
        for token in wtk.tokenize(paper.lower()):
            token = token.strip()
            if token.isnumeric():
                continue
            # NOTE(review): lemmatizing before the stop-word check lets
            # inflected stop words through (e.g. 'does' -> 'doe'); the order
            # is kept so the resulting vocabulary stays the same.
            lemma = wnl.lemmatize(token)
            if len(lemma) > 1 and lemma not in stop_word_set:
                paper_tokens.append(lemma)
        if paper_tokens:
            norm_papers.append(paper_tokens)

    return norm_papers

norm_papers = normalize_corpus(papers)
print(len(norm_papers))

1719
CPU times: user 23.9 s, sys: 51.7 ms, total: 23.9 s
Wall time: 23.9 s


In [44]:
# viewing a processed paper
print(norm_papers[0][:50])

['connectivity', 'versus', 'entropy', 'yaser', 'abu', 'mostafa', 'california', 'institute', 'technology', 'pasadena', 'ca', 'abstract', 'doe', 'connectivity', 'neural', 'network', 'number', 'synapsis', 'per', 'neuron', 'relate', 'complexity', 'problem', 'handle', 'measured', 'entropy', 'switching', 'theory', 'would', 'suggest', 'relation', 'since', 'boolean', 'function', 'implemented', 'using', 'circuit', 'low', 'connectivity', 'using', 'two', 'input', 'nand', 'gate', 'however', 'network', 'learns', 'problem', 'example', 'using']


## Topic Models with Gensim

In [45]:
## Text Representation with Feature Engineering
import gensim

# Learn which adjacent tokens should be merged into one phrase token joined
# with '_' (e.g. 'neural_network' in the sample output below).
# NOTE(review): delimiter is passed as bytes; newer gensim releases appear to
# expect a str here — confirm against the installed gensim version.
bigram = gensim.models.Phrases(norm_papers, min_count=20, 
                               threshold=20, delimiter=b'_') # higher threshold fewer phrases
# Phraser wraps the trained Phrases model; this is the object applied to the
# tokenized papers in the following lines.
bigram_model = gensim.models.phrases.Phraser(bigram)

# sample demonstration
print(bigram_model[norm_papers[0]][:50])

['connectivity', 'versus', 'entropy', 'yaser', 'abu_mostafa', 'california_institute', 'technology_pasadena', 'ca_abstract', 'doe', 'connectivity', 'neural_network', 'number', 'synapsis', 'per', 'neuron', 'relate', 'complexity', 'problem', 'handle', 'measured', 'entropy', 'switching', 'theory', 'would', 'suggest', 'relation', 'since', 'boolean_function', 'implemented', 'using', 'circuit', 'low', 'connectivity', 'using', 'two', 'input', 'nand', 'gate', 'however', 'network', 'learns', 'problem', 'example', 'using', 'local', 'learning', 'rule', 'prove', 'entropy', 'problem']


In [46]:
norm_corpus_bigrams = [bigram_model[doc] for doc in norm_papers]

# create a dictionary representation of the documents
dictionary = gensim.corpora.Dictionary(norm_corpus_bigrams)
print('Sample word to number mappings:', list(dictionary.items())[:15])
print('Total Vocabulary Size', len(dictionary))

Sample word to number mappings: [(0, '0a'), (1, '2h'), (2, '2h2'), (3, '2he'), (4, '2n'), (5, '__c'), (6, '_c'), (7, '_k'), (8, 'a2'), (9, 'ability'), (10, 'abu_mostafa'), (11, 'access'), (12, 'accommodate'), (13, 'according'), (14, 'accumulated')]
Total Vocabulary Size 78252


In [47]:
# filter out words that occur in fewer than 20 documents, or in more than 60% of the documents
dictionary.filter_extremes(no_below=20, no_above=0.6)
print('Total Vocabulary Size:', len(dictionary))

Total Vocabulary Size: 7692


In [48]:
# transforming corpus into bag of words vectors
bow_corpus = [dictionary.doc2bow(text) for text in norm_corpus_bigrams]
print(bow_corpus[1][:50])

[(3, 1), (12, 3), (14, 1), (15, 1), (16, 1), (17, 16), (20, 1), (24, 1), (26, 1), (31, 3), (35, 1), (36, 1), (40, 3), (41, 5), (42, 1), (48, 1), (53, 3), (55, 1), (56, 2), (58, 1), (60, 3), (63, 5), (64, 4), (65, 2), (73, 1), (74, 1), (75, 1), (76, 1), (77, 3), (82, 1), (83, 4), (84, 1), (85, 1), (86, 2), (93, 1), (95, 2), (96, 3), (105, 1), (109, 1), (118, 2), (119, 4), (120, 2), (123, 2), (126, 1), (127, 1), (131, 1), (132, 1), (134, 6), (135, 1), (143, 1)]


In [49]:
# viewing actual terms and their counts
print([(dictionary[idx], freq) for idx, freq in bow_corpus[1][:50]])

[('ability', 1), ('aip', 3), ('although', 1), ('american_institute', 1), ('amount', 1), ('analog', 16), ('appears', 1), ('architecture', 1), ('aspect', 1), ('available', 3), ('become', 1), ('becomes', 1), ('binary', 3), ('biological', 5), ('bit', 1), ('cannot', 1), ('circuit', 3), ('collective', 1), ('compare', 2), ('complex', 1), ('computing', 3), ('conference', 5), ('connected', 4), ('connectivity', 2), ('define', 1), ('defined', 1), ('defines', 1), ('definition', 1), ('denker', 3), ('designed', 1), ('desired', 4), ('diagonal', 1), ('difference', 1), ('directly', 2), ('ed', 1), ('el', 2), ('element', 3), ('equivalent', 1), ('eventually', 1), ('feature', 2), ('final', 4), ('find', 2), ('fixed', 2), ('frequency', 1), ('furthermore', 1), ('generating', 1), ('get', 1), ('global', 6), ('go', 1), ('hence', 1)]


In [50]:
# total papers in the corpus
print('Total number of papers:', len(bow_corpus))

Total number of papers: 1719


In [51]:
%%time
## Latent Semantic Indexing
# Number of topics requested from the LSI, SVD and LDA models in this notebook.
TOTAL_TOPICS = 10
# Fit an LSI model on the bag-of-words corpus, using `dictionary` to map term
# ids back to words. chunksize (1740) exceeds the 1,719 papers, so presumably
# the whole corpus is handled as a single chunk.
# NOTE(review): the recorded wall time is ~3 min; power_iters=1000 is likely
# the main cost — confirm before tuning.
lsi_bow = gensim.models.LsiModel(bow_corpus, id2word=dictionary, num_topics=TOTAL_TOPICS, 
                                 onepass=True, chunksize=1740, power_iters=1000)

CPU times: user 5min 57s, sys: 3min 41s, total: 9min 38s
Wall time: 3min 9s


In [52]:
for topic_id, topic in lsi_bow.print_topics(num_topics=10, num_words=20):
    print('Topic #'+str(topic_id+1)+':')
    print(topic)
    print()

Topic #1:
0.217*"unit" + 0.204*"state" + 0.189*"training" + 0.180*"neuron" + 0.164*"pattern" + 0.143*"image" + 0.140*"vector" + 0.123*"feature" + 0.123*"cell" + 0.112*"layer" + 0.100*"task" + 0.095*"class" + 0.090*"probability" + 0.090*"signal" + 0.087*"response" + 0.086*"step" + 0.085*"representation" + 0.083*"rule" + 0.083*"noise" + 0.081*"node"

Topic #2:
-0.501*"neuron" + -0.401*"cell" + 0.206*"training" + 0.193*"state" + -0.190*"response" + -0.172*"stimulus" + -0.117*"activity" + 0.115*"class" + 0.102*"vector" + -0.101*"spike" + -0.099*"circuit" + -0.098*"synaptic" + 0.095*"classifier" + -0.092*"firing" + -0.089*"signal" + -0.086*"pattern" + -0.085*"visual" + 0.084*"word" + -0.078*"cortical" + 0.076*"task"

Topic #3:
-0.626*"state" + 0.411*"image" + 0.210*"feature" + -0.194*"neuron" + -0.188*"action" + 0.144*"object" + 0.139*"unit" + -0.135*"control" + 0.100*"training" + -0.095*"policy" + 0.092*"classifier" + -0.089*"step" + 0.085*"layer" + -0.082*"dynamic" + 0.073*"representation

In [53]:
# Show every LSI topic as two term lists, split by the sign of the term weight.
for n in range(TOTAL_TOPICS):
    print('Topic #{}:'.format(n + 1))
    print('=' * 50)
    d1, d2 = [], []
    for term, wt in lsi_bow.show_topic(n, topn=20):
        # the sign of the raw (unrounded) weight decides the direction
        bucket = d1 if wt >= 0 else d2
        bucket.append((term, round(wt, 3)))
    for label, entries in (('Direction 1:', d1), ('Direction 2:', d2)):
        print(label, entries)
        print('-' * 50)
    print()

Topic #1:
Direction 1: [('unit', 0.217), ('state', 0.204), ('training', 0.189), ('neuron', 0.18), ('pattern', 0.164), ('image', 0.143), ('vector', 0.14), ('feature', 0.123), ('cell', 0.123), ('layer', 0.112), ('task', 0.1), ('class', 0.095), ('probability', 0.09), ('signal', 0.09), ('response', 0.087), ('step', 0.086), ('representation', 0.085), ('rule', 0.083), ('noise', 0.083), ('node', 0.081)]
--------------------------------------------------
Direction 2: []
--------------------------------------------------

Topic #2:
Direction 1: [('training', 0.206), ('state', 0.193), ('class', 0.115), ('vector', 0.102), ('classifier', 0.095), ('word', 0.084), ('task', 0.076)]
--------------------------------------------------
Direction 2: [('neuron', -0.501), ('cell', -0.401), ('response', -0.19), ('stimulus', -0.172), ('activity', -0.117), ('spike', -0.101), ('circuit', -0.099), ('synaptic', -0.098), ('firing', -0.092), ('signal', -0.089), ('pattern', -0.086), ('visual', -0.085), ('cortical', 

In [54]:
# get U, S, VT matrices from topic model
# U: term-topic matrix stored on the model's projection
# (shape (7692, 10) in the recorded output)
term_topic = lsi_bow.projection.u
# S: the singular values, one per topic
singular_values = lsi_bow.projection.s
# VT: project every paper with the model, densify to a (topics x papers)
# array, then divide each topic row by its singular value — the double
# transpose lets the division broadcast across the topic axis.
topic_document = (gensim.matutils.corpus2dense(lsi_bow[bow_corpus], len(singular_values)).T / singular_values).T
term_topic.shape, singular_values.shape, topic_document.shape

((7692, 10), (10,), (10, 1719))

In [55]:
# document topic matrix for our LSI model
document_topics = pd.DataFrame(np.round(topic_document.T, 3), 
                               columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
document_topics.head(5)

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10
0,0.02,-0.02,-0.01,-0.01,-0.02,0.03,0.0,0.02,0.01,-0.01
1,0.04,-0.03,-0.02,0.01,-0.02,0.06,-0.02,-0.01,-0.02,-0.01
2,0.02,-0.0,-0.02,-0.01,-0.01,0.02,-0.01,0.02,0.0,0.01
3,0.03,-0.04,-0.01,0.0,-0.04,0.05,0.02,0.04,0.01,-0.03
4,0.04,0.0,-0.02,0.0,-0.02,0.02,-0.03,0.02,-0.05,0.03


In [56]:
# For a few sample papers, list the three topics with the largest absolute
# weight and print the beginning of the paper text.
document_numbers = [13, 250, 500]
for document_number in document_numbers:
    topic_strengths = np.absolute(document_topics.iloc[document_number].values)
    strongest_three = np.argsort(-topic_strengths)[:3]
    top_topics = list(document_topics.columns[strongest_three])
    print('Document #{}:'.format(document_number))
    print('Dominant Topics (top 3):', top_topics)
    print('Paper Summary:')
    print(papers[document_number][:500])
    print()

Document #13:
Dominant Topics (top 3): ['T3', 'T8', 'T9']
Paper Summary:
137 
On the 
Power of Neural Networks for 
Solving Hard Problems 
Jehoshua Bruck 
Joseph W. Goodman 
Information Systems Laboratory 
Department of Electrical Engineering 
Stanford University 
Stanford, CA 94305 
Abstract 
This paper deals with a neural network model in which each neuron 
performs a threshold logic function. An important property of the model 
is that it always converges to a stable state when operating in a serial 
mode [2,5]. This property is the basis of the potential applicat

Document #250:
Dominant Topics (top 3): ['T9', 'T1', 'T8']
Paper Summary:
542 Kassebaum, Tenorio and Schaefers 
The Cocktail Party Problem: 
Speech/Data Signal Separation Comparison 
between Backpropagation and SONN 
John Kassebaum 
jakec.ecn.purdue.edu 
Manoel Fernando Tenorio 
tenorioee.ecn.purdue.edu 
Chrlstoph Schaefers 
Parallel Distributed Structures Laboratory 
School of Electrical Engineering 
Purdue University 

In [57]:
## Implementing LSI Topic Models from Scratch
td_matrix = gensim.matutils.corpus2dense(corpus=bow_corpus, num_terms=len(dictionary))
print(td_matrix.shape)
td_matrix

(7692, 1719)


array([[4., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [58]:
# Build the vocabulary by term id so that vocabulary[i] names row i of
# td_matrix explicitly, rather than relying on the iteration order of
# dictionary.values() happening to match the id order.
vocabulary = np.array([dictionary[term_id] for term_id in range(len(dictionary))])
print('Total vocabulary size:', len(vocabulary))
vocabulary

Total vocabulary size: 7692


array(['2n', '_c', 'a2', ..., 'smola', 'support_vector', 'mozer_jordan'],
      dtype='<U28')

In [59]:
from scipy.sparse.linalg import svds
u, s, vt = svds(td_matrix, k=TOTAL_TOPICS, maxiter=10000)
term_topic = u
singular_values = s
topic_document = vt
term_topic.shape, singular_values.shape, topic_document.shape

((7692, 10), (10,), (10, 1719))

In [60]:
tt_weights = term_topic.transpose() * singular_values[:,None]
tt_weights.shape

(10, 7692)

In [61]:
top_terms = 20
# per topic (row): column indices of the top_terms largest-magnitude weights
topic_key_term_idxs = np.argsort(-np.absolute(tt_weights), axis=1)[:, :top_terms]
# gather the signed weights that belong to those indices, one row per topic
topic_keyterm_weights = np.array([tt_weights[row, columns]
                                  for row, columns in zip(range(TOTAL_TOPICS), topic_key_term_idxs)])
topic_keyterms = vocabulary[topic_key_term_idxs]
topic_keyterms_weights = list(zip(topic_keyterms, topic_keyterm_weights))
for n in range(TOTAL_TOPICS):
    print('Topic #{}:'.format(n + 1))
    print('=' * 50)
    d1, d2 = [], []
    terms, weights = topic_keyterms_weights[n]
    # strongest terms first, regardless of sign
    term_weights = sorted(zip(terms, weights), key=lambda row: -abs(row[1]))
    for term, wt in term_weights:
        # the sign of the raw (unrounded) weight decides the direction
        bucket = d1 if wt >= 0 else d2
        bucket.append((term, round(wt, 3)))
    for label, entries in (('Direction 1:', d1), ('Direction 2:', d2)):
        print(label, entries)
        print('-' * 50)
    print()

Topic #1:
Direction 1: [('word', 192.091), ('vector', 87.984), ('recognition', 54.779), ('sequence', 51.589), ('node', 44.675), ('cell', 44.443), ('circuit', 37.746), ('hmm', 35.687), ('character', 33.4), ('matrix', 32.903), ('structure', 31.385), ('phoneme', 29.164)]
--------------------------------------------------
Direction 2: [('training', -92.608), ('task', -82.594), ('pattern', -65.608), ('classifier', -58.539), ('control', -53.672), ('rule', -48.237), ('action', -45.226), ('neuron', -33.821)]
--------------------------------------------------

Topic #2:
Direction 1: [('node', 173.925), ('circuit', 99.14), ('chip', 78.507), ('current', 60.015), ('voltage', 58.008), ('classifier', 51.738), ('control', 45.65), ('analog', 41.497), ('rule', 39.847), ('layer', 38.983), ('tree', 31.974)]
--------------------------------------------------
Direction 2: [('neuron', -72.423), ('word', -64.314), ('stimulus', -64.259), ('distribution', -50.538), ('feature', -46.737), ('state', -33.514), ('r

In [62]:
# Document-topic table for the SVD-based model: one row per paper, one column
# per topic (T1..Tn), weights rounded to three decimals.
topic_labels = ['T{}'.format(i) for i in range(1, TOTAL_TOPICS + 1)]
document_topics = pd.DataFrame(np.round(topic_document.T, 3), columns=topic_labels)
document_numbers = [13, 250, 500]

for document_number in document_numbers:
    paper_weights = document_topics.iloc[document_number].values
    # order topics by descending absolute weight and keep the first three
    ranking = np.argsort(-np.absolute(paper_weights))
    top_topics = list(document_topics.columns[ranking[:3]])
    print('Document #{}:'.format(document_number))
    print('Dominant Topics (top 3):', top_topics)
    print('Paper Summary:')
    print(papers[document_number][:500])
    print()

Document #13:
Dominant Topics (top 3): ['T8', 'T3', 'T2']
Paper Summary:
137 
On the 
Power of Neural Networks for 
Solving Hard Problems 
Jehoshua Bruck 
Joseph W. Goodman 
Information Systems Laboratory 
Department of Electrical Engineering 
Stanford University 
Stanford, CA 94305 
Abstract 
This paper deals with a neural network model in which each neuron 
performs a threshold logic function. An important property of the model 
is that it always converges to a stable state when operating in a serial 
mode [2,5]. This property is the basis of the potential applicat

Document #250:
Dominant Topics (top 3): ['T2', 'T3', 'T10']
Paper Summary:
542 Kassebaum, Tenorio and Schaefers 
The Cocktail Party Problem: 
Speech/Data Signal Separation Comparison 
between Backpropagation and SONN 
John Kassebaum 
jakec.ecn.purdue.edu 
Manoel Fernando Tenorio 
tenorioee.ecn.purdue.edu 
Chrlstoph Schaefers 
Parallel Distributed Structures Laboratory 
School of Electrical Engineering 
Purdue University

In [63]:
%%time
## Latent Dirichlet Allocation
# Train an LDA model with TOTAL_TOPICS topics on the bag-of-words corpus,
# using `dictionary` to map term ids back to words.
# random_state is fixed at 42, presumably to make reruns reproducible;
# eval_every=None presumably turns off periodic evaluation during training —
# confirm both against the LdaModel documentation.
lda_model = gensim.models.LdaModel(corpus=bow_corpus, id2word=dictionary, 
                                   chunksize=1740, alpha='auto', eta='auto', random_state=42, 
                                   iterations=500, num_topics=TOTAL_TOPICS, 
                                   passes=20, eval_every=None)

CPU times: user 1min 13s, sys: 4.44 s, total: 1min 18s
Wall time: 1min 12s


In [64]:
# view topics in trained topic model
for topic_id, topic in lda_model.print_topics(num_topics=10, num_words=20):
    print('Topic #'+str(topic_id+1)+':')
    print(topic)
    print()

Topic #1:
0.017*"circuit" + 0.010*"chip" + 0.010*"analog" + 0.009*"voltage" + 0.009*"current" + 0.006*"threshold" + 0.006*"bit" + 0.006*"code" + 0.005*"vector" + 0.005*"neuron" + 0.005*"element" + 0.005*"computation" + 0.005*"node" + 0.004*"gate" + 0.004*"signal" + 0.004*"memory" + 0.004*"transistor" + 0.004*"size" + 0.004*"device" + 0.004*"synapse"

Topic #2:
0.016*"control" + 0.008*"state" + 0.008*"task" + 0.007*"controller" + 0.006*"position" + 0.005*"training" + 0.005*"robot" + 0.005*"prediction" + 0.004*"dynamic" + 0.004*"motor" + 0.004*"trajectory" + 0.004*"movement" + 0.004*"move" + 0.004*"environment" + 0.003*"goal" + 0.003*"step" + 0.003*"adaptive" + 0.003*"search" + 0.003*"arm" + 0.003*"region"

Topic #3:
0.008*"distribution" + 0.007*"class" + 0.006*"probability" + 0.006*"training" + 0.005*"variable" + 0.005*"estimate" + 0.005*"sample" + 0.005*"approximation" + 0.004*"gaussian" + 0.004*"prior" + 0.004*"linear" + 0.004*"vector" + 0.003*"bound" + 0.003*"prediction" + 0.003*"den

In [65]:
# view overall mean coherence score of model
topics_coherences = lda_model.top_topics(bow_corpus, topn=20)
avg_coherence_score = np.mean([item[1] for item in topics_coherences])
print('Avg. Coherence Score:', avg_coherence_score)

Avg. Coherence Score: -1.091462223513131


In [66]:
# output of topic models as tuples of terms and weights
# keep only the term/weight part of every top_topics() entry
topics_with_wts = [item[0] for item in topics_coherences]
print('LDA Topics with Weights')
print('=' * 50)
for idx, topic in enumerate(topics_with_wts):
    # entries arrive as (weight, term); show them as (term, rounded weight)
    rounded_topic = [(term, round(wt, 3)) for wt, term in topic]
    print('Topic #{}:'.format(idx + 1))
    print(rounded_topic)
    print()

LDA Topics with Weights
Topic #1:
[('classifier', 0.014), ('layer', 0.01), ('neuron', 0.009), ('memory', 0.008), ('processor', 0.008), ('training', 0.008), ('bit', 0.007), ('pattern', 0.007), ('chip', 0.007), ('node', 0.007), ('connection', 0.007), ('classification', 0.005), ('architecture', 0.005), ('application', 0.005), ('parallel', 0.005), ('unit', 0.005), ('machine', 0.004), ('vector', 0.004), ('computer', 0.004), ('feature', 0.004)]

Topic #2:
[('neuron', 0.018), ('cell', 0.016), ('response', 0.01), ('stimulus', 0.009), ('activity', 0.008), ('pattern', 0.007), ('spike', 0.005), ('unit', 0.005), ('signal', 0.005), ('synaptic', 0.005), ('neural', 0.004), ('cortical', 0.004), ('effect', 0.004), ('connection', 0.004), ('visual', 0.004), ('frequency', 0.004), ('firing', 0.004), ('layer', 0.004), ('et_al', 0.004), ('cortex', 0.003)]

Topic #3:
[('unit', 0.02), ('training', 0.013), ('pattern', 0.009), ('hidden_unit', 0.007), ('rule', 0.007), ('node', 0.007), ('net', 0.007), ('task', 0.0

In [67]:
# view topics as plain lists of terms (no weights) to understand the context or theme of each topic
print('LDA Topics without Weights')
print('=' * 50)
for idx, topic in enumerate(topics_with_wts):
    topic_terms = [term for wt, term in topic]
    print('Topic #{}:'.format(idx + 1))
    print(topic_terms)
    print()

LDA Topics without Weights
Topic #1:
['classifier', 'layer', 'neuron', 'memory', 'processor', 'training', 'bit', 'pattern', 'chip', 'node', 'connection', 'classification', 'architecture', 'application', 'parallel', 'unit', 'machine', 'vector', 'computer', 'feature']

Topic #2:
['neuron', 'cell', 'response', 'stimulus', 'activity', 'pattern', 'spike', 'unit', 'signal', 'synaptic', 'neural', 'cortical', 'effect', 'connection', 'visual', 'frequency', 'firing', 'layer', 'et_al', 'cortex']

Topic #3:
['unit', 'training', 'pattern', 'hidden_unit', 'rule', 'node', 'net', 'task', 'layer', 'representation', 'trained', 'architecture', 'word', 'activation', 'sequence', 'training_set', 'structure', 'level', 'character', 'connectionist']

Topic #4:
['image', 'object', 'feature', 'visual', 'filter', 'representation', 'pixel', 'signal', 'face', 'motion', 'view', 'location', 'region', 'local', 'source', 'position', 'field', 'scale', 'vector', 'shape']

Topic #5:
['distribution', 'class', 'probability'

In [68]:
# use perplexity and coherence scores as measures to evaluate topic model
# two CoherenceModel instances are built over the same model/corpus/texts,
# differing only in the coherence measure ('c_v' vs 'u_mass')
cv_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus, 
                                                      texts=norm_corpus_bigrams, 
                                                      dictionary=dictionary, coherence='c_v')
avg_coherence_cv = cv_coherence_model_lda.get_coherence()
umass_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus, 
                                                         texts=norm_corpus_bigrams, 
                                                         dictionary=dictionary, coherence='u_mass')
avg_coherence_umass = umass_coherence_model_lda.get_coherence()

# NOTE(review): gensim documents log_perplexity() as a per-word likelihood
# bound (hence the negative value), not the perplexity itself - confirm the
# label printed below is what you intend
perplexity = lda_model.log_perplexity(bow_corpus)

print('Avg. Coherence Score (Cv):', avg_coherence_cv)
print('Avg. Coherence Score (UMass):', avg_coherence_umass)
print('Model Perplexity:', perplexity)

Avg. Coherence Score (Cv): 0.4886609386278148
Avg. Coherence Score (UMass): -1.0914622235131308
Model Perplexity: -7.787589872271927


In [69]:
## LDA Models with MALLET
# download MALLET framework
# !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip

In [70]:
# extract contents from archive
# !unzip -q mallet-2.0.8.zip

In [94]:
# train an LDA model with MALLET through the gensim wrapper
MALLET_PATH = path_to_users + '/mallet-2.0.8/bin/mallet'
lda_mallet = gensim.models.wrappers.LdaMallet(
    mallet_path=MALLET_PATH,
    corpus=bow_corpus,
    num_topics=TOTAL_TOPICS,
    id2word=dictionary,
    iterations=500,
    workers=16,
)

# top 20 (term, weight) pairs for every topic, weights rounded to 3 decimals
topics = [
    [(term, round(wt, 3)) for term, wt in lda_mallet.show_topic(topic_id, topn=20)]
    for topic_id in range(TOTAL_TOPICS)
]

# print only the terms of each topic
for idx, topic in enumerate(topics, start=1):
    print(f'Topic #{idx}:')
    print([term for term, _ in topic])
    print()

Topic #1:
['unit', 'layer', 'node', 'rule', 'hidden_unit', 'pattern', 'net', 'architecture', 'training', 'activation', 'representation', 'task', 'structure', 'recurrent', 'sequence', 'trained', 'module', 'connection', 'back_propagation', 'connectionist']

Topic #2:
['vector', 'class', 'bound', 'linear', 'theorem', 'matrix', 'size', 'defined', 'condition', 'approximation', 'theory', 'xi', 'complexity', 'threshold', 'constant', 'proof', 'assume', 'property', 'loss', 'polynomial']

Topic #3:
['state', 'control', 'action', 'step', 'trajectory', 'task', 'controller', 'environment', 'policy', 'optimal', 'path', 'transition', 'goal', 'dynamic', 'reinforcement_learning', 'search', 'trial', 'change', 'robot', 'learned']

Topic #4:
['response', 'cell', 'stimulus', 'visual', 'motion', 'signal', 'filter', 'direction', 'receptive_field', 'map', 'spatial', 'target', 'activity', 'eye', 'unit', 'field', 'orientation', 'subject', 'velocity', 'location']

Topic #5:
['equation', 'noise', 'solution', 'vec

In [95]:
# evaluate model using perplexity and coherence metrics
# same two coherence measures as for the gensim LDA model, now on lda_mallet;
# note that avg_coherence_cv / avg_coherence_umass / perplexity are overwritten here
cv_coherence_model_lda_mallet = gensim.models.CoherenceModel(model=lda_mallet, corpus=bow_corpus, 
                                                             texts=norm_corpus_bigrams, 
                                                             dictionary=dictionary, 
                                                             coherence='c_v')
avg_coherence_cv = cv_coherence_model_lda_mallet.get_coherence()

umass_coherence_model_lda_mallet = gensim.models.CoherenceModel(model=lda_mallet, 
                                                                corpus=bow_corpus, 
                                                                texts=norm_corpus_bigrams, 
                                                                dictionary=dictionary, 
                                                                coherence='u_mass')

avg_coherence_umass = umass_coherence_model_lda_mallet.get_coherence()

# from STDOUT: <500> LL/token: -8.53533
# the value is copied by hand from MALLET's console output, so it must be
# updated manually whenever the model above is retrained
perplexity = -8.53533
print('Avg. Coherence Score (Cv):', avg_coherence_cv)
print('Avg. Coherence Score (UMass):', avg_coherence_umass)
print('Model Perplexity:', perplexity)

Avg. Coherence Score (Cv): 0.5240900426563829
Avg. Coherence Score (UMass): -1.0462128724374544
Model Perplexity: -8.53533


In [96]:
## LDA Tuning: Finding the Optimal Number of Topics
from tqdm import tqdm

# iterate and build several models with differing number of topics
# select one that has highest coherence score
def topic_model_coherence_generator(corpus, texts, dictionary, start_topic_count=2, 
                                    end_topic_count=10, step=1, cpus=1):
    """Train one MALLET LDA model per topic count and score each with c_v coherence.

    Topic counts run from `start_topic_count` to `end_topic_count` inclusive,
    in increments of `step`. The MALLET binary is located through the
    notebook-level `MALLET_PATH`.

    Parameters:
        corpus: bag-of-words corpus handed to both LdaMallet and CoherenceModel.
        texts: tokenized documents handed to CoherenceModel.
        dictionary: id-to-word mapping used by both models.
        start_topic_count: first number of topics to try.
        end_topic_count: last number of topics to try (inclusive).
        step: increment between successive topic counts.
        cpus: value passed to LdaMallet as `workers`.

    Returns:
        (models, coherence_scores): two parallel lists, one entry per topic count.
    """
    models, coherence_scores = [], []
    candidate_counts = range(start_topic_count, end_topic_count + 1, step)
    for n_topics in tqdm(candidate_counts):
        candidate = gensim.models.wrappers.LdaMallet(
            mallet_path=MALLET_PATH,
            corpus=corpus,
            num_topics=n_topics,
            id2word=dictionary,
            iterations=500,
            workers=cpus,
        )
        scorer = gensim.models.CoherenceModel(
            model=candidate,
            corpus=corpus,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        # score first so a failing coherence computation leaves both lists in step
        score = scorer.get_coherence()
        coherence_scores.append(score)
        models.append(candidate)

    return models, coherence_scores

#lda_models, coherence_scores = topic_model_coherence_generator(corpus=bow_corpus, 
#                                                               texts=norm_corpus_bigrams, 
#                                                               dictionary=dictionary, 
#                                                               start_topic_count=2, 
#                                                               end_topic_count=30, step=1, 
#                                                               cpus=16)

In [97]:
#import pickle
#import numpy

# save model for later use
#filename = 'lda_models.sav'
#pickle.dump(lda_models, open(filename, 'wb'))

# save coherence scores
#np.savetxt("coherence_scores.csv", coherence_scores, delimiter=",")#

In [106]:
# load model and scores
import pickle
import numpy
import numpy as np  # the code below uses the `np` alias, which `import numpy` alone does not bind

# NOTE(review): the (commented-out) save cell writes 'lda_models.sav' and
# 'coherence_scores.csv' to the working directory, while this cell reads from
# 'models/' and 'data/' - the files must be moved there (or the paths aligned),
# otherwise this raises FileNotFoundError
filename = 'models/lda_models.sav'
# SECURITY: unpickling executes code embedded in the file; only load pickles
# that were produced by this notebook
# the context manager closes the file handle even if unpickling fails
with open(filename, 'rb') as model_file:
    lda_models = pickle.load(model_file)

coherence_scores = np.genfromtxt('data/coherence_scores.csv', delimiter=',')

FileNotFoundError: [Errno 2] No such file or directory: 'models/lda_models.sav'

In [None]:
# tabulate coherence per topic count and show the ten best-scoring counts
# range(2, 31) hard-codes the topic counts 2..30, i.e. it expects exactly 29
# scores (matching start_topic_count=2, end_topic_count=30 in the tuning call)
coherence_df = pd.DataFrame({'Number of Topics': range(2, 31, 1), 
                             'Coherence Score': np.round(coherence_scores, 4)})
coherence_df.sort_values(by=['Coherence Score'], ascending=False).head(10)

In [None]:
# plot graph showing number of topics per model and corresponding coherence scores
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline

# x axis hard-codes topic counts 2..30 and must stay in step with coherence_scores
x_ax = range(2,31,1)
y_ax = coherence_scores
plt.figure(figsize=(12,6))
plt.plot(x_ax, y_ax,c='r')
# dashed horizontal reference line at a coherence of 0.535
plt.axhline(y=0.535, c='k', linestyle='--', linewidth=2)
# NOTE(review): set after plt.figure(), so presumably it only affects figures
# created later - confirm whether it should move above plt.figure()
plt.rcParams['figure.facecolor'] = 'white'
x1 = plt.xlabel('Number of Topics')
y1 = plt.ylabel('Coherence Scores')

In [None]:
# based on graph, choose optimal number of topics as 20
# retrieve best model
# the row label of the 20-topic entry in coherence_df is reused as a list
# position; assumes lda_models is ordered exactly like coherence_df's rows
best_model_idx = coherence_df[coherence_df['Number of Topics'] == 20].index[0]
best_lda_model = lda_models[best_model_idx]
best_lda_model.num_topics

In [None]:
# collect the top 20 (term, weight) pairs of every topic in the selected model
topics = [
    [(term, round(wt, 3)) for term, wt in best_lda_model.show_topic(topic_id, topn=20)]
    for topic_id in range(best_lda_model.num_topics)
]

# print the terms of each topic, numbered from 1
for idx, topic in enumerate(topics, start=1):
    print(f'Topic #{idx}:')
    print([term for term, _ in topic])
    print()

In [None]:
# build term topic dataframe
# rows are built one per topic and then transposed (.T), so the displayed
# frame has one column per topic and one row per term rank;
# range(1,21) hard-codes 20 terms per topic, matching topn=20 used to build `topics`
topics_df = pd.DataFrame([[term for term, wt in topic] 
                              for topic in topics], 
                         columns=['Term'+str(i) for i in range(1,21)], 
                         index=['Topic '+str(t) for t in range(1, best_lda_model.num_topics+1)]).T
topics_df

In [None]:
# create topic term dataframe: each topic represented in a row with terms of topic
# represented as comma-separated string
# show full cell contents: None is the "no truncation" value in current pandas,
# which rejects the legacy -1; old releases only accept an int, so fall back to -1
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
topics_df = pd.DataFrame([', '.join([term for term, wt in topic]) 
                              for topic in topics], 
                         columns=['Terms per Topic'], 
                         index=['Topic'+str(t) for t in range (1, best_lda_model.num_topics+1)])
topics_df

In [None]:
## Interpreting Topic Model Results
tm_results = best_lda_model[bow_corpus]

# get most dominant topic per research paper: the (topic id, weight) pair
# with the largest weight (first one wins on ties)
corpus_topics = [
    sorted(doc_topics, key=lambda record: record[1], reverse=True)[0]
    for doc_topics in tm_results
]
corpus_topics[:5]

# construct master dataframe that holds base statistics
corpus_topic_df = pd.DataFrame()
corpus_topic_df['Document'] = range(len(papers))
# topic ids are 0-based in the model; report them 1-based
corpus_topic_df['Dominant Topic'] = [dominant[0] + 1 for dominant in corpus_topics]
corpus_topic_df['Contribution %'] = [round(dominant[1] * 100, 2) for dominant in corpus_topics]
corpus_topic_df['Topic Desc'] = [
    topics_df.iloc[dominant[0]]['Terms per Topic'] for dominant in corpus_topics
]
corpus_topic_df['Paper'] = papers

In [None]:
## Dominant Topics Distribution Across Corpus
pd.set_option('display.max_colwidth', 200)

# number of papers per dominant topic; groupby().size() replaces the
# nested-dict renaming form of agg(), which pandas no longer supports
topic_stats_df = (corpus_topic_df.groupby('Dominant Topic')
                                 .size()
                                 .reset_index(name='Doc Count'))

# share of the whole corpus, in percent, rounded to 2 decimals
topic_stats_df['% Total Docs'] = topic_stats_df['Doc Count'].apply(
    lambda count: round((count * 100) / len(papers), 2))

# 'Dominant Topic' is 1-based, topics_df rows are 0-based; look up by topic id
# (not by row position) so descriptions stay aligned even when some topic is
# never dominant and therefore has no row here
topic_stats_df['Topic Desc'] = [topics_df.iloc[t - 1]['Terms per Topic']
                                for t in topic_stats_df['Dominant Topic']]
topic_stats_df

In [None]:
## Dominant Topics in Specific Research Papers
pd.set_option('display.max_colwidth', 200)
# show the rows of a hand-picked set of documents, selected by their
# 'Document' number
(corpus_topic_df[corpus_topic_df['Document'].
    isin([681, 9, 392, 1622, 17, 906, 996, 503, 13, 733])])

In [None]:
## Relevant Research Papers per Topic Based on Dominance
# within each dominant-topic group, sort by 'Contribution %' descending and
# keep the first row, i.e. the paper in which that topic contributes most
corpus_topic_df.groupby('Dominant Topic').apply(lambda topic_set:
    (topic_set.sort_values(by=['Contribution %'], ascending=False).iloc[0]))

In [None]:
## Predicting Topics for New Research Papers
import glob
# papers manually downloaded from NIPS 16
# https://papers.nips.cc/book/advances-in-neural-information-processing-systems-29-2016

# glob returns files in arbitrary order; sort so that paper numbers used in
# later cells are stable from run to run
new_paper_files = sorted(glob.glob('nips16*.txt'))
new_papers = []
for fn in new_paper_files:
    # the files are only read, so open read-only ('r+' would also demand
    # write permission); undecodable bytes are dropped
    with open(fn, encoding='utf-8', errors='ignore', mode='r') as f:
        new_papers.append(f.read())
print('Total New Papers', len(new_papers))

In [None]:
# build text wrangling and feature engineering pipeline
def text_preprocessing_pipeline(documents, normalizer_fn, bigram_model):
    """Normalize `documents` and run the result through `bigram_model`.

    Parameters:
        documents: raw documents, passed unchanged to `normalizer_fn`.
        normalizer_fn: callable applied to the whole document collection.
        bigram_model: object indexed with the normalized documents
            (`bigram_model[normalized]`).

    Returns:
        Whatever `bigram_model[...]` yields for the normalized documents.
    """
    return bigram_model[normalizer_fn(documents)]

def bow_features_pipeline(tokenized_docs, dictionary):
    """Convert each tokenized document to bag-of-words via `dictionary.doc2bow`.

    Parameters:
        tokenized_docs: iterable of token sequences, one per document.
        dictionary: object exposing `doc2bow(tokens)`.

    Returns:
        A list with one `doc2bow` result per document, in input order.
    """
    paper_bow_features = []
    for tokens in tokenized_docs:
        paper_bow_features.append(dictionary.doc2bow(tokens))
    return paper_bow_features

# normalize the new papers and apply the bigram model used elsewhere in this notebook
norm_new_papers = text_preprocessing_pipeline(documents=new_papers, normalizer_fn=normalize_corpus, bigram_model=bigram_model)

# bag-of-words features built with the existing `dictionary`
norm_bow_features = bow_features_pipeline(tokenized_docs=norm_new_papers, dictionary=dictionary)

# peek at the first 30 tokens of the first new paper
print(norm_new_papers[0][:30])

In [None]:
# dump the bag-of-words features of all new papers
print(norm_bow_features)

In [None]:
# build generic function to extract top N topics from any research paper using trained model
def get_topic_predictions(topic_model, corpus, topn=3):
    """Return the `topn` highest-weighted topics for every document in `corpus`.

    Parameters:
        topic_model: object indexed with `corpus`; `topic_model[corpus]` must
            yield, per document, an iterable of (topic, weight) pairs.
        corpus: the documents to score, in whatever form `topic_model` accepts.
        topn: number of topics to keep per document.

    Returns:
        A list with one entry per document; each entry is a list of
        (topic, weight) tuples sorted by descending weight, with weights
        rounded to 3 decimals.
    """
    topic_predictions = topic_model[corpus]
    # iterate the predictions directly instead of indexing by position, so any
    # iterable of per-document predictions works
    best_topics = [[(topic, round(wt, 3))
                    for topic, wt in sorted(doc_topics,
                                            key=lambda row: -row[1])[:topn]]
                   for doc_topics in topic_predictions]
    return best_topics

# putting the function in action
# keep the two strongest topics per new paper, predicted by the selected best model
topic_preds = get_topic_predictions(topic_model=best_lda_model, corpus=norm_bow_features, topn=2)
topic_preds

In [None]:
# review results for each paper
results_df = pd.DataFrame()
results_df['Papers'] = range(1, len(new_papers)+1)
# 1-based topic numbers of the predicted topics of each paper
results_df['Dominant Topics'] = [[topic_num+1 for topic_num, wt in item] 
                                    for item in topic_preds]
# expand the per-paper topic lists into one row per (paper, topic)
res = (results_df.set_index(['Papers'])['Dominant Topics']
                 .apply(pd.Series)
                 .stack()
                 .reset_index(level=1, drop=True))
results_df = pd.DataFrame({'Dominant Topics': res.values}, index=res.index)
# flatten the weights in the same paper-then-topic order as the rows above
results_df['Contribution %'] = [round(wt*100, 2) for item in topic_preds for topic_num, wt in item]
# int() guards against a float dtype produced by stack(); topics_df rows are 0-based
results_df['Topic Desc'] = [topics_df.iloc[int(t)-1]['Terms per Topic'] for t in results_df['Dominant Topics'].values]
# the index holds 1-based paper numbers; show the first 200 characters of each paper
results_df['Paper Desc'] = [new_papers[i-1][:200] for i in results_df.index.values]
pd.set_option('display.max_colwidth', 300)
results_df

## Topic Models with Scikit-Learn

In [None]:
## Text Representation with Feature Engineering
from sklearn.feature_extraction.text import CountVectorizer

# tokenizer and preprocessor are identity functions, so each document is used
# as-is (presumably norm_papers already holds token lists - confirm);
# token_pattern=None because a custom tokenizer is supplied
cv = CountVectorizer(min_df=20, max_df=0.6, ngram_range=(1,2), token_pattern=None, tokenizer=lambda doc: doc, preprocessor=lambda doc: doc)
cv_features = cv.fit_transform(norm_papers)
cv_features.shape

# validating vocabulary size
# get_feature_names() no longer exists in newer scikit-learn; use its
# replacement when available and fall back for older installations
if hasattr(cv, 'get_feature_names_out'):
    vocabulary = np.array(cv.get_feature_names_out())
else:
    vocabulary = np.array(cv.get_feature_names())
print('Total Vocabulary Size', len(vocabulary))

In [None]:
%%time
## Latent Semantic Indexing
from sklearn.decomposition import TruncatedSVD

# TOTAL_TOPICS is (re)defined here and reused by the following scikit-learn cells
TOTAL_TOPICS=20
lsi_model = TruncatedSVD(n_components=TOTAL_TOPICS, n_iter=500, random_state=42)
# fit on the count features and keep the transformed (document x topic) output
document_topics = lsi_model.fit_transform(cv_features)

In [None]:
# topic-term weights of the fitted LSI model; inspect the matrix dimensions
topic_terms = lsi_model.components_
topic_terms.shape

In [None]:
# reuse previously implemented code to display topics and terms
top_terms = 20
# per topic: column indices of the top_terms largest-magnitude weights
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterm_weights = np.array([topic_terms[row, columns] for row, columns in list(zip(np.arange(TOTAL_TOPICS), topic_key_term_idxs))])
topic_keyterms = vocabulary[topic_key_term_idxs]
topic_keyterms_weights = list(zip(topic_keyterms, topic_keyterm_weights))
for n in range(TOTAL_TOPICS):
    print('Topic #'+str(n+1)+':')
    print('='*50)
    d1 = []
    d2 = []
    terms, weights = topic_keyterms_weights[n]
    term_weights = sorted([(t,w) for t, w in zip(terms, weights)],
        key = lambda row: -abs(row[1]))
    # split the terms by the sign of their weight: non-negative weights go to
    # direction 1, negative weights to direction 2
    for term, wt in term_weights:
        if wt >= 0:
            d1.append((term, round(wt,3)))
        else:
            d2.append((term, round(wt,3)))

    # report both directions once per topic, after all terms are assigned
    print('Direction 1:', d1)
    print('-'*50)
    print('Direction 2:', d2)
    print('-'*50)
    print()


In [None]:
# extract key topics for specific research papers
topic_columns = [f'T{i}' for i in range(1, TOTAL_TOPICS + 1)]
dt_df = pd.DataFrame(np.round(document_topics, 3), columns=topic_columns)

document_numbers = [13, 250, 500]

for document_number in document_numbers:
    # rank topics by the magnitude of their score for this paper and keep three
    paper_scores = dt_df.iloc[document_number].values
    strongest = np.argsort(-np.absolute(paper_scores))[:3]
    top_topics = list(dt_df.columns[strongest])
    print(f'Document #{document_number}:')
    print('Dominant Topics (top 3):', top_topics)
    print('Paper Summary:')
    print(papers[document_number])
    print()

In [None]:
%%time
## Latent Dirichlet Allocation
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components=TOTAL_TOPICS, max_iter=500, max_doc_update_iter=50, learning_method='online', match_size=1740, learning_offset=50., random_state=42, n_jobs=16)
document_topics = lda_model.fit_transform(cv_features)

In [None]:
# obtain topic-term matrix
# build dataframe from it to showcase topics and terms
topic_terms = lda_model.components_
# per topic: indices of the top_terms largest-magnitude weights, mapped to terms
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterms = vocabulary[topic_key_term_idxs]
topics = [', '.join(topic) for topic in topic_keyterms]
# show full cell contents: None is the "no truncation" value in current pandas,
# which rejects the legacy -1; old releases only accept an int, so fall back to -1
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
# the DataFrame keyword is `columns` (plural); `column` raises TypeError
topics_df = pd.DataFrame(topics, columns=['Terms per Topic'], index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df

In [None]:
# view research papers having max contribution of each of the 20 topics
dt_df = pd.DataFrame(document_topics, columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
# thousands separator + 5 decimals; the spec must not contain a space
# ('{:, .5f}' is an invalid format specifier and fails when a float is rendered)
pd.options.display.float_format = '{:,.5f}'.format
pd.set_option('display.max_colwidth', 200)

# highest score reached by each topic column across all papers
max_contrib_topics = dt_df.max(axis=0)
dominant_topics = max_contrib_topics.index
contrib_perc = max_contrib_topics.values
# first paper (row label) that attains each topic's maximum
document_numbers = [dt_df[dt_df[t] == max_contrib_topics.loc[t]].index[0]
                        for t in dominant_topics]
documents = [papers[i] for i in document_numbers]
results_df = pd.DataFrame({'Dominant Topic': dominant_topics, 'Contribution %': contrib_perc,
                            'Paper Num': document_numbers, 'Topic': topics_df['Terms per Topic'],
                            'Paper Name': documents})
results_df

In [None]:
%%time
## Non-Negative Matrix Factorization
from sklearn.decomposition import NMF

# NOTE(review): newer scikit-learn releases replaced the `alpha` argument of
# NMF with `alpha_W` / `alpha_H` - confirm against the installed version
nmf_model = NMF(n_components=TOTAL_TOPICS, solver='cd', max_iter=500, random_state=42, alpha=.1, l1_ratio=.85)
# fit on the count features and keep the per-document topic output
document_topics = nmf_model.fit_transform(cv_features)

In [None]:
# topic-term matrix of the NMF model, shown as one comma-separated term list per topic
topic_terms = nmf_model.components_
# per topic: indices of the top_terms largest-magnitude weights, mapped to terms
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:,:top_terms]
topic_keyterms = vocabulary[topic_key_term_idxs]
topics = [', '.join(topic) for topic in topic_keyterms]
# show full cell contents: None is the "no truncation" value in current pandas,
# which rejects the legacy -1; old releases only accept an int, so fall back to -1
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
topics_df = pd.DataFrame(topics, columns=['Terms per Topic'],
                            index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df

In [None]:
# determine dominance of topics in research papers by absolute scores
# thousands separator + 3 decimals; the spec must not contain a space
# ('{:, .3f}' is an invalid format specifier and fails when a float is rendered)
pd.options.display.float_format = '{:,.3f}'.format
dt_df = pd.DataFrame(document_topics, columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
dt_df.head(10)

In [None]:
# determine most relevant paper for each topic based on topic dominance scores
pd.options.display.float_format = '{:,.5f}'.format
pd.set_option('display.max_colwidth', 200)

# highest score reached by each topic column across all papers
max_score_topics = dt_df.max(axis=0)
dominant_topics = max_score_topics.index
term_score = max_score_topics.values
# row label of the first paper that attains each topic's maximum score
document_numbers = [
    dt_df.index[dt_df[t] == max_score_topics.loc[t]][0]
    for t in dominant_topics
]
documents = [papers[doc_num] for doc_num in document_numbers]
results_df = pd.DataFrame({
    'Dominant Topic': dominant_topics,
    'Max Score': term_score,
    'Paper Num': document_numbers,
    'Topic': topics_df['Terms per Topic'],
    'Paper Name': documents,
})
results_df