# Topic Models with Scikit-Learn
## subsection of _Text Summarization and Topic Models_

* Topic Models with Scikit-Learn
    1. Text Representation with Feature Engineering
    2. Latent Semantic Indexing
    3. Latent Dirichlet Allocation
    4. Non-Negative Matrix Factorization
    5. Predicting Topics for New Research Papers
    6. Visualizing Topic Models

## Text Representation with Feature Engineering

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# tokenizer and preprocessor are identity functions, so every document is used
# exactly as supplied (presumably norm_papers already holds token lists - confirm);
# token_pattern=None because the regex is unused when a custom tokenizer is given
cv = CountVectorizer(min_df=20, max_df=0.6, ngram_range=(1,2), token_pattern=None, 
                     tokenizer=lambda doc: doc, preprocessor=lambda doc: doc)
cv_features = cv.fit_transform(norm_papers)
# a bare expression in the middle of a cell is not displayed, so print the shape
print('Feature Matrix Shape', cv_features.shape)

# validating vocabulary size
# get_feature_names() was superseded by get_feature_names_out() in newer
# scikit-learn releases; fall back to the old name on versions lacking the new one
get_vocabulary = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
vocabulary = np.array(get_vocabulary())
print('Total Vocabulary Size', len(vocabulary))

## Latent Semantic Indexing

In [None]:
%%time

from sklearn.decomposition import TruncatedSVD

# number of topics; the same constant is reused by the LDA and NMF sections
TOTAL_TOPICS = 20

# LSI: truncated SVD of the document-term count matrix
lsi_model = TruncatedSVD(
    n_components=TOTAL_TOPICS,
    n_iter=500,
    random_state=42,
)
# fit the model and keep the transformed documents (document-topic scores)
document_topics = lsi_model.fit_transform(cv_features)

In [None]:
# components_ of the fitted LSI model, used below as the topic-term weight matrix
topic_terms = lsi_model.components_
# displayed as the cell output
topic_terms.shape

In [None]:
# show the strongest terms of every LSI topic, grouped by the sign of their weight
top_terms = 20

# per topic (row): column indices of the `top_terms` largest weights by magnitude
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
# signed weights that belong to those indices, one row per topic
topic_keyterm_weights = np.array([topic_terms[row, topic_key_term_idxs[row]]
                                  for row in range(TOTAL_TOPICS)])
topic_keyterms = vocabulary[topic_key_term_idxs]
topic_keyterms_weights = list(zip(topic_keyterms, topic_keyterm_weights))

for topic_no, (terms, weights) in enumerate(topic_keyterms_weights, start=1):
    # strongest terms first, regardless of sign
    ranked = sorted(zip(terms, weights), key=lambda pair: -abs(pair[1]))

    d1 = []  # terms with a non-negative weight
    d2 = []  # all remaining terms
    for term, wt in ranked:
        (d1 if wt >= 0 else d2).append((term, round(wt, 3)))

    print('Topic #' + str(topic_no) + ':')
    print('=' * 50)
    print('Direction 1:', d1)
    print('-' * 50)
    print('Direction 2:', d2)
    print('-' * 50)
    print()

In [None]:
# inspect the three strongest LSI topics for a handful of research papers
topic_columns = ['T' + str(i) for i in range(1, TOTAL_TOPICS + 1)]
dt_df = pd.DataFrame(np.round(document_topics, 3), columns=topic_columns)

document_numbers = [13, 250, 500]

for document_number in document_numbers:
    scores = dt_df.iloc[document_number].values
    # rank topics by absolute score and keep the first three
    strongest = np.argsort(-np.absolute(scores))[:3]
    top_topics = list(dt_df.columns[strongest])

    print('Document #' + str(document_number) + ':')
    print('Dominant Topics (top 3):', top_topics)
    print('Paper Summary:')
    # only the first 500 characters of the paper are shown
    print(papers[document_number][:500])
    print()

## Latent Dirichlet Allocation

In [None]:
%%time

from sklearn.decomposition import LatentDirichletAllocation

# Training is disabled here; the fitted model is loaded from disk in the next
# cell. Every line of the statement must stay commented out: leaving the
# continuation lines live makes the cell fail with an IndentationError.
#lda_model = LatentDirichletAllocation(n_components=TOTAL_TOPICS, max_iter=500, 
#                                      max_doc_update_iter=50, learning_method='online', 
#                                      batch_size=1740, learning_offset=50., 
#                                      random_state=42, n_jobs=16)
#document_topics = lda_model.fit_transform(cv_features)

In [None]:
import pickle

# location of the pickled scikit-learn LDA model
filename = path_to_users + '/models/sklearn_lda_models.sav'

# save model for later use
# with open(filename, 'wb') as model_file:
#     pickle.dump(lda_model, model_file)

# load the previously fitted model; the context manager closes the file handle
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load model files that come from a trusted location.
with open(filename, 'rb') as model_file:
    lda_model = pickle.load(model_file)

# Recompute the document-topic matrix with the loaded model. Without this step
# `document_topics` still holds the LSI scores computed earlier in the notebook,
# and the LDA results table further down would be built from the wrong model.
document_topics = lda_model.transform(cv_features)

In [None]:
# obtain topic-term matrix
# build dataframe from it to showcase topics and terms
topic_terms = lda_model.components_
# per topic (row): column indices of the `top_terms` largest weights by magnitude
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterms = vocabulary[topic_key_term_idxs]
topics = [', '.join(topic) for topic in topic_keyterms]
# None turns off column-width truncation; the legacy value -1 has been
# deprecated since pandas 1.0
pd.set_option('display.max_colwidth', None)
topics_df = pd.DataFrame(topics, columns=['Terms per Topic'], 
                         index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df

In [None]:
# for each topic, find the research paper in which that topic scores highest
topic_labels = ['T' + str(i) for i in range(1, TOTAL_TOPICS + 1)]
dt_df = pd.DataFrame(document_topics, columns=topic_labels)
pd.set_option('display.float_format', '{:,.5f}'.format)
pd.set_option('display.max_colwidth', 200)

# column-wise maximum: the best score reached by every topic
max_contrib_topics = dt_df.max(axis=0)
dominant_topics = max_contrib_topics.index
contrib_perc = max_contrib_topics.values

# first document (row label) that attains the maximum of each topic column
document_numbers = []
for t in dominant_topics:
    is_peak = dt_df[t] == max_contrib_topics.loc[t]
    document_numbers.append(dt_df.index[is_peak.values][0])
documents = [papers[i] for i in document_numbers]

# display using dataframe
results_df = pd.DataFrame({
    'Dominant Topic': dominant_topics,
    'Contribution %': contrib_perc,
    'Paper Num': document_numbers,
    'Topic': topics_df['Terms per Topic'],
    'Paper Name': documents,
})
results_df

## Non-Negative Matrix Factorization

In [None]:
%%time

from sklearn.decomposition import NMF

# NMF topic model fitted on the same count features as the other models
# NOTE(review): newer scikit-learn releases appear to replace `alpha` with
# `alpha_W` / `alpha_H` - confirm against the installed version before upgrading
nmf_model = NMF(
    n_components=TOTAL_TOPICS,
    solver='cd',
    max_iter=500,
    random_state=42,
    alpha=.1,
    l1_ratio=.85,
)
# fit the model and keep the transformed documents (document-topic scores)
document_topics = nmf_model.fit_transform(cv_features)

In [None]:
# view generated topics
topic_terms = nmf_model.components_
# per topic (row): column indices of the `top_terms` largest weights by magnitude
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:,:top_terms]
topic_keyterms = vocabulary[topic_key_term_idxs]
topics = [', '.join(topic) for topic in topic_keyterms]
# None turns off column-width truncation; the legacy value -1 has been
# deprecated since pandas 1.0
pd.set_option('display.max_colwidth', None)
topics_df = pd.DataFrame(topics, columns=['Terms per Topic'],
                            index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df

In [None]:
# document-topic scores of the NMF model, one column per topic
pd.set_option('display.float_format', '{:,.3f}'.format)
topic_labels = ['T' + str(i) for i in range(1, TOTAL_TOPICS + 1)]
dt_df = pd.DataFrame(document_topics, columns=topic_labels)
# preview the first ten papers
dt_df.head(10)

In [None]:
# for each topic, find the research paper with the highest topic score
pd.set_option('display.float_format', '{:,.5f}'.format)
pd.set_option('display.max_colwidth', 200)

# column-wise maximum: the best score reached by every topic
max_score_topics = dt_df.max(axis=0)
dominant_topics = max_score_topics.index
term_score = max_score_topics.values

# first document (row label) that attains the maximum of each topic column
document_numbers = []
for t in dominant_topics:
    is_peak = dt_df[t] == max_score_topics.loc[t]
    document_numbers.append(dt_df.index[is_peak.values][0])
documents = [papers[i] for i in document_numbers]

results_df = pd.DataFrame({
    'Dominant Topic': dominant_topics,
    'Max Score': term_score,
    'Paper Num': document_numbers,
    'Topic': topics_df['Terms per Topic'],
    'Paper Name': documents,
})
results_df

## Predicting Topics for New Research Papers

In [None]:
import glob
# papers manually downloaded from NIPS 16
# https://papers.nips.cc/book/advances-in-neural-information-processing-systems-29-2016

# sort the matches: glob returns files in arbitrary, OS-dependent order, and the
# paper numbers reported later in the notebook follow the position in this list
new_paper_files = sorted(glob.glob('test_data/nips16*.txt'))
new_papers = []
for fn in new_paper_files:
    # the files are only read, so plain 'r' is enough ('r+' would also demand
    # write permission); undecodable bytes are dropped via errors='ignore'
    with open(fn, encoding='utf-8', errors='ignore', mode='r') as f:
        data = f.read()
        new_papers.append(data)

print('Total New Papers:', len(new_papers))

In [None]:
# preprocess documents and extract features
# normalize_corpus is defined outside this section; presumably it applies the
# same normalization that produced norm_papers - confirm
norm_new_papers = normalize_corpus(new_papers)
# transform (not fit_transform): reuse the already fitted vectorizer `cv`
cv_new_features = cv.transform(norm_new_papers)
# displayed as the cell output
cv_new_features.shape

In [None]:
# score the new research papers against the fitted NMF topics
topic_predictions = nmf_model.transform(cv_new_features)

# per paper: the two highest-scoring topics as (zero-based topic index, score)
best_topics = []
for paper_scores in topic_predictions:
    ranked = sorted(enumerate(paper_scores), key=lambda pair: -pair[1])
    best_topics.append([(topic, round(sc, 3)) for topic, sc in ranked[:2]])
best_topics

In [None]:
# view results
# Build one row per (paper, dominant topic) pair directly from best_topics
# instead of growing an empty frame and reshaping it with
# apply(pd.Series).stack(), which is slow and breaks on an empty input.
paper_ids = []      # 1-based paper number, repeated once per dominant topic
topic_ids = []      # 1-based topic number
topic_scores = []   # topic score scaled by 100 and rounded to 2 decimals
for paper_id, item in enumerate(best_topics, start=1):
    for topic_num, sc in item:
        paper_ids.append(paper_id)
        topic_ids.append(topic_num + 1)
        topic_scores.append(round(sc * 100, 2))

results_df = pd.DataFrame({'Dominant Topics': topic_ids,
                           'Topic Score': topic_scores},
                          index=pd.Index(paper_ids, name='Papers'))
# topic numbers are 1-based, topics_df rows are 0-based
results_df['Topic Desc'] = [topics_df.iloc[t - 1]['Terms per Topic']
                            for t in topic_ids]
# first 200 characters of the raw paper text
results_df['Paper Desc'] = [new_papers[i - 1][:200] for i in paper_ids]
results_df

## Visualizing Topic Models

In [None]:
#import pyLDAvis
#import pyLDAvis.sklearn
#import dill
#import warnings

#warnings.filterwarnings('ignore')
#pyLDAvis.enable_notebook()

#pyLDAvis.sklearn.prepare(nmf_model, cv_features, cv, mds='mmds')