In [38]:
from __future__ import print_function
# key libs
import numpy as np
import re
import nltk
import pandas as pd
import glob
import codecs

# nlp libs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 

# processing
from sklearn.model_selection import train_test_split

# LDA
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF


# bring in my pickled vectorizers
import pickle
import dill
# import import_ipynb
# from process_tokenize import my_tokenizer

In [39]:
# what are we interestd in
# set this up to read the correct set of files we are interested in

n_gram = '1gm' # options - 1gm , 2gm
stem_type = 'lemma' # options - lemma,snow

In [40]:
# set random state
RSEED = 0

### 1. OPEN - TERM FREQUENCY FILES 
i.e. output of CountVectorizer

In [41]:
tf_train = pd.read_csv('../data/vectors/cv_'+ n_gram +'_'+ stem_type +'_train.csv', compression = 'gzip')
tf_train = tf_train.drop(columns='Unnamed: 0')

tf_test = pd.read_csv('../data/vectors/cv_'+ n_gram +'_'+ stem_type +'_test.csv', compression = 'gzip')
tf_test = tf_test.drop(columns='Unnamed: 0')

In [42]:
tf_train.head()

Unnamed: 0,book_title,author_name,book_location,aaron,aback,abandon,abandoned,abandoning,abandonment,abasement,...,zealand,zealous,zealously,zenith,zephyr,zero,zest,zigzag,zone,zoological
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,0,0,2,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,3,0,0


In [43]:
# load the appropriate tf_vectorizer
tf_vectorizer = dill.load(open('../data/vectors/tf_vectorizer_'+ stem_type + '_' + str(n_gram), 'rb'))

### 2. OPEN - TERM FREQUENCY INVERSE DOCUMENT FREQUENCY FILES
i.e output of TfidfVectorizer 

In [9]:
tfid_train = pd.read_csv('../data/vectors/tfid_'+ n_gram +'_'+ stem_type +'_train.csv',compression = 'gzip')
tfid_train = tfid_train.drop(columns='Unnamed: 0')

tfid_test = pd.read_csv('../data/vectors/tfid_'+ n_gram +'_'+ stem_type +'_test.csv', compression = 'gzip')
tfid_test = tfid_test.drop(columns='Unnamed: 0')

In [10]:
tfid_train.head()

Unnamed: 0,book_title,author_name,book_location,aaron,aback,abandon,abandoned,abandoning,abandonment,abasement,...,zealand,zealous,zealously,zenith,zephyr,zero,zest,zigzag,zone,zoological
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,0.0,0.0,0.004126,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005299
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,0.0,0.0,0.006395,0.002774,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00448,0.0,0.0,0.0
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,0.0,0.0,0.0,0.0,0.0,0.0,0.005872,...,0.0,0.0,0.0,0.0,0.0,0.00554,0.0,0.0,0.0,0.0
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,0.0,0.0,0.0,0.013355,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03942,0.0,0.0


In [11]:
# load the appropriate tf_vectorizer
#tfid_vectorizer = pickle.load(open('../data/vectors/tfid_vectorizer_'+ stem_type +'_'+ n_gram +'.pkl', 'rb'))
tfid_vectorizer = dill.load(open('../data/vectors/tfid_vectorizer_'+ stem_type + '_' + str(n_gram), 'rb'))

### 3. HELPER FUNCTIONS

In [12]:
#function to display topics
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print (f'Topic: {topic_idx}')
        print (" , ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [13]:
#function to display books for a topic
def find_topic_books(df,number):
    topics = list(df.columns)[3:]
    #print(topics)
    for t in topics:
        temp = df.sort_values(by=t,ascending=False)
        print('\n------'+t+'-------') 
        print(temp.iloc[0:number+1,0:2])    

### 4. DIMENSIONALITY REDUCTION - TOPIC MODELLING
1. LDA
2. NMF
3. LSA / LSI ? - These are rather old and don't give good results


use the Term Frequency matrix as Term Frequency Inverse Document Frequency does not work well

In [28]:
# set how many topics you want
topic_number = 15

### A. LDA with TF
LatentDirichletAllocation with the Term Frequency Allocation
Use the Term Frequency matrix as Term Frequency Inverse Document Frequency does not work well
LDA can only use raw term counts for LDA because it is a probabilistic graphical model

In [44]:
# Run LDA 
lda_tf = LatentDirichletAllocation(learning_method= 'batch',n_components=topic_number, random_state=RSEED, batch_size=128,n_jobs=-1)
lda_tf.fit(tf_train.iloc[:,3:])

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=15, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [45]:
# transform vectors
vec_lda_train = lda_tf.transform(tf_train.iloc[:,3:])
vec_lda_test = lda_tf.transform(tf_test.iloc[:,3:])

In [46]:
dill.dump(lda_tf, open('../data/vectors/lda_'+ stem_type + '_' + n_gram,'wb'))

In [47]:
print('------PERPLEXITY------')
print(lda_tf.perplexity(tf_train.iloc[:,3:]))
print(lda_tf.perplexity(tf_test.iloc[:,3:]))

------PERPLEXITY------
6417.809587077815
6914.7525078331455


In [54]:
# get the topics and words for the topics in lda
no_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names()
display_topics(lda_tf, tf_feature_names, no_top_words)

Topic: 0
mr , george , duke , harry , mary , james , john , captain , charles , colonel
Topic: 1
captain , boat , deck , vessel , island , sail , mate , shore , sailor , crew
Topic: 2
peter , mr , didn , jimmy , joe , tommy , ain , alice , bob , couldn
Topic: 3
government , principle , nation , political , religion , trade , moral , united , parliament , liberty
Topic: 4
thou , thy , thee , poet , poem , footnote , christ , sin , greek , ti
Topic: 5
dick , tom , camp , jim , sam , trail , snow , gun , didn , rover
Topic: 6
specie , plant , colour , variety , male , surface , female , science , distinct , period
Topic: 7
american , literature , reader , literary , author , artist , poet , modern , novel , century
Topic: 8
smiled , didn , remarked , mr , glanced , isn , presently , grey , nodded , monsieur
Topic: 9
indian , island , lake , forest , native , savage , village , snow , canoe , shore
Topic: 10
thou , prince , thee , queen , thy , knight , sword , castle , nay , richard
Topic

In [55]:
# add back details
lda_train = pd.DataFrame(vec_lda_train, columns=['topic_'+ str(i)for i in range(1,topic_number+1)])
lda_train.insert(loc=0, column='book_location', value=tf_train['book_location'])
lda_train.insert(loc=0, column='author_name', value=tf_train['author_name'])
lda_train.insert(loc=0, column='book_title', value=tf_train['book_title'])
lda_train.to_csv('../data/vectors/lda_'+ n_gram+'_'+ stem_type +'_train.csv')
lda_train.head()

Unnamed: 0,book_title,author_name,book_location,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,0.024503,1.4e-05,1.4e-05,0.019456,0.004147,1.4e-05,1.4e-05,0.188634,0.199965,1.4e-05,1.4e-05,1.4e-05,1.4e-05,0.289327,0.273858
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,0.122082,7e-06,0.303533,0.012188,7e-06,7e-06,0.013001,0.066539,0.122667,7e-06,7e-06,7e-06,0.123463,0.236478,7e-06
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,0.201496,0.087346,0.029985,0.027774,1e-05,0.022922,0.002199,0.137414,0.341142,0.018526,0.009753,0.05202,1e-05,0.068418,0.000984
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,2e-05,2e-05,0.007362,2e-05,2e-05,0.281786,2e-05,2e-05,0.264266,0.152951,2e-05,2e-05,0.086513,0.206946,2e-05
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,2e-05,0.069924,0.03069,0.036002,2e-05,2e-05,0.059825,0.01863,0.122308,2e-05,0.031033,2e-05,0.398182,0.003914,0.229392


In [56]:
# add the book details and write to file for test
lda_test = pd.DataFrame(vec_lda_test, columns=['topic_'+ str(i)for i in range(1,topic_number+1)])
lda_test.insert(loc=0, column='book_location', value=tf_test['book_location'])
lda_test.insert(loc=0, column='author_name', value=tf_test['author_name'])
lda_test.insert(loc=0, column='book_title', value=tf_test['book_title'])
lda_test.to_csv('../data/vectors/lda_'+ n_gram+'_'+ stem_type +'_test.csv')
lda_test.head()

Unnamed: 0,book_title,author_name,book_location,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15
0,Great Catherine,George Bernard Shaw,../data/gutenberg/George Bernard Shaw___Great ...,0.101817,2.4e-05,0.000863,2.4e-05,0.000999,2.4e-05,2.4e-05,0.215072,0.05515,2.4e-05,0.089095,0.102721,0.218641,0.130368,0.085156
1,Short Cruises,William Wymark Jacobs,../data/gutenberg/William Wymark Jacobs___Shor...,0.01649,0.108463,0.229974,8e-06,8e-06,0.157654,8e-06,8e-06,0.020388,8e-06,8e-06,8e-06,0.092946,0.374023,8e-06
2,Richard Dare's Venture,Edward Stratemeyer,../data/gutenberg/Edward Stratemeyer___Richard...,0.113744,0.067719,0.177472,0.00976,1e-05,0.093262,1e-05,1e-05,0.007516,1e-05,0.136102,0.016032,0.003753,0.37459,1e-05
3,William Harvey And The Discovery Of The Circul...,Thomas Henry Huxley,../data/gutenberg/Thomas Henry Huxley___Willia...,5e-05,5e-05,5e-05,0.238633,0.009119,5e-05,0.678779,0.072914,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05
4,Beyond,John Galsworthy,../data/gutenberg/John Galsworthy___Beyond.txt,0.01105,4e-06,0.132279,4e-06,0.018823,4e-06,4e-06,0.015262,0.620572,4e-06,4e-06,4e-06,0.113574,0.056881,0.03153


In [57]:
# find documents for topic
no_documents = 10
find_topic_books(lda_train,no_documents)


------topic_1-------
                              book_title       author_name
1196               The Journal to Stella    Jonathan Swift
2011                           Lady Anna  Anthony Trollope
1716  Sir Harry Hotspur of Humblethwaite  Anthony Trollope
1470            Mr. Scarborough's Family  Anthony Trollope
682                      Is He Popenjoy?  Anthony Trollope
503          The Golden Lion of Granpere  Anthony Trollope
595                         Cousin Henry  Anthony Trollope
464                           Marion Fay  Anthony Trollope
120                Trial of Duncan Terig  Sir Walter Scott
289                 The American Senator  Anthony Trollope
66                   The Duke's Children  Anthony Trollope

------topic_2-------
                            book_title            author_name
1472                The Life of a Ship         R M Ballantyne
2153                  Jarwin and Cuffy         R M Ballantyne
123                     The Lighthouse         R M Ballantyne


In [52]:
# setup for visualization
vec_tf=csr_matrix(tf_train.iloc[:,3:].values)
pyLDAvis.sklearn.prepare(lda_tf,vec_tf,tf_vectorizer)

### B. NMF WITH TFID
NMF workd better with TFID

In [29]:
# Run NMF
nmf_tfid = NMF(n_components=topic_number, random_state=RSEED, max_iter=500)
nmf_tfid.fit(tfid_train.iloc[:,3:])

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=500,
  n_components=15, random_state=0, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [30]:
# transform vectors
vec_nmf_train = nmf_tfid.transform(tfid_train.iloc[:,3:])
vec_nmf_test = nmf_tfid.transform(tfid_test.iloc[:,3:])

In [58]:
dill.dump(nmf_tfid, open('../data/vectors/nmf_'+ stem_type + '_' + n_gram,'wb'))

In [32]:
no_top_words = 10
tfid_feature_names = tfid_vectorizer.get_feature_names()
display_topics(nmf_tfid, tfid_feature_names, no_top_words)

Topic: 0
girl , answered , cried , didn , big , replied , lip , slowly , road , smile
Topic: 1
thou , thy , thee , hath , ti , thine , hast , soul , sweet , heaven
Topic: 2
government , public , nation , war , political , principle , opinion , class , society , constitution
Topic: 3
mr , miss , girl , didn , husband , aunt , couldn , isn , wouldn , won
Topic: 4
sir , lord , gentleman , john , miss , george , harry , master , madame , london
Topic: 5
s , wot , arter , ave , ginger , sam , bob , agin , ead , skipper
Topic: 6
captain , ship , boat , deck , island , vessel , shore , sail , board , crew
Topic: 7
dick , tom , sam , rover , baxter , dan , fred , cried , answered , didn
Topic: 8
art , poet , poem , literature , literary , english , author , poetry , character , artist
Topic: 9
god , christ , lord , sin , jesus , soul , heaven , christian , church , holy
Topic: 10
specie , animal , plant , darwin , science , hypothesis , bird , natural , condition , organic
Topic: 11
officer , 

In [33]:
# add back details
nmf_train = pd.DataFrame(vec_nmf_train, columns=['topic_'+ str(i)for i in range(1,topic_number+1)])
nmf_train.insert(loc=0, column='book_location', value=tfid_train['book_location'])
nmf_train.insert(loc=0, column='author_name', value=tfid_train['author_name'])
nmf_train.insert(loc=0, column='book_title', value=tfid_train['book_title'])
nmf_train.to_csv('../data/vectors/nmf_'+ n_gram+'_'+ stem_type +'_train.csv')
nmf_train.head()

Unnamed: 0,book_title,author_name,book_location,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,0.049734,0.0,0.013313,0.031046,0.094106,4.2e-05,0.0,0.003954,0.033866,0.0,0.0,0.0,0.024112,0.016215,0.037175
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,0.002097,0.0,0.008095,0.07234,0.030825,0.0,0.0,0.0068,0.008982,0.002681,0.0,0.0,0.247304,4.7e-05,0.0
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,0.058647,0.0,0.011558,0.012376,0.108296,0.0,0.013953,0.003363,0.012292,0.002835,0.0,0.004707,0.015935,0.0,0.0
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,0.046908,0.000477,0.0,0.021095,0.0,0.0,0.003144,0.005293,0.0,0.002739,0.0,0.023262,0.0,0.0,0.008548
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,0.05483,0.001287,0.024175,0.011042,0.125689,0.0,0.008057,0.0,0.005426,0.005772,0.03009,0.0,0.0,0.0,0.005405


In [34]:
# add back details
nmf_test = pd.DataFrame(vec_nmf_test, columns=['topic_'+ str(i)for i in range(1,topic_number+1)])
nmf_test.insert(loc=0, column='book_location', value=tfid_test['book_location'])
nmf_test.insert(loc=0, column='author_name', value=tfid_test['author_name'])
nmf_test.insert(loc=0, column='book_title', value=tfid_test['book_title'])
nmf_test.to_csv('../data/vectors/nmf_'+ n_gram+'_'+ stem_type +'_test.csv')
nmf_test.head()

Unnamed: 0,book_title,author_name,book_location,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15
0,Great Catherine,George Bernard Shaw,../data/gutenberg/George Bernard Shaw___Great ...,0.005446,0.0,0.002012,0.006462,0.022827,0.000574,0.010053,0.000253,0.017388,0.011574,0.0,0.008527,0.004229,0.036841,0.0
1,Short Cruises,William Wymark Jacobs,../data/gutenberg/William Wymark Jacobs___Shor...,0.014138,0.0,0.0,0.061661,0.010012,0.397639,0.004839,0.008816,0.0,0.0,0.0,0.0,0.002227,0.0,0.009618
2,Richard Dare's Venture,Edward Stratemeyer,../data/gutenberg/Edward Stratemeyer___Richard...,0.021555,0.0,0.0,0.018932,0.049229,0.0,0.001623,0.029185,0.0,0.0,0.0,0.0,0.005849,0.026485,0.0
3,William Harvey And The Discovery Of The Circul...,Thomas Henry Huxley,../data/gutenberg/Thomas Henry Huxley___Willia...,0.0,0.0,0.015286,0.0,0.003093,0.0,0.001815,0.004591,0.009108,0.008289,0.06898,0.001762,0.0,0.00447,0.0
4,Beyond,John Galsworthy,../data/gutenberg/John Galsworthy___Beyond.txt,0.153693,0.004038,0.0,0.046356,0.025651,0.0,0.0,0.0,0.006931,0.004107,0.0,0.0,0.0,0.0,0.021348


In [36]:
# find documents for topic
no_documents = 30
find_topic_books(nmf_train,no_documents)


------topic_1-------
                                       book_title                author_name
205                                 Women in Love               D H Lawrence
1991                                   Five Tales            John Galsworthy
674   The Country of the Blind, And Other Stories       Herbert George Wells
2189                              Tales of Unrest              Joseph Conrad
280                              Frontier Stories                 Bret Harte
2106                                  The Rainbow               D H Lawrence
1327                              The Dark Flower            John Galsworthy
69                 Villa Rubein and Other Stories            John Galsworthy
642                                   Novel Notes       Jerome Klapka Jerome
2023                    An Outcast of the Islands              Joseph Conrad
2340       The Door in the Wall And Other Stories       Herbert George Wells
1003                       In the Carquinez Woods     

In [37]:
# setup for visualization
vec_tfid=csr_matrix(tfid_train.iloc[:,3:].values)
pyLDAvis.sklearn.prepare(nmf_tfid,vec_tfid,tfid_vectorizer)

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
