In [6]:
from __future__ import print_function
# key libs
import numpy as np
import re
import nltk
import pandas as pd
import glob
import codecs

# nlp libs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 

# processing
from sklearn.model_selection import train_test_split

# LDA
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF


# bring in my pickled vectorizers
import pickle
import dill
# import import_ipynb
# from process_tokenize import my_tokenizer

In [7]:
# what are we interestd in
# set this up to read the correct set of files we are interested in

n_gram = '1gm' # options - 1gm , 2gm
stem_type = 'lemma' # options - lemma,snow

In [8]:
# set random state
RSEED = 0

### 1. OPEN - TERM FREQUENCY FILES 
i.e. output of CountVectorizer

In [9]:
tf_train = pd.read_csv('../data/vectors/cv_'+ n_gram +'_'+ stem_type +'_train.csv', compression = 'gzip')
tf_train = tf_train.drop(columns='Unnamed: 0')

tf_test = pd.read_csv('../data/vectors/cv_'+ n_gram +'_'+ stem_type +'_test.csv', compression = 'gzip')
tf_test = tf_test.drop(columns='Unnamed: 0')

In [10]:
tf_train.head()

Unnamed: 0,book_title,author_name,book_location,aaron,aback,abandon,abandoned,abandoning,abandonment,abasement,...,zealand,zealous,zealously,zenith,zephyr,zero,zest,zigzag,zone,zoological
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,0,0,2,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,3,0,0


In [11]:
# load the appropriate tf_vectorizer
tf_vectorizer = dill.load(open('../data/vectors/tf_vectorizer_'+ stem_type + '_' + str(n_gram), 'rb'))

### 2. OPEN - TERM FREQUENCY INVERSE DOCUMENT FREQUENCY FILES
i.e output of TfidfVectorizer 

In [64]:
tfid_train = pd.read_csv('../data/vectors/tfid_'+ n_gram +'_'+ stem_type +'_train.csv',compression = 'gzip')
tfid_train = tfid_train.drop(columns='Unnamed: 0')

tfid_test = pd.read_csv('../data/vectors/tfid_'+ n_gram +'_'+ stem_type +'_test.csv', compression = 'gzip')
tfid_test = tfid_test.drop(columns='Unnamed: 0')

In [65]:
tfid_train.head()

Unnamed: 0,book_title,author_name,book_location,aaron,aback,abandon,abandoned,abandoning,abandonment,abasement,...,zealand,zealous,zealously,zenith,zephyr,zero,zest,zigzag,zone,zoological
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,0.0,0.0,0.004126,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005299
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,0.0,0.0,0.006395,0.002774,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00448,0.0,0.0,0.0
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,0.0,0.0,0.0,0.0,0.0,0.0,0.005872,...,0.0,0.0,0.0,0.0,0.0,0.00554,0.0,0.0,0.0,0.0
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,0.0,0.0,0.0,0.013355,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03942,0.0,0.0


In [66]:
# load the appropriate tf_vectorizer
#tfid_vectorizer = pickle.load(open('../data/vectors/tfid_vectorizer_'+ stem_type +'_'+ n_gram +'.pkl', 'rb'))
tfid_vectorizer = dill.load(open('../data/vectors/tfid_vectorizer_'+ stem_type + '_' + str(n_gram), 'rb'))

### 3. HELPER FUNCTIONS

In [17]:
#function to display topics
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print (f'Topic: {topic_idx}')
        print (" , ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [56]:
#function to display books for a topic
def find_topic_books(df,number):
    topics = list(df.columns)[3:]
    #print(topics)
    for t in topics:
        temp = df.sort_values(by=t,ascending=False)
        print('\n------'+t+'-------') 
        print(temp.iloc[0:number+1,0:2])    

### 4. DIMENSIONALITY REDUCTION - TOPIC MODELLING
1. LDA
2. NMF
3. LSA / LSI ? - These are rather old and don't give good results


use the Term Frequency matrix as Term Frequency Inverse Document Frequency does not work well

In [19]:
# set how many topics you want
topic_number = 20

### A. LDA with TF
LatentDirichletAllocation with the Term Frequency Allocation
Use the Term Frequency matrix as Term Frequency Inverse Document Frequency does not work well
LDA can only use raw term counts for LDA because it is a probabilistic graphical model

In [20]:
# Run LDA 
lda_tf = LatentDirichletAllocation(learning_method= 'batch',n_components=topic_number, random_state=RSEED, batch_size=128,n_jobs=-1)
lda_tf.fit(tf_train.iloc[:,3:])

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=20, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [21]:
# transform vectors
vec_lda_train = lda_tf.transform(tf_train.iloc[:,3:])
vec_lda_test = lda_tf.transform(tf_test.iloc[:,3:])

In [22]:
dill.dump(lda_tf, open('../data/vectors/lda_'+ stem_type + '_' + n_gram,'wb'))

In [23]:
print('------PERPLEXITY------')
print(lda_tf.perplexity(tf_train.iloc[:,3:]))
print(lda_tf.perplexity(tf_test.iloc[:,3:]))

------PERPLEXITY------
4945.746276061271
5220.032446671448


In [49]:
# get the topics and words for the topics in lda
no_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names()
display_topics(lda_tf, tf_feature_names, no_top_words)

Topic: 0
thou , thy , thee , king , sir , lord , ti , hath , lady , god
Topic: 1
sir , lord , lady , manner , person , father , letter , family , master , gentleman
Topic: 2
girl , don , answered , sat , horse , suddenly , replied , rose , strange , slowly
Topic: 3
sea , boat , tree , ship , wind , island , big , dog , sun , red
Topic: 4
poet , poem , art , nature , line , footnote , beauty , note , letter , poetry
Topic: 5
king , father , sir , prince , horse , english , master , town , england , lord
Topic: 6
sir , lady , gentleman , dear , cried , answered , madame , miss , table , window
Topic: 7
tom , dick , boy , don , sam , answered , cried , jim , yes , didn
Topic: 8
don , mr , miss , yes , sir , boy , didn , girl , ain , won
Topic: 9
girl , mother , boy , peter , don , cried , anne , story , tree , dear
Topic: 10
money , class , law , trade , business , price , land , american , value , public
Topic: 11
government , general , power , law , war , public , lord , act , person , 

In [30]:
# add back details
lda_train = pd.DataFrame(vec_lda_train, columns=['topic_'+ str(i)for i in range(1,topic_number+1)])
lda_train.insert(loc=0, column='book_location', value=tf_train['book_location'])
lda_train.insert(loc=0, column='author_name', value=tf_train['author_name'])
lda_train.insert(loc=0, column='book_title', value=tf_train['book_title'])
lda_train.to_csv('../data/vectors/lda_'+ n_gram+'_'+ stem_type +'_train.csv')
lda_train.head()

Unnamed: 0,book_title,author_name,book_location,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,6e-06,6e-06,6e-06,0.056614,0.042713,6e-06,0.164503,...,0.066676,6e-06,6e-06,6e-06,6e-06,0.074134,6e-06,0.079391,6e-06,0.179807
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,3e-06,0.021875,3e-06,3e-06,3e-06,3e-06,0.034393,...,0.004022,3e-06,3e-06,3e-06,0.004022,0.060004,0.172095,3e-06,3e-06,0.112089
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,4e-06,0.009116,0.076108,0.098339,0.008048,0.055905,0.199755,...,0.015974,0.008912,0.01084,0.040935,4e-06,0.105113,0.047032,4e-06,4e-06,0.131171
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,9e-06,9e-06,0.412892,0.16695,0.017638,9e-06,0.024758,...,9e-06,0.000145,0.051583,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,0.01209
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,0.002563,0.082916,0.020869,0.07187,9e-06,9e-06,0.440103,...,0.000159,0.013008,0.017302,0.005694,0.07054,0.033966,0.009803,0.012162,0.007731,0.107179


In [31]:
# add the book details and write to file for test
lda_test = pd.DataFrame(vec_lda_test, columns=['topic_'+ str(i)for i in range(1,topic_number+1)])
lda_test.insert(loc=0, column='book_location', value=tf_test['book_location'])
lda_test.insert(loc=0, column='author_name', value=tf_test['author_name'])
lda_test.insert(loc=0, column='book_title', value=tf_test['book_title'])
lda_test.to_csv('../data/vectors/lda_'+ n_gram+'_'+ stem_type +'_test.csv')
lda_test.head()

Unnamed: 0,book_title,author_name,book_location,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20
0,Great Catherine,George Bernard Shaw,../data/gutenberg/George Bernard Shaw___Great ...,0.088418,1.2e-05,1.2e-05,1.2e-05,0.003382,0.000692,0.316283,...,1.2e-05,1.2e-05,1.2e-05,0.03818,1.2e-05,0.170649,0.129748,0.067013,1.2e-05,1.2e-05
1,Short Cruises,William Wymark Jacobs,../data/gutenberg/William Wymark Jacobs___Shor...,3e-06,3e-06,3e-06,0.013034,3e-06,3e-06,0.00117,...,3e-06,3e-06,0.006404,0.000572,3e-06,3e-06,0.048132,3e-06,3e-06,3e-06
2,Richard Dare's Venture,Edward Stratemeyer,../data/gutenberg/Edward Stratemeyer___Richard...,4e-06,4e-06,4e-06,4e-06,4e-06,0.15621,0.016494,...,0.046273,4e-06,4e-06,4e-06,4e-06,4e-06,0.0708,4e-06,4e-06,4e-06
3,William Harvey And The Discovery Of The Circul...,Thomas Henry Huxley,../data/gutenberg/Thomas Henry Huxley___Willia...,2.5e-05,2.5e-05,2.5e-05,2.5e-05,2.5e-05,0.028644,2.5e-05,...,0.006689,0.048928,2.5e-05,2.5e-05,0.539049,0.350062,2.5e-05,2.5e-05,0.008489,2.5e-05
4,Beyond,John Galsworthy,../data/gutenberg/John Galsworthy___Beyond.txt,0.004891,2e-06,0.097843,0.056868,0.000896,2e-06,0.071941,...,2e-06,2e-06,2e-06,2e-06,2e-06,2e-06,0.007421,2e-06,0.002691,0.453285


In [58]:
# find documents for topic
no_documents = 10
find_topic_books(lda_train,no_documents)


------topic_1-------
                       book_title                 author_name
180               Comic Tragedies           Louisa May Alcott
2140          The Last Tournament               Lord Tennyson
1321                Helen of Troy                 Andrew Lang
1594                Count Alarcos           Benjamin Disraeli
1707           Idylls of the King               Lord Tennyson
1915                      Marmion            Sir Walter Scott
1045                        Lamia                  John Keats
1203                Chamber Music                 James Joyce
524   Sonnets from the Portuguese  Elizabeth Barrett Browning
320                        Mosada        William Butler Yeats
668                     Cleopatra         Henry Rider Haggard

------topic_2-------
                    book_title       author_name
120      Trial of Duncan Terig  Sir Walter Scott
700   The Heart of Mid-Lothian  Sir Walter Scott
1311                   Rob Roy  Sir Walter Scott
776        Bride

In [20]:
# setup for visualization
vec_tf=csr_matrix(tf_train.iloc[:,3:].values)
pyLDAvis.sklearn.prepare(lda_tf,vec_tf,tf_vectorizer)

### B. NMF WITH TFID
NMF workd better with TFID

In [67]:
# Run NMF
nmf_tfid = NMF(n_components=topic_number, random_state=RSEED, max_iter=500)
nmf_tfid.fit(tfid_train.iloc[:,3:])

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=500,
  n_components=20, random_state=0, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [68]:
# transform vectors
vec_nmf_train = nmf_tfid.transform(tfid_train.iloc[:,3:])
vec_nmf_test = nmf_tfid.transform(tfid_test.iloc[:,3:])

In [69]:
dill.dump(nmf_tfid, open('../data/vectors/nmf_'+ stem_type + '_' + n_gram,'wb'))

In [70]:
no_top_words = 10
tfid_feature_names = tfid_vectorizer.get_feature_names()
display_topics(nmf_tfid, tfid_feature_names, no_top_words)

Topic: 0
answered , cried , madame , replied , monsieur , lip , girl , exclaimed , smile , continued
Topic: 1
thou , thy , thee , hath , ti , thine , hast , heaven , soul , art
Topic: 2
art , poet , public , character , literature , english , government , modern , society , author
Topic: 3
mr , husband , didn , couldn , wouldn , isn , won , doesn , wasn , girl
Topic: 4
sir , lord , gentleman , john , george , duke , harry , master , london , james
Topic: 5
s , wot , arter , ave , ginger , sam , bob , agin , ead , skipper
Topic: 6
captain , ship , boat , deck , island , vessel , board , crew , sail , mate
Topic: 7
dick , tom , sam , rover , baxter , dan , fred , cried , answered , cadet
Topic: 8
green , bird , hill , flower , sky , wood , mountain , sweet , blue , dream
Topic: 9
god , christ , lord , sin , jesus , soul , heaven , christian , holy , church
Topic: 10
specie , animal , darwin , plant , hypothesis , science , organic , natural , modification , bird
Topic: 11
officer , army 

In [71]:
# add back details
nmf_train = pd.DataFrame(vec_nmf_train, columns=['topic_'+ str(i)for i in range(1,topic_number+1)])
nmf_train.insert(loc=0, column='book_location', value=tfid_train['book_location'])
nmf_train.insert(loc=0, column='author_name', value=tfid_train['author_name'])
nmf_train.insert(loc=0, column='book_title', value=tfid_train['book_title'])
nmf_train.to_csv('../data/vectors/nmf_'+ n_gram+'_'+ stem_type +'_train.csv')
nmf_train.head()

Unnamed: 0,book_title,author_name,book_location,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,0.024754,0.0,0.030954,0.032591,0.081775,0.0,0.0,...,0.0,0.0,0.072496,0.0,0.043274,0.0,0.005502,0.0,0.053511,0.000891
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,0.010176,0.0,0.015237,0.047607,0.006928,0.00125,0.0,...,0.0,0.0,0.003213,0.0,0.0,0.0,0.241563,0.077373,0.007175,0.004169
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,0.044107,0.0,0.014085,0.0,0.0831,0.0,0.013311,...,0.00325,0.007783,0.030975,0.003627,0.005625,0.012087,0.013309,0.053859,0.0,0.019906
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,0.004341,0.0,0.0,0.022038,0.0,0.0,0.0,...,0.0,0.0,0.043628,0.0,0.00033,0.040634,0.0,0.0,0.0,0.0
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,0.06308,0.0,0.018698,0.000363,0.090716,0.001838,0.0102,...,0.025048,0.003055,0.004595,0.0,0.004226,0.008518,0.000547,0.056877,0.0,0.006891


In [72]:
# add back details
nmf_test = pd.DataFrame(vec_nmf_test, columns=['topic_'+ str(i)for i in range(1,topic_number+1)])
nmf_test.insert(loc=0, column='book_location', value=tfid_test['book_location'])
nmf_test.insert(loc=0, column='author_name', value=tfid_test['author_name'])
nmf_test.insert(loc=0, column='book_title', value=tfid_test['book_title'])
nmf_test.to_csv('../data/vectors/nmf_'+ n_gram+'_'+ stem_type +'_test.csv')
nmf_test.head()

Unnamed: 0,book_title,author_name,book_location,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20
0,Great Catherine,George Bernard Shaw,../data/gutenberg/George Bernard Shaw___Great ...,0.013377,0.001834,0.010979,0.002782,0.006235,0.00032,0.011939,...,0.0,0.009849,0.001425,0.00596,0.000819,0.0,0.00374,0.011713,0.060561,0.000653
1,Short Cruises,William Wymark Jacobs,../data/gutenberg/William Wymark Jacobs___Shor...,0.011489,0.0,0.0,0.053352,0.003168,0.394411,0.006154,...,0.0,0.0,0.008906,0.0,0.013697,0.0,0.004149,0.0372,0.0,0.0
2,Richard Dare's Venture,Edward Stratemeyer,../data/gutenberg/Edward Stratemeyer___Richard...,0.008674,0.0,0.0,0.018326,0.047545,0.0,0.0,...,0.0,0.0,0.034548,0.03914,0.0,0.0,0.0,0.0,0.0,0.000247
3,William Harvey And The Discovery Of The Circul...,Thomas Henry Huxley,../data/gutenberg/Thomas Henry Huxley___Willia...,0.00379,0.00026,0.023789,0.0,0.0,0.0,0.002772,...,0.063213,0.005,0.0,0.006096,0.0,0.0,0.0,0.0,0.0,0.002766
4,Beyond,John Galsworthy,../data/gutenberg/John Galsworthy___Beyond.txt,0.09543,0.0,0.0,0.03468,0.0,0.0,0.0,...,0.0,0.0,0.049041,0.0,0.001582,0.0,0.00401,0.057867,0.0,0.0


In [73]:
# find documents for topic
no_documents = 10
find_topic_books(nmf_train,no_documents)


------topic_1-------
                       book_title                author_name
254                 The Missioner  Edward Phillips Oppenheim
550              The Great Secret  Edward Phillips Oppenheim
1037                  The Avenger  Edward Phillips Oppenheim
1854            The Yellow Crayon  Edward Phillips Oppenheim
2127       The Vanished Messenger  Edward Phillips Oppenheim
2179         Mysterious Mr. Sabin  Edward Phillips Oppenheim
984             The Master Mummer  Edward Phillips Oppenheim
564                      Berenice  Edward Phillips Oppenheim
1319                        Havoc  Edward Phillips Oppenheim
1672  The Trampling of the Lilies            Rafael Sabatini
791             Mr. Marx's Secret  Edward Phillips Oppenheim

------topic_2-------
                             book_title                 author_name
180                     Comic Tragedies           Louisa May Alcott
2158  The Poetical Works of John Milton                 John Milton
524         Sonnets 

In [74]:
# setup for visualization
vec_tfid=csr_matrix(tfid_train.iloc[:,3:].values)
pyLDAvis.sklearn.prepare(nmf_tfid,vec_tfid,tfid_vectorizer)

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
