# Mount Google Drive

In [2]:
# Mount Google Drive at /content/drive so that the chapter files read later
# (under '/content/drive/My Drive/...') are reachable from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Double-check that the drive folder is mounted to Colab:
# the listing of the working directory should contain a `drive` entry
!ls

drive  sample_data


# Import Modules

In [None]:
# Install pyLDAvis into the runtime; the import cell below imports it
# NOTE(review): no version is pinned here, so the installed release may change between runs - consider pinning
!pip install pyLDAvis

In [6]:
import pickle
import glob
import spacy
import numpy as np
import pandas as pd
import gensim
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pyLDAvis import sklearn as sklearn_lda
import pyLDAvis
import matplotlib.pyplot as plt
from datetime import datetime

# Import data

In [25]:
# Paths of the five book files (one text file per book)
paths = ['/content/drive/My Drive/ECE692_NLP/data_chapters/after.txt', '/content/drive/My Drive/ECE692_NLP/data_chapters/circular.txt',
         '/content/drive/My Drive/ECE692_NLP/data_chapters/jennie.txt', '/content/drive/My Drive/ECE692_NLP/data_chapters/man.txt',
         '/content/drive/My Drive/ECE692_NLP/data_chapters/window.txt']

# Read in data: every line of a file becomes one list item (trailing newline stripped).
# The `with` block closes each file handle as soon as the file has been read,
# instead of leaving it open until garbage collection.
all_books = []
for path in paths:
  with open(path) as book_file:
    all_books.append([line.rstrip('\n') for line in book_file])

# Create a list per book (same order as `paths`)
# Each item in the list represents a chapter
after, circular, jennie, man, window = all_books

print()
print(after[:2])


[' By the bequest of an elder brother, I was left enough money to see me through a small college in Ohio, and to secure me four years in a medical school in the East.  Why I chose medicine I hardly know. Possibly the career of a surgeon attracted the adventurous element in me.  Perhaps, coming of a family of doctors, I merely followed the line of least resistance.  It may be, indirectly but inevitably, that I might be on the yacht Ella on that terrible night of August 12, more than a year ago.  I got through somehow.  I played quarterback on the football team, and made some money coaching.  In summer I did whatever came to hand, from chartering a sail-boat at a summer resort and taking passengers, at so much a head, to checking up cucumbers in Indiana for a Western pickle house.  I was practically alone.  Commencement left me with a diploma, a new dress-suit, an out-of-date medical library, a box of surgical instruments of the same date as the books, and an incipient case of typhoid f

# Tokenize and clean data using gensim’s simple_preprocess()

Tokenize each chapter into a list of lowercase words, removing punctuation and other unnecessary characters.

In [26]:
def sent_to_words(sentences):
    """Lazily tokenize every item of `sentences` with gensim's simple_preprocess.

    Each item is converted to `str` first; one list of tokens is yielded per
    item. `deacc=True` strips accent marks from the tokens (punctuation is
    already dropped by the tokenizer itself).
    """
    for text in sentences:
        tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
        yield tokens

# Tokenize every book, chapter by chapter; each result is a list of token lists
(after_tokenized,
 circular_tokenized,
 jennie_tokenized,
 window_tokenized,
 man_tokenized) = [list(sent_to_words(book)) for book in (after, circular, jennie, window, man)]

print(after_tokenized[:2])

[['by', 'the', 'bequest', 'of', 'an', 'elder', 'brother', 'was', 'left', 'enough', 'money', 'to', 'see', 'me', 'through', 'small', 'college', 'in', 'ohio', 'and', 'to', 'secure', 'me', 'four', 'years', 'in', 'medical', 'school', 'in', 'the', 'east', 'why', 'chose', 'medicine', 'hardly', 'know', 'possibly', 'the', 'career', 'of', 'surgeon', 'attracted', 'the', 'adventurous', 'element', 'in', 'me', 'perhaps', 'coming', 'of', 'family', 'of', 'doctors', 'merely', 'followed', 'the', 'line', 'of', 'least', 'resistance', 'it', 'may', 'be', 'indirectly', 'but', 'inevitably', 'that', 'might', 'be', 'on', 'the', 'yacht', 'ella', 'on', 'that', 'terrible', 'night', 'of', 'august', 'more', 'than', 'year', 'ago', 'got', 'through', 'somehow', 'played', 'quarterback', 'on', 'the', 'football', 'team', 'and', 'made', 'some', 'money', 'coaching', 'in', 'summer', 'did', 'whatever', 'came', 'to', 'hand', 'from', 'chartering', 'sail', 'boat', 'at', 'summer', 'resort', 'and', 'taking', 'passengers', 'at', 's

# Lemmatization

Convert each word to its root form (lemma).

In [27]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), nlp_model=None):
    """Lemmatize tokenized documents, keeping only words with an allowed POS tag.

    Tag set reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of list of str
        One list of tokens per document.
    allowed_postags : collection of str
        Values of `token.pos_` to keep; every other token is dropped.
    nlp_model : callable, optional
        Pipeline applied to each space-joined document. When omitted, the
        module-level `nlp` object is used.

    Returns
    -------
    list of str
        One string per input document: the kept lemmas joined by single spaces.
    """
    # Resolve the pipeline at call time so the function does not depend on
    # `nlp` already existing when it is defined.
    pipeline = nlp if nlp_model is None else nlp_model
    lemmatized_texts = []
    for sent in texts:
      doc = pipeline(" ".join(sent))
      # Skip '-PRON-' placeholder lemmas entirely; emitting '' for them
      # would leave stray double spaces in the joined string.
      lemmas = [token.lemma_ for token in doc
                if token.pos_ in allowed_postags and token.lemma_ != '-PRON-']
      lemmatized_texts.append(" ".join(lemmas))
    return lemmatized_texts

# Initialize the spaCy English model, disabling the parser and NER components (for efficiency).
# The model is loaded by its full package name: the 'en' shortcut link only exists in spaCy v2.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatize all books keeping only Noun, Adj, Verb, Adverb
pos_to_keep = ['NOUN', 'ADJ', 'VERB', 'ADV']
after_lemmatized = lemmatization(after_tokenized, allowed_postags=pos_to_keep)
circular_lemmatized = lemmatization(circular_tokenized, allowed_postags=pos_to_keep)
jennie_lemmatized = lemmatization(jennie_tokenized, allowed_postags=pos_to_keep)
window_lemmatized = lemmatization(window_tokenized, allowed_postags=pos_to_keep)
man_lemmatized = lemmatization(man_tokenized, allowed_postags=pos_to_keep)

# NOTE: this order (window before man) differs from the order of `paths`;
# the corpus built from this list follows the order given here.
all_books_lemmatized = [after_lemmatized, circular_lemmatized, jennie_lemmatized, window_lemmatized, man_lemmatized]

print(after_lemmatized[:2])

['bequest eld brother leave enough money see small secure year medical school east why choose medicine hardly know possibly career surgeon attract adventurous element perhaps come family doctor merely follow line least resistance may indirectly inevitably may yacht ella terrible night more year ago get somehow play quarterback football team make money coach summer come hand charter sail boat summer resort take passenger so much head check cucumber western practically alone commencement leave suit out date medical library surgical instrument same date book incipient case typhoid fever foot tall forty inch chest also live clean work play hard get fever finally pretty much bone appetite alive thank college hospital care cost good thing just dollar world yacht ella lie river far hospital window yacht when first see time technically use word broad sense pleasure boat master when see first dirty disreputable most coast vessel rejuvenation history convalescence day stand first coat white pain

In [28]:
# Flatten the per-book chapter lists into one corpus: one document per chapter,
# books concatenated in the order of all_books_lemmatized
all_chapters_processed = [chapter for book in all_books_lemmatized for chapter in book]

print('Total number of chapters/documents (corpus size): ', len(all_chapters_processed))

Total number of chapters/documents (corpus size):  131


# Create document-word matrix

The LDA model requires a document-word matrix as input. The CountVectorizer is used.

In [29]:
vectorizer = CountVectorizer(lowercase=True,                   # convert words to lowercase
                             stop_words='english',             # remove English stop words
                             analyzer='word',                  # features are built from words
                             min_df=1,                         # keep every term that appears in >= 1 document
                             token_pattern='[a-zA-Z0-9]{3,}',  # tokens of at least 3 alphanumeric chars
                            )

X = vectorizer.fit_transform(all_chapters_processed)  # learn the vocabulary dictionary and return document-term matrix

# list of feature names; get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() whenever the installed version offers it
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

print('vocabulary size: ', len(feature_names))
print('shape of document-word matrix: ', X.shape)

vocabulary size:  8415
shape of document-word matrix:  (131, 8415)


# Grid search to find the optimal number of topics

In [None]:
# Candidate topic counts to compare
search_parameters = dict(n_components=[5, 10, 15])

# Base LDA estimator (online learning, fixed seed) wrapped in a grid search
lda = LatentDirichletAllocation(
    learning_method='online',
    batch_size=16,
    max_iter=10,
    random_state=0,
    n_jobs=-1,
)
clf = GridSearchCV(estimator=lda, param_grid=search_parameters, verbose=3)

# Fit every candidate on the document-word matrix
grid_result = clf.fit(X)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] n_components=5 ..................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................ n_components=5, score=-169667.240, total=   2.8s
[CV] n_components=5 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.8s remaining:    0.0s


[CV] ................ n_components=5, score=-153260.103, total=   1.2s
[CV] n_components=5 ..................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.0s remaining:    0.0s


[CV] ................ n_components=5, score=-169023.487, total=   1.2s
[CV] n_components=5 ..................................................
[CV] ................ n_components=5, score=-197332.510, total=   1.2s
[CV] n_components=5 ..................................................
[CV] ................ n_components=5, score=-167885.458, total=   1.2s
[CV] n_components=10 .................................................
[CV] ............... n_components=10, score=-176407.351, total=   1.9s
[CV] n_components=10 .................................................
[CV] ............... n_components=10, score=-159459.375, total=   1.7s
[CV] n_components=10 .................................................
[CV] ............... n_components=10, score=-176290.482, total=   1.7s
[CV] n_components=10 .................................................
[CV] ............... n_components=10, score=-205449.338, total=   1.7s
[CV] n_components=10 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   27.3s finished


In [None]:
# Best model found by the grid search
best_lda_model = grid_result.best_estimator_

# Parameters of best model
print("Optimal model parameters: ", grid_result.best_params_)

# Log likelihood score of the best candidate, averaged over the CV folds (the higher, the better)
print("Best log likelihood score: ", grid_result.best_score_) 

# Perplexity (the lower, the better); evaluated on X, the same matrix the
# grid search was fitted on, so this is not a held-out estimate
print("Model perplexity: ", best_lda_model.perplexity(X))

Optimal model parameters:  {'n_components': 5}
Best log likelihood score:  -171433.75967305974
Model perplexity:  2059.4080153870905


# Build LDA model

In [None]:
# Build the LDA model with 5 topics
lda_model = LatentDirichletAllocation(
    n_components=5,            # number of topics
    learning_method='online',
    batch_size=16,
    max_iter=10,               # max learning iterations
    evaluate_every=1,          # report perplexity at every iteration
    random_state=0,            # fixed seed
    n_jobs=-1,                 # use all available CPUs
    verbose=1,
)

# Show the model attributes, then fit and get the per-document topic distribution
print(lda_model)
lda_output = lda_model.fit_transform(X)

LatentDirichletAllocation(batch_size=16, doc_topic_prior=None, evaluate_every=1,
                          learning_decay=0.7, learning_method='online',
                          learning_offset=10.0, max_doc_update_iter=100,
                          max_iter=10, mean_change_tol=0.001, n_components=5,
                          n_jobs=-1, perp_tol=0.1, random_state=0,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=1)
iteration: 1 of max_iter: 10, perplexity: 2323.7664
iteration: 2 of max_iter: 10, perplexity: 2181.9956
iteration: 3 of max_iter: 10, perplexity: 2129.7938
iteration: 4 of max_iter: 10, perplexity: 2105.8742
iteration: 5 of max_iter: 10, perplexity: 2092.1078
iteration: 6 of max_iter: 10, perplexity: 2082.0312
iteration: 7 of max_iter: 10, perplexity: 2074.0596
iteration: 8 of max_iter: 10, perplexity: 2067.9186
iteration: 9 of max_iter: 10, perplexity: 2063.1785
iteration: 10 of max_iter: 10, perplexity: 2059.40

# Display topics

In [None]:
# Show top n words for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=10):
    """Return the `n_words` highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer whose feature names form the vocabulary.
    lda_model : fitted model exposing `components_` (one row of word weights per topic).
    n_words : int
        Number of top terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per topic, terms sorted by decreasing weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2; use
    # get_feature_names_out() when available and fall back otherwise.
    if hasattr(vectorizer, 'get_feature_names_out'):
        keywords = np.asarray(vectorizer.get_feature_names_out())
    else:
        keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # argsort of the negated weights lists indices by decreasing weight
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Top 10 words of every topic of the fitted model
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=10)

# Topic - Words dataframe: one row per topic, one column per word rank
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word {}'.format(rank) for rank in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic {}'.format(topic) for topic in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,say,man,look,burn,deck,watch,key,door,hand,stand
Topic 1,sand,sea,granger,dolly,letitia,molasse,sailor,mcwhirter,youth,vessel
Topic 2,say,come,know,man,look,think,door,room,night,time
Topic 3,deck,captain,cabin,crew,burn,ship,mate,vail,axe,forecastle
Topic 4,porter,berth,sail,rain,roan,hotchkiss,ship,bronze,sea,mcwhirter


In [None]:
## Books related to words from topics based on background research
# The After House: sailor, mcwhirter, vessel, captain, cabin, crew, burn, ship, mate, vail, axe, forecastle, sea, key, deck 
# The Circular Staircase: burn, key
# The Case of Jennie Brice: rain, key
# The Window at the White Cat: letitia, molasses, rain, key, watch
# The Man In Lower Ten: granger, dolly, burn, porter, berth, hotchkiss, bronze, sea, key

# Dominant topics per document

In [None]:
# Create document-topic matrix: one row per chapter, one column per topic
lda_output = lda_model.transform(X)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

# index names
doc_chapters = ['Chapter' + str(i) for i in range(X.shape[0])]

# Make the pandas dataframe (values rounded to 2 decimals for display)
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=doc_chapters)
#print(df_document_topic)

# Get dominant topic for each document/chapter; computed on the unrounded
# values so that rounding cannot create artificial ties
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Book titles, in the order the books were concatenated into the corpus
# (the order of all_books_lemmatized)
novel_ls = ['The After House', 'The Circular Staircase', 'The Case of Jennie Brice',
            'The Window at the White Cat', 'The Man In Lower Ten']

# Row range of every book, derived from the number of chapters per book instead
# of hard-coded offsets (the previous last slice [100:130] left out the final chapter)
book_sizes = [len(book) for book in all_books_lemmatized]
book_ends = np.cumsum(book_sizes)
book_starts = book_ends - book_sizes
assert len(book_sizes) == len(novel_ls), "one title is expected per book"
assert book_ends[-1] == len(df_document_topic), "book sizes must add up to the corpus size"

# Get majority vote of dominant topic per book
b1, b2, b3, b4, b5 = [df_document_topic['dominant_topic'].iloc[start:end].mode()
                      for start, end in zip(book_starts, book_ends)]

# When several topics tie, the first (smallest) mode is reported
df_book_topic = pd.DataFrame([list(votes)[0] for votes in (b1, b2, b3, b4, b5)],
                             columns=['Dominant Topic'], index=novel_ls)
display(df_book_topic)

# Styling
def color_green(val):
    """Return the CSS text color for a cell: green above 0.1, black otherwise."""
    if val > .1:
        return 'color: green'
    return 'color: black'

def make_bold(val):
    """Return the CSS font weight for a cell: 700 (bold) above 0.1, 400 otherwise."""
    if val > .1:
        return 'font-weight: ' + str(700)
    return 'font-weight: ' + str(400)

# Apply both cell-wise styles (green text and bold for values above 0.1) and display the styled table
df_document_topics = (
    df_document_topic.style
    .applymap(color_green)
    .applymap(make_bold)
)
df_document_topics

Unnamed: 0,Dominant Topic
The After House,2
The Circular Staircase,2
The Case of Jennie Brice,2
The Window at the White Cat,2
The Man In Lower Ten,2


Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Chapter0,0.0,0.0,0.59,0.01,0.39,2
Chapter1,0.0,0.22,0.69,0.09,0.0,2
Chapter2,0.0,0.0,0.79,0.21,0.0,2
Chapter3,0.0,0.0,0.81,0.13,0.06,2
Chapter4,0.0,0.0,0.8,0.2,0.0,2
Chapter5,0.0,0.0,0.83,0.17,0.0,2
Chapter6,0.0,0.0,0.76,0.23,0.0,2
Chapter7,0.0,0.02,0.89,0.09,0.0,2
Chapter8,0.0,0.0,0.87,0.13,0.0,2
Chapter9,0.0,0.0,0.82,0.18,0.0,2
