In [43]:
# Installs requirements in case they are missing
%pip install spacy gensim pyLDAvis
import sys
import os

if sys.platform.startswith('win'):
    os.system('python -m spacy download en_core_web_md')
elif sys.platform.startswith('darwin') or sys.platform.startswith('linux'):
    os.system('python3 -m spacy download en_core_web_md')

Note: you may need to restart the kernel to use updated packages.
Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [34]:
import glob
import spacy
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim

# Load the spaCy pipeline once at module level; preprocess() reuses this
# shared `nlp` object for every document.
nlp = spacy.load('en_core_web_md')

def preprocess(text):
    """Run ``text`` through the shared spaCy pipeline and keep its content lemmas.

    A token contributes its lemma only when it is neither a stop word nor
    punctuation and the lemma consists solely of alphabetic characters.

    Args:
        text: Raw text to process.

    Returns:
        list[str]: The retained lemmas, in document order.
    """
    return [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct) and tok.lemma_.isalpha()
    ]

def create_lda_model(file_paths, num_topics=10, passes=15, random_state=42, encoding='utf-8'):
    """Train an LDA topic model on a collection of text files.

    Each file is read in full, reduced to a list of lemmas with
    ``preprocess``, and converted to a bag-of-words vector over a dictionary
    built from all files.

    Args:
        file_paths: Iterable of paths to the text files; one file is one document.
        num_topics: Number of topics passed to ``LdaModel``.
        passes: Number of training passes passed to ``LdaModel``.
        random_state: Seed passed to ``LdaModel`` so repeated runs on the same
            input give the same topics. Pass ``None`` for an unseeded run.
        encoding: Text encoding used to read the files. It is set explicitly
            because the platform default differs between operating systems.

    Returns:
        tuple: ``(lda_model, corpus, dictionary)`` where ``corpus`` is the list
        of bag-of-words vectors and ``dictionary`` the ``corpora.Dictionary``
        they were built from.

    Raises:
        ValueError: If ``file_paths`` is empty.
    """
    file_paths = list(file_paths)
    if not file_paths:
        # An empty glob result would otherwise surface as an obscure error
        # from deep inside model training.
        raise ValueError(
            "create_lda_model received no input files; "
            "check the glob pattern and the working directory."
        )

    cleaned_texts = []
    for path_book_name in file_paths:
        with open(path_book_name, 'r', encoding=encoding) as file:
            cleaned_texts.append(preprocess(file.read()))

    dictionary = corpora.Dictionary(cleaned_texts)
    corpus = [dictionary.doc2bow(text) for text in cleaned_texts]

    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    return lda_model, corpus, dictionary

# Create LDA model for Marlowe books.
# glob.glob returns paths in an unspecified, filesystem-dependent order;
# sorting fixes the document order so every run feeds the model the same corpus.
marlowe_files = sorted(glob.glob("books/Marlowe-reg/*.txt"))
marlowe_lda_model, marlowe_corpus, marlowe_dictionary = create_lda_model(marlowe_files)

# Create LDA model for Shakespeare books (same deterministic ordering).
shakespeare_files = sorted(glob.glob("books/Shakespeare/*.txt"))
shakespeare_lda_model, shakespeare_corpus, shakespeare_dictionary = create_lda_model(shakespeare_files)

# Prepare the visualizations
marlowe_vis = pyLDAvis.gensim.prepare(marlowe_lda_model, marlowe_corpus, marlowe_dictionary)
shakespeare_vis = pyLDAvis.gensim.prepare(shakespeare_lda_model, shakespeare_corpus, shakespeare_dictionary)



The Intertopic Distance Map is a visualization tool that is often used in topic modeling to show the relationships between different topics. It's created using a technique called multidimensional scaling (MDS).

In this map, each topic is represented as a bubble, and the distance between two bubbles reflects how similar the topics are: topics that sit closer together are more similar. The size of a bubble represents the prevalence of the topic in the corpus.

The map is created using multidimensional scaling, a technique that takes a high-dimensional representation of the data (in this case, the topics in the corpus) and reduces it to two dimensions in a way that preserves the relative distances between points as much as possible. This allows us to visualize the relationships between topics in a way that's easy to understand.

In the context of the `pyLDAvis` library, when you hover over a topic in the Intertopic Distance Map, the bar chart on the right updates to show the 30 most relevant terms for the selected topic. The red bars represent the frequency of a term within a given topic, and the blue bars represent a term's frequency across the entire corpus. This helps you understand what each topic is about.
Source: pyLDAvis documentation

In [35]:
# Display the interactive visualization prepared for the Marlowe model.
# The display call is kept as the cell's last expression so that its
# return value is rendered as the cell output.
print("Marlowe Topics:")
pyLDAvis.display(marlowe_vis)


Marlowe Topics:


In [36]:
# Display the interactive visualization prepared for the Shakespeare model.
# The display call is kept as the cell's last expression so that its
# return value is rendered as the cell output.
print("Shakespeare Topics:")
pyLDAvis.display(shakespeare_vis)

Shakespeare Topics:
