In [1]:
import urllib.request
sonnetsUrl = "http://www.gutenberg.org/cache/epub/1041/pg1041.txt"
sonnetsString = urllib.request.urlopen(sonnetsUrl).read().decode()

In [2]:
import re, os
pythonfilteredSonnetsStart = sonnetsString.find("  I\r\n") # title of first sonnet
filteredSonnetsEnd = sonnetsString.find("End of Project Gutenberg's") # end of sonnets
filteredSonnetsString = sonnetsString[pythonfilteredSonnetsStart:filteredSonnetsEnd].rstrip()
sonnetsList = re.split("  [A-Z]+\r\n\r\n", filteredSonnetsString)
sonnetsPath = 'sonnets' # this subdirectory will be relative to the current notebook
if not os.path.exists(sonnetsPath):
    os.makedirs(sonnetsPath)
for index, sonnet in enumerate(sonnetsList): # loop through our list as enumeration to get index
    if len(sonnet.strip()) > 0: # make sure we have text, not empty after stripping out whitespace
        filename = str(index).zfill(3)+".txt" # create filename from index
        pathname = os.path.join(sonnetsPath, filename) # directory name and filenamee
        f = open(pathname, "w")
        f.write(sonnet.rstrip()) # write out our sonnet into the file
        f.close()

In [3]:
from nltk.corpus import PlaintextCorpusReader
sonnetsCorpus = PlaintextCorpusReader("sonnets", ".*\.txt")
print(len(sonnetsCorpus.fileids()))

154


In [4]:
import nltk

def get_lists_of_words(corpus, **kwargs): # the ** in front of kwargs does the magic of keyword arguments
    documents = [] # list of documents where each document is a list of words
    for fileid in corpus.fileids(): # go trough each file in our corpus
        
        # keep only words and convert them to lowercase
        words = [token.lower() for token in corpus.words(fileid) if any([c for c in token if c.isalpha()])]
        
        # look for "minLength" in our keyword arguments and if it's defined, filter our list
        if "minLen" in kwargs and kwargs["minLen"]: 
            words = [word for word in words if len(word) >= kwargs["minLen"]]
        
        # look for "stopwords" in our keyword arguments and if any are defined, filter our list
        if "stopwords" in kwargs and kwargs["stopwords"]: 
            words = [word for word in words if word not in kwargs["stopwords"]]

        # look for "pos" in our keyword arguments and if any are defined, filter our list
        if "pos" in kwargs and kwargs["pos"]: 
            tagged = nltk.pos_tag(words)
            words = [word for word, pos in tagged if pos in kwargs["pos"]]
        
        documents.append(words) # add our list of words
    
    return documents # return our list of documents

In [5]:
sonnetsStopwords = nltk.corpus.stopwords.words('english') # load the default stopword list
sonnetsStopwords += ["thee", "thou", "thy"] # append a few more obvious words
sonnetsWords = get_lists_of_words(sonnetsCorpus, stopwords=sonnetsStopwords, minLen=3)

# have a peek:
for i in range(0,2): # first two documents
    print("document", str(i), sonnetsWords[i][0:5])

document 0 ['fairest', 'creatures', 'desire', 'increase', 'thereby']
document 1 ['forty', 'winters', 'shall', 'besiege', 'brow']


In [6]:
from gensim import corpora, models

def get_lda_from_lists_of_words(lists_of_words, **kwargs):
    dictionary = corpora.Dictionary(lists_of_words) # this dictionary maps terms to integers
    corpus = [dictionary.doc2bow(text) for text in lists_of_words] # create a bag of words from each document
    tfidf = models.TfidfModel(corpus) # this models the significance of words by document
    corpus_tfidf = tfidf[corpus]
    kwargs["id2word"] = dictionary # set the dictionary
    return models.LdaModel(corpus_tfidf, **kwargs) # do the LDA topic modelling

In [7]:
sonnetsLda = get_lda_from_lists_of_words(sonnetsWords, num_topics=10, passes=20) # small corpus, so more passes
print(sonnetsLda)

LdaModel(num_terms=2911, num_topics=10, decay=0.5, chunksize=2000)


In [8]:
def print_top_terms(lda, num_terms=10):
    for i in range(0, lda.num_topics):
        terms = [term for val, term in lda.show_topic(i, num_terms)]
        print("Top 10 terms for topic #", str(i), ": ", ", ".join(terms))

In [9]:
print_top_terms(sonnetsLda)

TypeError: sequence item 0: expected str instance, numpy.float64 found

Ugh. I had some problems trying to install gensim, so maybe this is a problem from that? Ask for help, definitely. I guess I can see if I can proceed with this book for now, though. Note that I should refresh my experience with Mallet if I'm going to be doing topic modeling as well. 

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
G = nx.Graph()
G.add_edge("A", "X") # student A went to school X
G.add_edge("A", "Y") # student A went to school Y
G.add_edge("B", "X") # student B went to school X
G.add_edge("C", "Y") # student C went to school X
nx.draw(G)

In [None]:
pos = nx.spring_layout(G)
nx.draw_networkx_labels(G, pos, font_color='r') # font colour is "r" for red
nx.draw_networkx_edges(G, pos, alpha=0.1) # set the line alpha transparency to .1
plt.axis('off') # don't show the axes for this plot
plt.show()

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

def graph_terms_to_topics(lda, num_terms=10):
    
    # create a new graph and size it
    G = nx.Graph()
    plt.figure(figsize=(10,10))

    # generate the edges
    for i in range(0, lda.num_topics):
        topicLabel = "topic "+str(i)
        terms = [term for val, term in lda.show_topic(i, num_terms)]
        for term in terms:
            G.add_edge(topicLabel, term)
    
    pos = nx.spring_layout(G) # positions for all nodes
    
    # we'll plot topic labels and terms labels separately to have different colours
    g = G.subgraph([topic for topic, _ in pos.items() if "topic " in topic])
    nx.draw_networkx_labels(g, pos,  font_color='r')
    g = G.subgraph([term for term, _ in pos.items() if "topic " not in term])
    nx.draw_networkx_labels(g, pos)
    
    # plot edges
    nx.draw_networkx_edges(G, pos, edgelist=G.edges(), alpha=0.1)

    plt.axis('off')
    plt.show()

graph_terms_to_topics(sonnetsLda)

Welp, and I guess there it goes. That's unfortunate. Get help re: this - probably an install or out-of-date problem. 

### Next Steps

Try the following tasks to see if you can refine the topics:

- experiment with arguments to get_lists_of_words()
    - minLength of words
    - stopwords
    - parts-of-speech arguments (remember that these are Treebank codes)
    - add an argument to the function (and try it) that determines if words are converted to lowercase
- experiment with arguments to get_lda_from_lists_of_words(), in other words to LdaModel()

Which tweaks seem to make the most difference?