In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import stop_words

from nltk.tokenize import sent_tokenize, word_tokenize

import os, sys
from glob import glob
import numpy as np
import gc 

In [27]:
def preprocess(raw_text):
    # tokenize
    tokens = word_tokenize(raw_text)    

    # drop to lowercase
    tokens = [word.lower() for word in tokens]
        
    # *step two* (default): remove non-alpha characters,
    # punctuation, and as many other "noise" elements as
    # possible. If dealing with a single character word,    
    # drop non-alphabetical characters. This will remove 
    # most punctuation but preserve many words containing
    # marks such as the '-' in 'self-emancipated'

    tmp_text=list()

    for word in tokens:
        if len(word) == 1:
            if word.isalpha == True:
                tmp_text.append(word)
        else:
             tmp_text.append(word)           
    tokens = tmp_text

    # now remove leading and trailing quotation marks,      
    # hyphens and  dashes
    tmp_text=list()
    drop_list = ['“','"','”','-','—']
    for i, word in enumerate(tokens):
        if word[0] in drop_list:
            word = word[1:]
        if word[-1:] in drop_list:
            word = word[:-1]
        
        word = word.replace("gyftis", "gifts")
        word = word.replace("gether", "gather")
        word = word.replace("spirituall", "spiritual")
        word = word.replace("feythfull", "faith")
        word = word.replace("wytnes", "witness")
        word = word.replace("almes", "alms")
        word = word.replace("desyre", "desire")
        word = word.replace("selfe", "self")
        word = word.replace("saffely", "safely")
        word = word.replace("realme", "realm")
        word = word.replace("acte", "act")
        word = word.replace("fourme", "form")
        word = word.replace("subiectes", "subjects")
        word = word.replace("theyr", "their")
        word = word.replace("kynde", "kind")
        word = word.replace("kynge", "king")
        word = word.replace("kyndes", "kinds")
        word = word.replace("vpon", "unto")
        word = word.replace("purueyours", "purveyors")
        word = word.replace("highnes", "highness")
        word = word.replace("euery", "every")
        word = word.replace("quene", "queen")
        word = word.replace("quenes", "queens")
        word = word.replace("whiche", "which")
        word = word.replace("bloude", "blood")
        word = word.replace("soueraine", "sovereign")
        
        if word.isdigit():
            word = ""
        
        # catch any zero-length words remaining
        if len(word) > 0:
            tmp_text.append(word)
        
    return(tmp_text)

In [28]:
import gzip

ccp_models = dict()

input_texts = ["../texts/henry/henry.tar.gz",
              "../texts/edward/edward.tar.gz",
              "../texts/mary/mary.tar.gz",
              "../texts/elizabeth/elizabeth.tar.gz",
              "../texts/james/james.tar.gz",
              "../texts/charles/charles.tar.gz"]

# setup vectorizer and process text

for fp in input_texts:
    
    model_name = os.path.basename(fp).split(".")[0]
    print("starting: {0}".format(model_name))

    print("loading gzipped texts...")
    raw_text = gzip.open(fp,'rt').read()
    
    print("preprocessing...")
    tokens = preprocess(raw_text)

    # simulate documents
    print("segmenting...")
    collection = list()
    segment_length = int(len(tokens)/1000)
    
    for j in range(1000):
        segment = tokens[segment_length*j:segment_length*(j+1)]
        collection.append(' '.join(segment))
        
    # free up memory
    del raw_text
    gc.collect()
    
    vec = CountVectorizer(input='content',
                      min_df=2,
                      stop_words = [",","the","and","of","or","to","in","shall","be","that","any","by",".",
              "such","as","this","for","same","all","said","other","'s",";",
              "her","is","every","[","]","they","within", "our", "not", "so",
              "made", "no", "then", ":", "do", "from", "if", "it", "which", "at", "with",
             "thereof","upon", "a", "because", "used", "some", "but", "aforesaid", "also",
             ")","(", "what", "&", "may", "are", "their", "them", "sayde", "suche", "shalbe", "anye", "sayd",
             "thesaid", "/", "...", "/", "either"],
                      lowercase=True)

    # train model (LDA)
    counts = vec.fit_transform(collection)
    dc, vc = counts.shape
    print("read {0} documents with {1} vocabulary".format(dc,vc))
    
    # Build the LDA Model
    # n_components = number of topics to extract (if topics are too similar, extract more)
 
    ccp_model = LatentDirichletAllocation(n_components=2,
                                            max_iter=5,
                                            learning_method='batch',
                                            random_state=1)
    ccp_models[model_name] = [ccp_model, counts, vec]    

starting: henry
loading gzipped texts...
preprocessing...
segmenting...
read 1000 documents with 1713 vocabulary
starting: edward
loading gzipped texts...
preprocessing...
segmenting...
read 1000 documents with 6313 vocabulary
starting: mary
loading gzipped texts...
preprocessing...
segmenting...
read 1000 documents with 442 vocabulary
starting: elizabeth
loading gzipped texts...
preprocessing...
segmenting...
read 1000 documents with 5771 vocabulary
starting: james
loading gzipped texts...
preprocessing...
segmenting...
read 1000 documents with 9247 vocabulary
starting: charles
loading gzipped texts...
preprocessing...
segmenting...
read 1000 documents with 1509 vocabulary


In [30]:
for ccp_model_name in ccp_models:
    # get fitted data and transformed matrix
    ccp_data = ccp_models[ccp_model_name][0].fit(ccp_models[ccp_model_name][1])

    # extract the features to a simple list
    feature_names = ccp_models[ccp_model_name][2].get_feature_names()

    # how many words do we want to extract for each topic?
    n_words = 100
    
    print(ccp_model_name+":\n")
    # now produce topics
    for topic_idx, topic in enumerate(ccp_models[ccp_model_name][0].components_):
        print("Topic #%d:" % topic_idx)
        for i in topic.argsort()[:-n_words - 1:-1]:
            print("{0} ".format(feature_names[i]),end="")
        print("\n")

henry:

Topic #0:
he his king my have was you on should has him had these things book henry thomist us against when scripture will one let who words your would luther does defender most how god christ lies argument man were more than here we now great can himself says world time papacy church therefore me after its sacraments first grace very whole water out where own concerning set truth into being an see good there like royal over england place come believe lord matter saint mass ye those part once think sacred up worthy reader thomas write am why foolish unto 

Topic #1:
have he faith me will men who god christ has we can say nothing thomist church bread only even my there king one make therefore sacrament work right words was been word thing body must nor does should were against him these mass now scripture man henry since good an more lord use authority his paul than alone us when on scriptures articles prove let into here own those see could concerning how sacrifice saying way p