In [2]:
#!/usr/bin/env python
# coding: utf-8

import os, sys
from glob import glob

# core nltk
import nltk
from nltk.tokenize import word_tokenize

# gensim magic
import gensim
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [3]:
def preprocess(raw_text):
    # tokenize
    tokens = word_tokenize(raw_text)    

    # drop to lowercase
    tokens = [word.lower() for word in tokens]
        
    # *step two* (default): remove non-alpha characters,
    # punctuation, and as many other "noise" elements as
    # possible. If dealing with a single character word,    
    # drop non-alphabetical characters. This will remove 
    # most punctuation but preserve many words containing
    # marks such as the '-' in 'self-emancipated'

    tmp_text=list()

    for word in tokens:
        if len(word) == 1:
            if word.isalpha == True:
                tmp_text.append(word)
        else:
             tmp_text.append(word)           
    tokens = tmp_text

    # now remove leading and trailing quotation marks,      
    # hyphens and  dashes
    tmp_text=list()
    drop_list = ['“','"','”','-','—']
    for i, word in enumerate(tokens):
        if word[0] in drop_list:
            word = word[1:]
        if word[-1:] in drop_list:
            word = word[:-1]
        
        word = word.replace("gyftis", "gifts")
        word = word.replace("gether", "gather")
        word = word.replace("spirituall", "spiritual")
        word = word.replace("feythfull", "faith")
        word = word.replace("wytnes", "witness")
        word = word.replace("almes", "alms")
        word = word.replace("desyre", "desire")
        word = word.replace("selfe", "self")
        word = word.replace("saffely", "safely")
        word = word.replace("realme", "realm")
        word = word.replace("acte", "act")
        word = word.replace("fourme", "form")
        word = word.replace("subiectes", "subjects")
        word = word.replace("theyr", "their")
        word = word.replace("kynde", "kind")
        word = word.replace("kynge", "king")
        word = word.replace("kyndes", "kinds")
        word = word.replace("vpon", "unto")
        word = word.replace("purueyours", "purveyors")
        word = word.replace("highnes", "highness")
        word = word.replace("euery", "every")
        word = word.replace("quene", "queen")
        word = word.replace("quenes", "queens")
        word = word.replace("whiche", "which")
        word = word.replace("bloude", "blood")
        word = word.replace("soueraine", "sovereign")
        
        # catch any zero-length words remaining
        if len(word) > 0:
            tmp_text.append(word)
        
    return(tmp_text)

In [4]:
def train_vectors(sentences):
    # source documents
    # dimension of feature vectors 
    # max distance   
    # number of times a word must appear to be included in vocab
    # for parallelization

    print("starting training...")
    model = gensim.models.Word2Vec(
        sentences, 
        sg=0,           # sg=1 is use skip-gram, sg=0 is cbow 
        size=200,        
        window=15,     
        min_count=2,    # increase to limit vocab and find fewer rare words
        workers=10,     
        iter=10)
    return(model)

In [5]:
# henry_VII 1486 - 1509
#  ----------------------
# henry_VII 1510 - 1547
# edward_VI 1548 - 1553
# mary_I 1554 - 1558
# elizabeth_I 1559 - 1603
# james_I - 1604 - 1625
# charles_I - 1626 - 1649
#  ----------------------
# oliver_cromwell - 1650 - 1660
# charles_II > 1660

In [6]:
from glob import glob

doc_crown = dict()
for x in sorted(glob("../texts/eebo/eebo-year*")):
    year = os.path.basename(x).split('.')[0]
    year = int(year.split('-')[2])
    if year > 1510 and year < 1547:
        doc_crown[x] = "henry_VII"
    if year > 1548 and year < 1553:
        doc_crown[x] = "edward_VI"   
    if year > 1554 and year < 1558:
        doc_crown[x] = "mary_I"   
    if year > 1559 and year < 1603:
        doc_crown[x] = "elizabeth_I"   
    if year > 1604 and year < 1625:
        doc_crown[x] = "james_I"         
    if year > 1626:
        doc_crown[x] = "charles_I"  

In [7]:
# begin constructing model
import gzip
import gc 

# create dict holder for models
eebo_models = dict()

# iterate through the sovereigns
for crown in set(doc_crown.values()):
    tokens = list()
    print("starting: {0}".format(crown))
    
    # extract all files for this crown
    for k, v in doc_crown.items():
        if v == crown:
            print("loading gzipped texts...")
            raw_text = gzip.open(k,'rt').read()
            
            print("preprocessing...")
            tokens = tokens + preprocess(raw_text)

            # free memory
            del raw_text
            gc.collect()

    # simulate documents
    print("segmenting...")
    sample_sentences = list()
    segment_length = int(len(tokens)/1000)
    
    for j in range(1000):
        segment = tokens[segment_length*j:segment_length*(j+1)]
        sample_sentences.append(segment)
        
    # free up memory
    del tokens
    gc.collect()

    # train model
    eebo_models[crown] = train_vectors(sample_sentences)
    
    # save model
    print("saving output")
    fp = open("../models/eebo-" + crown + ".w2v",'wb')
    eebo_models[crown].wv.save(fp)

starting: elizabeth_I
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzipped texts...
preprocessing...
loading gzip