In [5]:
#!/usr/bin/env python
# coding: utf-8

import os, sys
from glob import glob

# core nltk
import nltk
from nltk.tokenize import word_tokenize

# gensim magic
import gensim
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [6]:
# Early-modern spellings and the modern form substituted for each.
# Entries are matched against WHOLE tokens: substring replacement would
# corrupt ordinary words ("together" -> "togather", "highness" ->
# "highnesss", "character" -> "charactr") and would make the plural
# entries ("kyndes", "quenes") unreachable.
_SPELLING_FIXES = {
    "gyftis": "gifts",
    "gether": "gather",
    "spirituall": "spiritual",
    # NOTE(review): "faithful" may have been intended here -- confirm.
    "feythfull": "faith",
    "wytnes": "witness",
    "almes": "alms",
    "desyre": "desire",
    "selfe": "self",
    "saffely": "safely",
    "realme": "realm",
    "acte": "act",
    "fourme": "form",
    "subiectes": "subjects",
    "theyr": "their",
    "kynde": "kind",
    "kynge": "king",
    "kyndes": "kinds",
    # "vpon" is the u/v-interchanged spelling of "upon" (it was mapped to "unto").
    "vpon": "upon",
    "purueyours": "purveyors",
    "highnes": "highness",
    "euery": "every",
    "quene": "queen",
    "quenes": "queens",
    "whiche": "which",
    "bloude": "blood",
    "soueraine": "sovereign",
}

# Characters removed when they appear as the first or last character of a token.
_EDGE_CHARS = frozenset(['“', '"', '”', '-', '—'])


def preprocess(raw_text):
    """Tokenize ``raw_text`` and normalise the tokens for model training.

    Processing steps, in order:

    1. Tokenize with nltk's ``word_tokenize`` and lowercase every token.
    2. Drop one-character tokens that are not alphabetic (most punctuation
       and digits); one-letter words such as "a" and "i" are kept.
       Longer tokens are kept as they are, so words with internal marks
       (the '-' in 'self-emancipated') survive.
    3. Strip at most one leading and one trailing quotation mark, hyphen
       or dash from each token.
    4. Replace tokens that exactly match a known early-modern spelling
       with the modern form listed in ``_SPELLING_FIXES``.
    5. Discard tokens that ended up empty.

    Parameters
    ----------
    raw_text : str
        The text to process.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    cleaned = []

    for word in word_tokenize(raw_text):
        word = word.lower()

        # isalpha must be *called*; comparing the bound method itself to
        # True is always False and would discard every one-letter word.
        if len(word) == 1 and not word.isalpha():
            continue

        # Slicing (rather than indexing) keeps this safe for empty tokens.
        if word[:1] in _EDGE_CHARS:
            word = word[1:]
        if word[-1:] in _EDGE_CHARS:
            word = word[:-1]

        word = _SPELLING_FIXES.get(word, word)

        # catch any zero-length words remaining
        if word:
            cleaned.append(word)

    return cleaned

In [7]:
def train_vectors(sentences, sg=0, size=200, window=15, min_count=2,
                  workers=10, epochs=10):
    """Train a gensim Word2Vec model on ``sentences`` and return it.

    Parameters
    ----------
    sentences : iterable of list of str
        Source documents, each one a list of tokens.
    sg : int
        1 selects skip-gram, 0 selects CBOW.
    size : int
        Dimension of the feature vectors.
    window : int
        Max distance between a word and the context words considered.
    min_count : int
        Number of times a word must appear to be included in the vocab;
        increase to limit the vocab and find fewer rare words.
    workers : int
        Number of worker threads, for parallelization.
    epochs : int
        Number of training passes over ``sentences``.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    print("starting training...")

    # gensim 4 renamed two constructor keywords (size -> vector_size,
    # iter -> epochs); passing the old names there raises TypeError, and
    # the new names are unknown to gensim 3.x. Pick the pair that matches
    # the installed release.
    if int(gensim.__version__.split(".")[0]) >= 4:
        renamed = {"vector_size": size, "epochs": epochs}
    else:
        renamed = {"size": size, "iter": epochs}

    return gensim.models.Word2Vec(
        sentences,
        sg=sg,
        window=window,
        min_count=min_count,
        workers=workers,
        **renamed)

In [8]:
# begin constructing model
import gzip
import gc 

# Number of pseudo-documents each decade's token stream is cut into.
N_SEGMENTS = 1000

# Trained models keyed by decade name (e.g. "eebo-1520-1529").
eebo_models = dict()
input_data = ["../texts/eebo/eebo-1520-1529.txt.gz",
              "../texts/eebo/eebo-1530-1539.txt.gz",
              "../texts/eebo/eebo-1540-1549.txt.gz",
              "../texts/eebo/eebo-1550-1559.txt.gz",
              "../texts/eebo/eebo-1560-1569.txt.gz",
              "../texts/eebo/eebo-1570-1579.txt.gz",
              "../texts/eebo/eebo-1580-1589.txt.gz",
              "../texts/eebo/eebo-1590-1599.txt.gz",
              "../texts/eebo/eebo-1600-1609.txt.gz",
              "../texts/eebo/eebo-1610-1619.txt.gz",
              "../texts/eebo/eebo-1620-1629.txt.gz",
              "../texts/eebo/eebo-1630-1639.txt.gz"]

# Create the output directory up front so a missing folder is not only
# discovered after a full training run.
os.makedirs("../models", exist_ok=True)

# To process every archive on disk instead of the explicit list, iterate
# over glob("../texts/eebo/eebo-*.gz").
for fp in input_data:

    # "eebo-1520-1529.txt.gz" -> "eebo-1520-1529"
    model_name = os.path.basename(fp).split(".")[0]
    print("starting: {0}".format(model_name))

    print("loading gzipped texts...")
    # NOTE(review): text is decoded with the platform default encoding;
    # pass encoding="utf-8" if the archives are known to be UTF-8.
    with gzip.open(fp, 'rt') as source:
        raw_text = source.read()

    print("preprocessing...")
    tokens = preprocess(raw_text)

    # free up memory
    del raw_text
    gc.collect()

    # simulate documents: cut the token stream into N_SEGMENTS contiguous,
    # near-equal slices. Integer boundaries cover every token (no dropped
    # remainder); slices that would be empty for a very short text are
    # skipped.
    # NOTE(review): gensim's optimized Word2Vec training is reported to
    # clip any single sentence at 10,000 tokens; a decade with more than
    # about 10M tokens would exceed that per segment -- confirm against
    # the documentation of the installed gensim version.
    print("segmenting...")
    total = len(tokens)
    bounds = [total * j // N_SEGMENTS for j in range(N_SEGMENTS + 1)]
    sample_sentences = [tokens[lo:hi]
                        for lo, hi in zip(bounds, bounds[1:])
                        if hi > lo]

    # train model
    eebo_models[model_name] = train_vectors(sample_sentences)

    # save model
    # The vectors are written through an open handle, as before; the
    # `with` block guarantees the file is flushed and closed, and the
    # loop variable `fp` is no longer rebound to the handle.
    print("saving output")
    with open("../models/" + model_name + ".w2v", 'wb') as out:
        eebo_models[model_name].wv.save(out)
    

starting: eebo-1520-1529
loading gzipped texts...
preprocessing...
segmenting...
starting training...
saving output
starting: eebo-1530-1539
loading gzipped texts...
preprocessing...
segmenting...
starting training...
saving output
starting: eebo-1540-1549
loading gzipped texts...
preprocessing...
segmenting...
starting training...
saving output
starting: eebo-1550-1559
loading gzipped texts...
preprocessing...
segmenting...
starting training...
saving output
starting: eebo-1560-1569
loading gzipped texts...
preprocessing...
segmenting...
starting training...
saving output
starting: eebo-1570-1579
loading gzipped texts...
preprocessing...
segmenting...
starting training...
saving output
starting: eebo-1580-1589
loading gzipped texts...
preprocessing...
segmenting...
starting training...
saving output
starting: eebo-1590-1599
loading gzipped texts...
preprocessing...
segmenting...
starting training...
saving output
starting: eebo-1600-1609
loading gzipped texts...
preprocessing...
segme