In [None]:
import os, sys
import nltk
import sklearn
from lxml import etree

In [None]:
# define period boundaries (publication years); the search cell compares
# with strict inequalities (date_min < year < date_max), so the two
# boundary years themselves are excluded
date_min = 1520
date_max = 1640

In [None]:
# Search the EEBO-TCP texts for English-language texts published strictly
# between date_min and date_max. Two parallel lists are built:
#   metadata -- one [publication year (int), source filename] pair per kept text
#   all_text -- every text node under the first TEI <text> element of that
#               file, joined with single spaces
metadata = list()
all_text = list()

# os.listdir() does not expand "~", so resolve the home directory explicitly
eebo_root = os.path.expanduser("~/eebo-tcp/eebo-tcp-A/")
tei_ns = "{http://www.tei-c.org/ns/1.0}"

# sorted() makes the order of metadata/all_text the same on every run
# (os.listdir() returns entries in arbitrary order)
for text in sorted(os.listdir(eebo_root)):
    path = os.path.join(eebo_root, text)
    xml_object = etree.parse(path)

    # the first <date> element is taken as the publication date;
    # skip files that have no <date> element or an empty one
    date_elements = xml_object.findall(".//" + tei_ns + "date")
    if not date_elements:
        continue
    date_text = date_elements[0].xpath("text()")
    if not date_text:
        continue
    pubdate = date_text[0]

    # keep only four-character dates that convert cleanly to int
    if len(pubdate) != 4:
        continue
    try:
        pubdate = int(pubdate)
    except ValueError:
        continue

    # period check (both boundary years are excluded)
    if not (date_min < pubdate < date_max):
        continue

    # check for language and keep only English
    language_elements = xml_object.findall(".//" + tei_ns + "language")
    if not language_elements:
        continue
    language_text = language_elements[0].xpath("text()")
    if not language_text or language_text[0] != "eng":
        continue

    # locate the body text before recording anything, so that metadata
    # and all_text always stay the same length
    text_elements = xml_object.findall(".//" + tei_ns + "text")
    if not text_elements:
        continue

    # add to metadata and to the text archive
    metadata.append([pubdate, text])
    all_text.append(' '.join(text_elements[0].xpath(".//text()")))

In [None]:
# Now save text and metadata. Two pickle records are written back to back
# into one file -- metadata first, then all_text -- so reading them back
# takes two pickle.load() calls on the same handle, in that order.
import pickle

# the with-block closes the file, which guarantees the data is flushed to disk
with open('eebo-tcp-a_engl_data.pkl', 'wb') as fp:
    pickle.dump(metadata, fp)
    pickle.dump(all_text, fp)

In [None]:
# Now get ready for the model: tokenize every raw text with NLTK and
# normalise the tokens. The result, eebo_documents, is a list holding one
# list of cleaned tokens per entry of all_text.
from nltk.tokenize import word_tokenize

# characters removed when they appear as the first or last character of a token
drop_list = ['“', '"', '”', '-', '—']


def _clean_tokens(tokens):
    """Return a lowercased, de-noised copy of a list of tokens.

    Each token goes through three steps:

    1. It is lowercased.
    2. A single-character token is kept only if it is alphabetical. This
       removes most punctuation while preserving one-letter words and
       longer words containing marks such as the '-' in 'self-emancipated'.
    3. At most one leading and one trailing character found in drop_list
       (quotation marks, hyphens and dashes) is removed; tokens left
       empty by this are discarded.
    """
    cleaned = list()
    for word in tokens:
        word = word.lower()

        # nothing to clean in an empty token
        if not word:
            continue

        # isalpha must be *called*: comparing the bound method itself to
        # True is always False and would discard every one-letter word
        if len(word) == 1 and not word.isalpha():
            continue

        # now remove a leading and a trailing quotation mark, hyphen or dash
        if word[0] in drop_list:
            word = word[1:]
        if word[-1:] in drop_list:
            word = word[:-1]

        # catch any zero-length words remaining
        if len(word) > 0:
            cleaned.append(word)
    return cleaned


eebo_documents = list()
for raw_text in all_text:
    eebo_documents.append(_clean_tokens(word_tokenize(raw_text)))

In [None]:
# save a little memory: the raw text strings have already been tokenized
# into eebo_documents, so drop the list that holds them. After this cell
# the name all_text no longer exists.
del all_text

In [None]:
import gensim

# gensim 4.0 renamed two Word2Vec keyword arguments (size -> vector_size,
# iter -> epochs); pick the spelling the installed version understands so
# the cell runs on either side of that change.
if int(gensim.__version__.split(".")[0]) >= 4:
    dimension_key, epochs_key = "vector_size", "epochs"
else:
    dimension_key, epochs_key = "size", "iter"

word2vec_params = {
    "sg": 1,             # sg=1 is use skip-gram, sg=0 is cbow
    dimension_key: 200,  # dimension of feature vectors
    "window": 10,        # max distance between the current and a context word
    "min_count": 2,      # number of times a word must appear to be included in vocab
    "workers": 10,       # worker threads, for parallelization
    epochs_key: 30,      # number of passes over the corpus
}

# source documents: one list of tokens per text
eebo_model = gensim.models.Word2Vec(eebo_documents, **word2vec_params)

In [None]:
# Save the trained vectors in binary word2vec format. The exporter lives on
# the model's keyed vectors (eebo_model.wv); it is given the output path and
# opens and closes the file itself, so no handle is left open here.
eebo_model.wv.save_word2vec_format("eebo-vectors-english.w2v", binary=True)