In [3]:
import gensim,logging
import os
import sys
from os import listdir,makedirs
from os.path import isfile, join,exists
import json
import spacy
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import defaultdict,Counter
from nltk.corpus import stopwords
import string
import nltk
import numpy as np
import codecs
import random



In [15]:
# Load the full COFEA dataset, restricted to the Evans source.
cofea_dir = join('data', "cofea_full")
# Regular files in the dataset directory whose name contains ".json".
cofea_files = [
    name for name in listdir(cofea_dir)
    if isfile(join(cofea_dir, name)) and ".json" in name
]
cofea_full = []
for name in cofea_files:
    # Change the filename tested here to load a different source.
    if name != 'evans_output.json':
        continue
    with open(cofea_dir + "/" + name, encoding='utf-8') as handle:
        # Each loaded JSON payload becomes one entry of cofea_full.
        cofea_full.append(json.load(handle))

In [5]:
# Load spaCy's small English pipeline. In the cells visible here only its
# tokenizer is used (by clean_doc, to split sentences into tokens).
nlp = spacy.load("en_core_web_sm")
tokenizer = nlp.tokenizer
# NLTK's English stop-word list.
# NOTE(review): `stop` is not referenced by any cell visible here - confirm it is still needed.
stop = stopwords.words('english')

In [16]:
# Configuration for the current ("new") time split.
# Earlier split, kept for reference:
# time_periods = [1640,1740,1840,2010]
# Inclusive [start, end] decade bounds; each pair becomes one corpus/model named "<start>_<end>".
time_range = [[1640,1720],[1730,1810]] # Sampling target: reduce 3520 docs to 1261; preferred alternative is sampling by tokens (target 21972686 tokens)
# Directory that receives the per-range corpus files and trained model files.
file_loc = join(cofea_dir,'range_data')
# TODO: generate time_range from the time periods to avoid manual errors
# Filename prefix for every output file; the range name is appended to it.
save_name = 'evans_even_sample_phrase_'

In [17]:
# Bucket document bodies by time range, keyed "<start>_<end>".
# TODO: make this work with sources too
docs = defaultdict(list)
for source_docs in cofea_full:
    for record in source_docs:
        # Records without a decade cannot be placed in any range.
        if 'decade' not in record.keys():
            continue
        for first_decade, last_decade in time_range:
            # Both bounds are inclusive.
            in_range = first_decade <= int(record['decade']) <= last_decade
            if in_range:
                bucket = '{}_{}'.format(first_decade, last_decade)
                docs[bucket].append(record['body'])

In [8]:
def sample_data(documents,sample_size,sample_tokens=False):
    """Draw a reproducible random sample from ``documents``.

    Parameters
    ----------
    documents : list
        Documents to sample from. The list itself is never modified.
        In token mode each document is assumed to be a ``str``.
    sample_size : int
        Number of documents to draw or, when ``sample_tokens`` is true, the
        number of tokens the sample has to cover.
    sample_tokens : bool, optional
        False (default): return ``sample_size`` documents drawn without
        replacement.
        True: shuffle the documents and keep taking them, in shuffled order,
        until their combined token count reaches ``sample_size``. Tokens are
        counted by whitespace splitting, which only approximates the count a
        linguistic tokenizer would give. If all documents together hold fewer
        tokens than requested, every document is returned.

    Returns
    -------
    list
        The sampled documents.

    Raises
    ------
    ValueError
        In document mode, if ``sample_size`` is negative or larger than
        ``len(documents)`` (raised by ``random.sample``).
    """
    # Re-seed the global generator on every call so that the same input
    # always yields the same sample.
    random.seed(42)
    if sample_tokens:
        # Shuffle a copy so the caller's list keeps its order.
        shuffled = list(documents)
        random.shuffle(shuffled)
        sample = []
        token_total = 0
        for doc in shuffled:
            if token_total >= sample_size:
                break
            sample.append(doc)
            token_total += len(doc.split())
    else:
        # sample docs
        sample = random.sample(documents,k=sample_size)
    return sample

In [9]:
# Sample the data sets as needed: reduce the 1730-1810 bucket to 1261 documents
# (sample_data uses a fixed seed, so the selection is repeatable).
# NOTE: this overwrites docs['1730_1810'] in place and must run AFTER the cell
# that builds `docs`; re-running that cell restores the full, unsampled list.
# NOTE(review): the saved execution counts show this cell ran (In [9]) before
# `docs` was last rebuilt (In [17]), and the logged 1730_1810 corpus is several
# times larger than the 1640_1720 one - verify the sampling was in effect for the saved models.
docs['1730_1810'] = sample_data(docs['1730_1810'],1261)

In [18]:
# Multi-word phrases of interest. clean_doc(..., p_check=True) removes the spaces
# inside each occurrence (e.g. 'bear arms' -> 'beararms') so the phrase is
# written to the corpus as a single token.
phrases = ['necessary and proper','cruel and unusual','bear arms','natural born','public use']

In [19]:
def clean_doc(doc,p_check=False):
    """Split a document into normalised, lower-cased sentences.

    Parameters
    ----------
    doc : str
        Raw document text.
    p_check : bool, optional
        When true, every occurrence of an entry of the module-level
        ``phrases`` list has its internal spaces removed, so the phrase
        survives tokenisation as a single token. Matching ignores case,
        because the returned text is lower-cased anyway; a case-sensitive
        match would miss capitalised occurrences such as "Bear Arms".

    Returns
    -------
    list of str
        One string per sentence (as split by ``sent_tokenize``), with the
        tokens produced by ``tokenizer`` joined by single spaces and
        lower-cased.
    """
    # Collapse every run of whitespace (newlines and tabs included) into one space.
    doc = re.sub(r'\s+', " ", doc.strip())
    if p_check:
        for p in phrases:
            # Drop the spaces inside each match but keep its original casing,
            # so sentence splitting still sees the text as written.
            doc = re.sub(
                re.escape(p),
                lambda match: match.group(0).replace(' ', ''),
                doc,
                flags=re.IGNORECASE,
            )
    sents = sent_tokenize(doc)
    return [' '.join(tok.text for tok in tokenizer(sent)).lower() for sent in sents]

In [20]:
# Write one corpus file per time range, one cleaned sentence per line.
# open() does not create missing directories, so make sure the target exists.
makedirs(file_loc, exist_ok=True)
for range_name in docs: 
    fname = save_name +range_name+'.txt' #change name based on source
    # Save to text file so it can be efficiently used by word2vec
    with open(join(file_loc,fname), 'w',encoding = 'utf-8') as filehandle:
        for d in docs[range_name]:
            # Clean each document and write its sentences straight away,
            # instead of first collecting every sentence of the range in memory.
            for s in clean_doc(d,True):
                filehandle.write('%s\n' % s)

In [21]:
def run_and_save(fname):
    """Train a Word2Vec model on ``fname + '.txt'`` and export its vectors.

    The corpus file is read with gensim's ``LineSentence``. After training,
    the vectors are written in word2vec text format to ``fname + '.tmp'``,
    parsed back, and saved as two files:

    * ``fname + '.vocab'``  - one word per line, in the same order as the rows
      of the vector matrix;
    * ``fname + '.wv.npy'`` - float64 array of shape (vocab_size, embed_dim).

    The temporary ``.tmp`` file is removed once both files are written.

    Parameters
    ----------
    fname : str
        Path prefix (without extension) shared by the corpus and all outputs.

    Raises
    ------
    AssertionError
        If the exported text file does not match its own header (wrong
        header length, wrong vector length or wrong number of rows).
    """
    corpus_path = fname + '.txt'
    tmp_path = fname + '.tmp'

    #train model (sg=1: skip-gram; hs=0 with negative=5: negative sampling)
    model = gensim.models.Word2Vec( alpha=0.025, window=4,vector_size=300, min_count=5, workers=12, sg=1, hs=0, negative=5)
    model.build_vocab(gensim.models.word2vec.LineSentence(corpus_path))
    model.train(gensim.models.word2vec.LineSentence(corpus_path), total_examples=model.corpus_count, epochs=5)
    model.wv.save_word2vec_format(tmp_path)

    # save .wv.npy and .vocab
    vec = []
    vocab_size, embed_dim = None, None
    # Both handles are context-managed so they are closed even if an assertion fails.
    with codecs.open(tmp_path, 'r', encoding='utf-8', errors='ignore') as r, \
            codecs.open(fname + '.vocab', 'w', encoding='utf-8') as w:
        for line in r:
            items = line.strip().split()
            # Test against None rather than truthiness, so a header announcing
            # zero words is still recognised as the header.
            if vocab_size is None:
                assert len(items) == 2, 'expected "<vocab_size> <embed_dim>" header'
                vocab_size, embed_dim = int(items[0]), int(items[1])
            else:
                assert len(items) == embed_dim + 1, 'unexpected number of vector components'
                vec.append([float(item) for item in items[1:]])
                w.write('%s\n'%items[0])
    # np.float was only an alias of the builtin float (i.e. float64) and has
    # been removed from NumPy; name the dtype explicitly.
    vec = np.array(vec, dtype=np.float64)
    assert(vec.shape[0] == vocab_size)
    assert(vec.shape[1] == embed_dim)
    np.save(fname + '.wv.npy', vec)
    print('saved %s.wv.npy'%fname)
    print('saved %s.vocab'%fname)
    os.remove(tmp_path)
    

In [22]:
# Train and export one Word2Vec model per time range.
# Background on the gensim workflow: https://rare-technologies.com/word2vec-tutorial/

# Show gensim's INFO-level progress messages, with a timestamp on each line.
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')
# TODO: update to use a different split based on authors
for start, end in time_range:
    range_name = '{}_{}'.format(start, end)
    print(range_name)
    # Path prefix shared by the corpus file and the model outputs
    # (change the name based on source).
    fname = join(file_loc, save_name + range_name)
    run_and_save(fname)

2021-09-02 12:22:21,686 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.025)', 'datetime': '2021-09-02T12:22:21.685688', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}
2021-09-02 12:22:21,688 : INFO : collecting all words and their counts
2021-09-02 12:22:21,693 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-09-02 12:22:21,864 : INFO : PROGRESS: at sentence #10000, processed 252683 words, keeping 13822 word types


1640_1720


2021-09-02 12:22:21,958 : INFO : PROGRESS: at sentence #20000, processed 427197 words, keeping 19550 word types
2021-09-02 12:22:22,080 : INFO : PROGRESS: at sentence #30000, processed 673584 words, keeping 23991 word types
2021-09-02 12:22:22,149 : INFO : PROGRESS: at sentence #40000, processed 757365 words, keeping 25111 word types
2021-09-02 12:22:22,233 : INFO : PROGRESS: at sentence #50000, processed 908625 words, keeping 27263 word types
2021-09-02 12:22:22,369 : INFO : PROGRESS: at sentence #60000, processed 1222916 words, keeping 33955 word types
2021-09-02 12:22:22,496 : INFO : PROGRESS: at sentence #70000, processed 1518657 words, keeping 37269 word types
2021-09-02 12:22:22,618 : INFO : PROGRESS: at sentence #80000, processed 1817130 words, keeping 41204 word types
2021-09-02 12:22:22,744 : INFO : PROGRESS: at sentence #90000, processed 2093807 words, keeping 45206 word types
2021-09-02 12:22:22,908 : INFO : PROGRESS: at sentence #100000, processed 2526216 words, keeping 522

2021-09-02 12:22:30,273 : INFO : PROGRESS: at sentence #740000, processed 19649138 words, keeping 206722 word types
2021-09-02 12:22:30,420 : INFO : PROGRESS: at sentence #750000, processed 19997640 words, keeping 208799 word types
2021-09-02 12:22:30,561 : INFO : PROGRESS: at sentence #760000, processed 20347229 words, keeping 210637 word types
2021-09-02 12:22:30,683 : INFO : PROGRESS: at sentence #770000, processed 20635243 words, keeping 212303 word types
2021-09-02 12:22:30,806 : INFO : PROGRESS: at sentence #780000, processed 20932090 words, keeping 213909 word types
2021-09-02 12:22:30,930 : INFO : PROGRESS: at sentence #790000, processed 21234379 words, keeping 215891 word types
2021-09-02 12:22:31,032 : INFO : PROGRESS: at sentence #800000, processed 21451221 words, keeping 216900 word types
2021-09-02 12:22:31,115 : INFO : collected 217350 word types from a corpus of 21613202 raw words and 805687 sentences
2021-09-02 12:22:31,116 : INFO : Creating a fresh vocabulary
2021-09-0

2021-09-02 12:23:20,538 : INFO : EPOCH 1 - PROGRESS: at 73.97% examples, 217909 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:23:21,545 : INFO : EPOCH 1 - PROGRESS: at 75.37% examples, 218017 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:23:22,585 : INFO : EPOCH 1 - PROGRESS: at 76.88% examples, 218298 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:23:23,671 : INFO : EPOCH 1 - PROGRESS: at 78.28% examples, 218359 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:23:24,744 : INFO : EPOCH 1 - PROGRESS: at 79.91% examples, 218556 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:23:25,750 : INFO : EPOCH 1 - PROGRESS: at 82.11% examples, 218772 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:23:26,793 : INFO : EPOCH 1 - PROGRESS: at 84.23% examples, 219018 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:23:27,798 : INFO : EPOCH 1 - PROGRESS: at 85.95% examples, 219168 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:23:28,807 : INFO : EPOCH 1 - PROGRESS: at 87.62% examples, 219347 words/s,

2021-09-02 12:24:25,805 : INFO : EPOCH 2 - PROGRESS: at 72.00% examples, 215597 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:24:26,808 : INFO : EPOCH 2 - PROGRESS: at 73.43% examples, 215405 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:24:27,939 : INFO : EPOCH 2 - PROGRESS: at 74.98% examples, 215280 words/s, in_qsize 20, out_qsize 4
2021-09-02 12:24:28,940 : INFO : EPOCH 2 - PROGRESS: at 76.53% examples, 215782 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:24:29,960 : INFO : EPOCH 2 - PROGRESS: at 77.96% examples, 216184 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:24:31,018 : INFO : EPOCH 2 - PROGRESS: at 79.47% examples, 216528 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:24:32,023 : INFO : EPOCH 2 - PROGRESS: at 81.70% examples, 216896 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:24:33,099 : INFO : EPOCH 2 - PROGRESS: at 83.89% examples, 216952 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:24:34,197 : INFO : EPOCH 2 - PROGRESS: at 85.38% examples, 216332 words/s,

2021-09-02 12:25:31,340 : INFO : EPOCH 3 - PROGRESS: at 73.17% examples, 222340 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:25:32,411 : INFO : EPOCH 3 - PROGRESS: at 74.73% examples, 222343 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:25:33,463 : INFO : EPOCH 3 - PROGRESS: at 76.29% examples, 222475 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:25:34,486 : INFO : EPOCH 3 - PROGRESS: at 77.79% examples, 222876 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:25:35,487 : INFO : EPOCH 3 - PROGRESS: at 78.86% examples, 222434 words/s, in_qsize 22, out_qsize 0
2021-09-02 12:25:36,505 : INFO : EPOCH 3 - PROGRESS: at 80.72% examples, 222320 words/s, in_qsize 23, out_qsize 1
2021-09-02 12:25:37,541 : INFO : EPOCH 3 - PROGRESS: at 83.13% examples, 222682 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:25:38,545 : INFO : EPOCH 3 - PROGRESS: at 84.92% examples, 222775 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:25:39,608 : INFO : EPOCH 3 - PROGRESS: at 86.68% examples, 222813 words/s,

2021-09-02 12:26:36,198 : INFO : EPOCH 4 - PROGRESS: at 71.91% examples, 217394 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:26:37,235 : INFO : EPOCH 4 - PROGRESS: at 73.47% examples, 217436 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:26:38,270 : INFO : EPOCH 4 - PROGRESS: at 74.97% examples, 217550 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:26:39,319 : INFO : EPOCH 4 - PROGRESS: at 76.54% examples, 217803 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:26:40,322 : INFO : EPOCH 4 - PROGRESS: at 77.91% examples, 218116 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:26:41,352 : INFO : EPOCH 4 - PROGRESS: at 79.24% examples, 218158 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:26:42,355 : INFO : EPOCH 4 - PROGRESS: at 81.52% examples, 218634 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:26:43,419 : INFO : EPOCH 4 - PROGRESS: at 83.74% examples, 218700 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:26:44,428 : INFO : EPOCH 4 - PROGRESS: at 85.47% examples, 218958 words/s,

2021-09-02 12:27:40,728 : INFO : EPOCH 5 - PROGRESS: at 71.30% examples, 221601 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:27:41,744 : INFO : EPOCH 5 - PROGRESS: at 73.05% examples, 221789 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:27:42,761 : INFO : EPOCH 5 - PROGRESS: at 74.57% examples, 221801 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:27:43,836 : INFO : EPOCH 5 - PROGRESS: at 76.13% examples, 221813 words/s, in_qsize 21, out_qsize 2
2021-09-02 12:27:44,857 : INFO : EPOCH 5 - PROGRESS: at 77.45% examples, 221994 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:27:45,873 : INFO : EPOCH 5 - PROGRESS: at 78.63% examples, 221487 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:27:46,910 : INFO : EPOCH 5 - PROGRESS: at 79.96% examples, 220615 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:27:47,961 : INFO : EPOCH 5 - PROGRESS: at 82.17% examples, 220600 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:27:48,980 : INFO : EPOCH 5 - PROGRESS: at 84.18% examples, 220685 words/s,

saved data/cofea_full/range_data/evans_even_sample_phrase_1640_1720.wv.npy
saved data/cofea_full/range_data/evans_even_sample_phrase_1640_1720.vocab
1730_1810


2021-09-02 12:28:18,272 : INFO : PROGRESS: at sentence #20000, processed 599659 words, keeping 21724 word types
2021-09-02 12:28:18,379 : INFO : PROGRESS: at sentence #30000, processed 895671 words, keeping 27530 word types
2021-09-02 12:28:18,490 : INFO : PROGRESS: at sentence #40000, processed 1228642 words, keeping 32971 word types
2021-09-02 12:28:18,590 : INFO : PROGRESS: at sentence #50000, processed 1502378 words, keeping 36690 word types
2021-09-02 12:28:18,699 : INFO : PROGRESS: at sentence #60000, processed 1833711 words, keeping 45249 word types
2021-09-02 12:28:18,811 : INFO : PROGRESS: at sentence #70000, processed 2158556 words, keeping 47614 word types
2021-09-02 12:28:18,909 : INFO : PROGRESS: at sentence #80000, processed 2468825 words, keeping 50793 word types
2021-09-02 12:28:19,028 : INFO : PROGRESS: at sentence #90000, processed 2819609 words, keeping 54004 word types
2021-09-02 12:28:19,140 : INFO : PROGRESS: at sentence #100000, processed 3111894 words, keeping 5

2021-09-02 12:28:26,656 : INFO : PROGRESS: at sentence #740000, processed 24536391 words, keeping 198349 word types
2021-09-02 12:28:26,776 : INFO : PROGRESS: at sentence #750000, processed 24870777 words, keeping 200028 word types
2021-09-02 12:28:26,899 : INFO : PROGRESS: at sentence #760000, processed 25232166 words, keeping 201603 word types
2021-09-02 12:28:27,004 : INFO : PROGRESS: at sentence #770000, processed 25545757 words, keeping 202618 word types
2021-09-02 12:28:27,127 : INFO : PROGRESS: at sentence #780000, processed 25890026 words, keeping 203832 word types
2021-09-02 12:28:27,248 : INFO : PROGRESS: at sentence #790000, processed 26236214 words, keeping 205722 word types
2021-09-02 12:28:27,354 : INFO : PROGRESS: at sentence #800000, processed 26554522 words, keeping 207388 word types
2021-09-02 12:28:27,459 : INFO : PROGRESS: at sentence #810000, processed 26840821 words, keeping 208950 word types
2021-09-02 12:28:27,585 : INFO : PROGRESS: at sentence #820000, processe

2021-09-02 12:28:35,252 : INFO : PROGRESS: at sentence #1450000, processed 47696766 words, keeping 316200 word types
2021-09-02 12:28:35,411 : INFO : PROGRESS: at sentence #1460000, processed 48058159 words, keeping 318272 word types
2021-09-02 12:28:35,638 : INFO : PROGRESS: at sentence #1470000, processed 48395260 words, keeping 320471 word types
2021-09-02 12:28:35,882 : INFO : PROGRESS: at sentence #1480000, processed 48712917 words, keeping 321883 word types
2021-09-02 12:28:36,067 : INFO : PROGRESS: at sentence #1490000, processed 49094147 words, keeping 323397 word types
2021-09-02 12:28:36,195 : INFO : PROGRESS: at sentence #1500000, processed 49445104 words, keeping 324776 word types
2021-09-02 12:28:36,297 : INFO : PROGRESS: at sentence #1510000, processed 49713580 words, keeping 327100 word types
2021-09-02 12:28:36,451 : INFO : PROGRESS: at sentence #1520000, processed 50084670 words, keeping 328497 word types
2021-09-02 12:28:36,554 : INFO : PROGRESS: at sentence #1530000,

2021-09-02 12:28:43,808 : INFO : PROGRESS: at sentence #2160000, processed 69485016 words, keeping 426618 word types
2021-09-02 12:28:43,937 : INFO : PROGRESS: at sentence #2170000, processed 69761411 words, keeping 426966 word types
2021-09-02 12:28:44,079 : INFO : PROGRESS: at sentence #2180000, processed 70111775 words, keeping 428090 word types
2021-09-02 12:28:44,228 : INFO : PROGRESS: at sentence #2190000, processed 70394499 words, keeping 428953 word types
2021-09-02 12:28:44,304 : INFO : PROGRESS: at sentence #2200000, processed 70535870 words, keeping 429527 word types
2021-09-02 12:28:44,433 : INFO : PROGRESS: at sentence #2210000, processed 70900898 words, keeping 430516 word types
2021-09-02 12:28:44,570 : INFO : PROGRESS: at sentence #2220000, processed 71297200 words, keeping 432064 word types
2021-09-02 12:28:44,689 : INFO : PROGRESS: at sentence #2230000, processed 71607791 words, keeping 433319 word types
2021-09-02 12:28:44,819 : INFO : PROGRESS: at sentence #2240000,

2021-09-02 12:28:51,455 : INFO : PROGRESS: at sentence #2870000, processed 89521310 words, keeping 502658 word types
2021-09-02 12:28:51,568 : INFO : PROGRESS: at sentence #2880000, processed 89768552 words, keeping 503882 word types
2021-09-02 12:28:51,693 : INFO : PROGRESS: at sentence #2890000, processed 90092581 words, keeping 505017 word types
2021-09-02 12:28:51,785 : INFO : PROGRESS: at sentence #2900000, processed 90362342 words, keeping 505862 word types
2021-09-02 12:28:51,909 : INFO : PROGRESS: at sentence #2910000, processed 90709724 words, keeping 507081 word types
2021-09-02 12:28:52,021 : INFO : PROGRESS: at sentence #2920000, processed 91023179 words, keeping 507906 word types
2021-09-02 12:28:52,141 : INFO : PROGRESS: at sentence #2930000, processed 91335766 words, keeping 510071 word types
2021-09-02 12:28:52,273 : INFO : PROGRESS: at sentence #2940000, processed 91682072 words, keeping 511362 word types
2021-09-02 12:28:52,360 : INFO : PROGRESS: at sentence #2950000,

2021-09-02 12:29:35,209 : INFO : EPOCH 1 - PROGRESS: at 12.47% examples, 211793 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:29:36,244 : INFO : EPOCH 1 - PROGRESS: at 12.83% examples, 211740 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:29:37,269 : INFO : EPOCH 1 - PROGRESS: at 13.09% examples, 212080 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:29:38,312 : INFO : EPOCH 1 - PROGRESS: at 13.38% examples, 212104 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:29:39,318 : INFO : EPOCH 1 - PROGRESS: at 13.73% examples, 212024 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:29:40,335 : INFO : EPOCH 1 - PROGRESS: at 14.05% examples, 212151 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:29:41,347 : INFO : EPOCH 1 - PROGRESS: at 14.35% examples, 212369 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:29:42,360 : INFO : EPOCH 1 - PROGRESS: at 14.58% examples, 211946 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:29:43,453 : INFO : EPOCH 1 - PROGRESS: at 14.90% examples, 211996 words/s,

2021-09-02 12:30:50,098 : INFO : EPOCH 1 - PROGRESS: at 35.27% examples, 210625 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:30:51,138 : INFO : EPOCH 1 - PROGRESS: at 35.61% examples, 210795 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:30:52,154 : INFO : EPOCH 1 - PROGRESS: at 35.90% examples, 210674 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:30:53,267 : INFO : EPOCH 1 - PROGRESS: at 36.15% examples, 210522 words/s, in_qsize 21, out_qsize 2
2021-09-02 12:30:54,355 : INFO : EPOCH 1 - PROGRESS: at 36.48% examples, 210640 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:30:55,487 : INFO : EPOCH 1 - PROGRESS: at 36.84% examples, 210681 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:30:56,551 : INFO : EPOCH 1 - PROGRESS: at 37.23% examples, 210838 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:30:57,556 : INFO : EPOCH 1 - PROGRESS: at 37.56% examples, 210728 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:30:58,772 : INFO : EPOCH 1 - PROGRESS: at 37.98% examples, 210708 words/s,

2021-09-02 12:32:05,585 : INFO : EPOCH 1 - PROGRESS: at 59.81% examples, 209305 words/s, in_qsize 21, out_qsize 0
2021-09-02 12:32:06,674 : INFO : EPOCH 1 - PROGRESS: at 60.09% examples, 209133 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:32:07,702 : INFO : EPOCH 1 - PROGRESS: at 60.44% examples, 209273 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:32:08,742 : INFO : EPOCH 1 - PROGRESS: at 60.66% examples, 209171 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:32:09,747 : INFO : EPOCH 1 - PROGRESS: at 60.99% examples, 209279 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:32:10,801 : INFO : EPOCH 1 - PROGRESS: at 61.39% examples, 209310 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:32:11,813 : INFO : EPOCH 1 - PROGRESS: at 61.83% examples, 209316 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:32:12,905 : INFO : EPOCH 1 - PROGRESS: at 62.17% examples, 209315 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:32:13,936 : INFO : EPOCH 1 - PROGRESS: at 62.41% examples, 208987 words/s,

2021-09-02 12:33:21,076 : INFO : EPOCH 1 - PROGRESS: at 82.74% examples, 204114 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:33:22,089 : INFO : EPOCH 1 - PROGRESS: at 83.05% examples, 204128 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:33:23,167 : INFO : EPOCH 1 - PROGRESS: at 83.52% examples, 204161 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:33:24,229 : INFO : EPOCH 1 - PROGRESS: at 83.96% examples, 204227 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:33:25,259 : INFO : EPOCH 1 - PROGRESS: at 84.28% examples, 204279 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:33:26,276 : INFO : EPOCH 1 - PROGRESS: at 84.63% examples, 204312 words/s, in_qsize 23, out_qsize 1
2021-09-02 12:33:27,289 : INFO : EPOCH 1 - PROGRESS: at 85.17% examples, 204383 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:33:28,342 : INFO : EPOCH 1 - PROGRESS: at 85.50% examples, 204399 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:33:29,383 : INFO : EPOCH 1 - PROGRESS: at 85.80% examples, 204354 words/s,

2021-09-02 12:34:25,667 : INFO : EPOCH 2 - PROGRESS: at 4.78% examples, 182585 words/s, in_qsize 22, out_qsize 0
2021-09-02 12:34:26,686 : INFO : EPOCH 2 - PROGRESS: at 5.13% examples, 184169 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:34:27,762 : INFO : EPOCH 2 - PROGRESS: at 5.45% examples, 185542 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:34:28,783 : INFO : EPOCH 2 - PROGRESS: at 5.63% examples, 186415 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:34:29,812 : INFO : EPOCH 2 - PROGRESS: at 5.96% examples, 187424 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:34:30,812 : INFO : EPOCH 2 - PROGRESS: at 6.26% examples, 188864 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:34:31,895 : INFO : EPOCH 2 - PROGRESS: at 6.55% examples, 188575 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:34:32,938 : INFO : EPOCH 2 - PROGRESS: at 6.88% examples, 189709 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:34:33,987 : INFO : EPOCH 2 - PROGRESS: at 7.14% examples, 190035 words/s, in_qsize

2021-09-02 12:35:41,791 : INFO : EPOCH 2 - PROGRESS: at 26.01% examples, 191189 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:35:42,811 : INFO : EPOCH 2 - PROGRESS: at 26.28% examples, 191547 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:35:43,818 : INFO : EPOCH 2 - PROGRESS: at 26.62% examples, 191512 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:35:44,822 : INFO : EPOCH 2 - PROGRESS: at 27.00% examples, 191859 words/s, in_qsize 23, out_qsize 1
2021-09-02 12:35:45,836 : INFO : EPOCH 2 - PROGRESS: at 27.22% examples, 191809 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:35:46,955 : INFO : EPOCH 2 - PROGRESS: at 27.53% examples, 191951 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:35:48,067 : INFO : EPOCH 2 - PROGRESS: at 27.89% examples, 192267 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:35:49,096 : INFO : EPOCH 2 - PROGRESS: at 28.11% examples, 192479 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:35:50,114 : INFO : EPOCH 2 - PROGRESS: at 28.41% examples, 192563 words/s,

2021-09-02 12:36:56,355 : INFO : EPOCH 2 - PROGRESS: at 48.60% examples, 196685 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:36:57,363 : INFO : EPOCH 2 - PROGRESS: at 48.93% examples, 196787 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:36:58,369 : INFO : EPOCH 2 - PROGRESS: at 49.21% examples, 196773 words/s, in_qsize 21, out_qsize 2
2021-09-02 12:36:59,375 : INFO : EPOCH 2 - PROGRESS: at 49.49% examples, 196849 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:37:00,430 : INFO : EPOCH 2 - PROGRESS: at 49.83% examples, 196956 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:37:01,435 : INFO : EPOCH 2 - PROGRESS: at 50.18% examples, 197011 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:37:02,467 : INFO : EPOCH 2 - PROGRESS: at 50.50% examples, 197049 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:37:03,495 : INFO : EPOCH 2 - PROGRESS: at 50.81% examples, 197115 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:37:04,518 : INFO : EPOCH 2 - PROGRESS: at 51.08% examples, 197013 words/s,

2021-09-02 12:38:11,153 : INFO : EPOCH 2 - PROGRESS: at 73.15% examples, 199192 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:38:12,175 : INFO : EPOCH 2 - PROGRESS: at 73.37% examples, 199165 words/s, in_qsize 23, out_qsize 1
2021-09-02 12:38:13,180 : INFO : EPOCH 2 - PROGRESS: at 73.66% examples, 199256 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:38:14,185 : INFO : EPOCH 2 - PROGRESS: at 74.01% examples, 199349 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:38:15,221 : INFO : EPOCH 2 - PROGRESS: at 74.35% examples, 199338 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:38:16,256 : INFO : EPOCH 2 - PROGRESS: at 74.74% examples, 199381 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:38:17,271 : INFO : EPOCH 2 - PROGRESS: at 75.02% examples, 199326 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:38:18,335 : INFO : EPOCH 2 - PROGRESS: at 75.38% examples, 199414 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:38:19,341 : INFO : EPOCH 2 - PROGRESS: at 75.76% examples, 199491 words/s,

2021-09-02 12:39:26,028 : INFO : EPOCH 2 - PROGRESS: at 99.28% examples, 200659 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:39:27,063 : INFO : EPOCH 2 - PROGRESS: at 99.66% examples, 200713 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:39:27,868 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-09-02 12:39:27,874 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-09-02 12:39:27,883 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-09-02 12:39:27,956 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-09-02 12:39:28,001 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-09-02 12:39:28,025 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-09-02 12:39:28,054 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-09-02 12:39:28,074 : INFO : EPOCH 2 - PROGRESS: at 99.95% examples, 200750 words/s, in_qsize 4, out_qsize 1
2021-09-02 12:39:28,

2021-09-02 12:40:31,074 : INFO : EPOCH 3 - PROGRESS: at 17.97% examples, 199898 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:40:32,092 : INFO : EPOCH 3 - PROGRESS: at 18.33% examples, 200423 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:40:33,110 : INFO : EPOCH 3 - PROGRESS: at 18.62% examples, 200191 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:40:34,124 : INFO : EPOCH 3 - PROGRESS: at 19.15% examples, 200523 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:40:35,129 : INFO : EPOCH 3 - PROGRESS: at 19.36% examples, 200649 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:40:36,232 : INFO : EPOCH 3 - PROGRESS: at 19.67% examples, 200489 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:40:37,262 : INFO : EPOCH 3 - PROGRESS: at 20.01% examples, 200973 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:40:38,266 : INFO : EPOCH 3 - PROGRESS: at 20.30% examples, 200582 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:40:39,456 : INFO : EPOCH 3 - PROGRESS: at 20.62% examples, 200552 words/s,

2021-09-02 12:41:45,817 : INFO : EPOCH 3 - PROGRESS: at 40.30% examples, 200132 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:41:46,842 : INFO : EPOCH 3 - PROGRESS: at 40.70% examples, 200333 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:41:47,886 : INFO : EPOCH 3 - PROGRESS: at 41.02% examples, 200273 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:41:48,959 : INFO : EPOCH 3 - PROGRESS: at 41.35% examples, 200418 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:41:49,972 : INFO : EPOCH 3 - PROGRESS: at 41.64% examples, 200511 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:41:51,226 : INFO : EPOCH 3 - PROGRESS: at 41.94% examples, 200099 words/s, in_qsize 13, out_qsize 10
2021-09-02 12:41:52,231 : INFO : EPOCH 3 - PROGRESS: at 42.36% examples, 200442 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:41:53,257 : INFO : EPOCH 3 - PROGRESS: at 42.61% examples, 200252 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:41:54,466 : INFO : EPOCH 3 - PROGRESS: at 43.05% examples, 200328 words/s

2021-09-02 12:43:01,119 : INFO : EPOCH 3 - PROGRESS: at 64.62% examples, 201370 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:43:02,136 : INFO : EPOCH 3 - PROGRESS: at 65.01% examples, 201414 words/s, in_qsize 21, out_qsize 1
2021-09-02 12:43:03,189 : INFO : EPOCH 3 - PROGRESS: at 65.29% examples, 201372 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:43:04,234 : INFO : EPOCH 3 - PROGRESS: at 65.73% examples, 201490 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:43:05,236 : INFO : EPOCH 3 - PROGRESS: at 66.28% examples, 201579 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:43:06,302 : INFO : EPOCH 3 - PROGRESS: at 66.58% examples, 201589 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:43:07,308 : INFO : EPOCH 3 - PROGRESS: at 66.86% examples, 201488 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:43:08,308 : INFO : EPOCH 3 - PROGRESS: at 67.13% examples, 201377 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:43:09,364 : INFO : EPOCH 3 - PROGRESS: at 67.43% examples, 201322 words/s,

2021-09-02 12:44:16,125 : INFO : EPOCH 3 - PROGRESS: at 89.31% examples, 201552 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:44:17,152 : INFO : EPOCH 3 - PROGRESS: at 89.60% examples, 201488 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:44:18,185 : INFO : EPOCH 3 - PROGRESS: at 90.03% examples, 201553 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:44:19,212 : INFO : EPOCH 3 - PROGRESS: at 90.37% examples, 201570 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:44:20,226 : INFO : EPOCH 3 - PROGRESS: at 90.70% examples, 201605 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:44:21,230 : INFO : EPOCH 3 - PROGRESS: at 90.97% examples, 201524 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:44:22,231 : INFO : EPOCH 3 - PROGRESS: at 91.28% examples, 201616 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:44:23,315 : INFO : EPOCH 3 - PROGRESS: at 91.60% examples, 201582 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:44:24,345 : INFO : EPOCH 3 - PROGRESS: at 92.12% examples, 201647 words/s,

2021-09-02 12:45:21,226 : INFO : EPOCH 4 - PROGRESS: at 11.09% examples, 205884 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:45:22,256 : INFO : EPOCH 4 - PROGRESS: at 11.38% examples, 205307 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:45:23,258 : INFO : EPOCH 4 - PROGRESS: at 11.73% examples, 205731 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:45:24,266 : INFO : EPOCH 4 - PROGRESS: at 12.01% examples, 205122 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:45:25,388 : INFO : EPOCH 4 - PROGRESS: at 12.37% examples, 205285 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:45:26,515 : INFO : EPOCH 4 - PROGRESS: at 12.60% examples, 204394 words/s, in_qsize 17, out_qsize 4
2021-09-02 12:45:27,609 : INFO : EPOCH 4 - PROGRESS: at 12.97% examples, 204338 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:45:28,733 : INFO : EPOCH 4 - PROGRESS: at 13.23% examples, 204609 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:45:29,735 : INFO : EPOCH 4 - PROGRESS: at 13.54% examples, 205147 words/s,

2021-09-02 12:46:35,746 : INFO : EPOCH 4 - PROGRESS: at 32.88% examples, 203184 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:46:36,899 : INFO : EPOCH 4 - PROGRESS: at 33.18% examples, 203294 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:46:37,901 : INFO : EPOCH 4 - PROGRESS: at 33.49% examples, 203361 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:46:38,903 : INFO : EPOCH 4 - PROGRESS: at 33.74% examples, 203475 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:46:39,976 : INFO : EPOCH 4 - PROGRESS: at 34.06% examples, 203609 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:46:40,982 : INFO : EPOCH 4 - PROGRESS: at 34.36% examples, 203571 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:46:42,018 : INFO : EPOCH 4 - PROGRESS: at 34.74% examples, 203688 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:46:43,108 : INFO : EPOCH 4 - PROGRESS: at 35.18% examples, 203845 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:46:44,133 : INFO : EPOCH 4 - PROGRESS: at 35.49% examples, 203800 words/s,

2021-09-02 12:47:50,509 : INFO : EPOCH 4 - PROGRESS: at 56.81% examples, 204970 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:47:51,576 : INFO : EPOCH 4 - PROGRESS: at 57.11% examples, 204894 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:47:52,580 : INFO : EPOCH 4 - PROGRESS: at 57.45% examples, 205042 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:47:53,597 : INFO : EPOCH 4 - PROGRESS: at 57.78% examples, 205132 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:47:54,632 : INFO : EPOCH 4 - PROGRESS: at 58.18% examples, 205066 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:47:55,637 : INFO : EPOCH 4 - PROGRESS: at 58.63% examples, 205233 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:47:56,656 : INFO : EPOCH 4 - PROGRESS: at 58.97% examples, 205293 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:47:57,666 : INFO : EPOCH 4 - PROGRESS: at 59.32% examples, 205306 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:47:58,759 : INFO : EPOCH 4 - PROGRESS: at 59.83% examples, 205362 words/s,

2021-09-02 12:49:04,783 : INFO : EPOCH 4 - PROGRESS: at 81.38% examples, 205473 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:49:05,947 : INFO : EPOCH 4 - PROGRESS: at 81.80% examples, 205500 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:49:06,960 : INFO : EPOCH 4 - PROGRESS: at 82.15% examples, 205606 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:49:07,973 : INFO : EPOCH 4 - PROGRESS: at 82.47% examples, 205526 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:49:09,009 : INFO : EPOCH 4 - PROGRESS: at 82.84% examples, 205557 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:49:10,023 : INFO : EPOCH 4 - PROGRESS: at 83.19% examples, 205625 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:49:11,045 : INFO : EPOCH 4 - PROGRESS: at 83.78% examples, 205653 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:49:12,051 : INFO : EPOCH 4 - PROGRESS: at 84.05% examples, 205711 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:49:13,081 : INFO : EPOCH 4 - PROGRESS: at 84.32% examples, 205612 words/s,

2021-09-02 12:50:09,921 : INFO : EPOCH 5 - PROGRESS: at 4.30% examples, 198617 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:50:10,952 : INFO : EPOCH 5 - PROGRESS: at 4.60% examples, 197119 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:50:11,955 : INFO : EPOCH 5 - PROGRESS: at 4.86% examples, 195459 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:50:12,982 : INFO : EPOCH 5 - PROGRESS: at 5.18% examples, 194536 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:50:14,013 : INFO : EPOCH 5 - PROGRESS: at 5.45% examples, 193567 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:50:15,021 : INFO : EPOCH 5 - PROGRESS: at 5.60% examples, 193095 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:50:16,076 : INFO : EPOCH 5 - PROGRESS: at 5.95% examples, 194556 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:50:17,094 : INFO : EPOCH 5 - PROGRESS: at 6.23% examples, 194887 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:50:18,104 : INFO : EPOCH 5 - PROGRESS: at 6.55% examples, 195863 words/s, in_qsize

2021-09-02 12:51:26,236 : INFO : EPOCH 5 - PROGRESS: at 26.39% examples, 199950 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:51:27,247 : INFO : EPOCH 5 - PROGRESS: at 26.77% examples, 199990 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:51:28,291 : INFO : EPOCH 5 - PROGRESS: at 27.07% examples, 200001 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:51:29,300 : INFO : EPOCH 5 - PROGRESS: at 27.33% examples, 200002 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:51:30,314 : INFO : EPOCH 5 - PROGRESS: at 27.55% examples, 199722 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:51:31,329 : INFO : EPOCH 5 - PROGRESS: at 27.88% examples, 199877 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:51:32,387 : INFO : EPOCH 5 - PROGRESS: at 28.07% examples, 199668 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:51:33,388 : INFO : EPOCH 5 - PROGRESS: at 28.36% examples, 199644 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:51:34,422 : INFO : EPOCH 5 - PROGRESS: at 28.66% examples, 199756 words/s,

2021-09-02 12:52:40,779 : INFO : EPOCH 5 - PROGRESS: at 48.92% examples, 201117 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:52:41,794 : INFO : EPOCH 5 - PROGRESS: at 49.24% examples, 201220 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:52:42,891 : INFO : EPOCH 5 - PROGRESS: at 49.50% examples, 201046 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:52:43,915 : INFO : EPOCH 5 - PROGRESS: at 49.84% examples, 201203 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:52:44,963 : INFO : EPOCH 5 - PROGRESS: at 50.18% examples, 201146 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:52:46,004 : INFO : EPOCH 5 - PROGRESS: at 50.50% examples, 201146 words/s, in_qsize 22, out_qsize 1
2021-09-02 12:52:47,010 : INFO : EPOCH 5 - PROGRESS: at 50.82% examples, 201254 words/s, in_qsize 23, out_qsize 1
2021-09-02 12:52:48,034 : INFO : EPOCH 5 - PROGRESS: at 51.13% examples, 201286 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:52:49,105 : INFO : EPOCH 5 - PROGRESS: at 51.44% examples, 201284 words/s,

2021-09-02 12:53:55,560 : INFO : EPOCH 5 - PROGRESS: at 73.35% examples, 202135 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:53:56,571 : INFO : EPOCH 5 - PROGRESS: at 73.63% examples, 202211 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:53:57,616 : INFO : EPOCH 5 - PROGRESS: at 73.94% examples, 202145 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:53:58,830 : INFO : EPOCH 5 - PROGRESS: at 74.32% examples, 202143 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:53:59,836 : INFO : EPOCH 5 - PROGRESS: at 74.75% examples, 202254 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:54:00,850 : INFO : EPOCH 5 - PROGRESS: at 75.02% examples, 202190 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:54:01,852 : INFO : EPOCH 5 - PROGRESS: at 75.40% examples, 202318 words/s, in_qsize 24, out_qsize 0
2021-09-02 12:54:02,870 : INFO : EPOCH 5 - PROGRESS: at 75.73% examples, 202264 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:54:03,954 : INFO : EPOCH 5 - PROGRESS: at 75.98% examples, 202097 words/s,

2021-09-02 12:55:10,718 : INFO : EPOCH 5 - PROGRESS: at 99.62% examples, 202799 words/s, in_qsize 23, out_qsize 0
2021-09-02 12:55:11,632 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-09-02 12:55:11,643 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-09-02 12:55:11,650 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-09-02 12:55:11,658 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-09-02 12:55:11,668 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-09-02 12:55:11,694 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-09-02 12:55:11,711 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-09-02 12:55:11,737 : INFO : EPOCH 5 - PROGRESS: at 99.95% examples, 202911 words/s, in_qsize 4, out_qsize 1
2021-09-02 12:55:11,738 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-09-02 12:55:11,801 : INFO : worker thr

saved data/cofea_full/range_data/evans_even_sample_phrase_1730_1810.wv.npy
saved data/cofea_full/range_data/evans_even_sample_phrase_1730_1810.vocab


In [43]:
# Process one data file per decade (grouping by source is the alternative under consideration).
group = [1640, 1650]
file_loc = join(cofea_dir, 'decades')
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)
# TODO: update to use a different split based on authors
for x in group:
    print(x)
    # Path prefix handed to run_and_save, e.g. <file_loc>/data_1640
    fname = join(file_loc, 'data_' + str(x))
    run_and_save(fname)

2021-07-20 17:19:37,419 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.025)', 'datetime': '2021-07-20T17:19:37.419540', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}
2021-07-20 17:19:37,428 : INFO : collecting all words and their counts
2021-07-20 17:19:37,430 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-07-20 17:19:37,547 : INFO : collected 10014 word types from a corpus of 142510 raw words and 4377 sentences
2021-07-20 17:19:37,549 : INFO : Creating a fresh vocabulary
2021-07-20 17:19:37,589 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 2189 unique words (21.859396844417816%% of original 10014, drops 7825)', 'datetime': '2021-07-20T17:19:37.588944', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64

1640


2021-07-20 17:19:37,654 : INFO : sample=0.001 downsamples 66 most-common words
2021-07-20 17:19:37,656 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 84049.85030880866 word corpus (64.4%% of prior 130454)', 'datetime': '2021-07-20T17:19:37.656692', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2021-07-20 17:19:37,724 : INFO : estimated required memory for 2189 words and 300 dimensions: 6348100 bytes
2021-07-20 17:19:37,725 : INFO : resetting layer weights
2021-07-20 17:19:37,739 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2021-07-20T17:19:37.739874', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'build_vocab'}
2021-07-20 17:19:37,741 : INFO : Word2Vec lifecycle event {'msg': 'training model with 12 workers on 21

2021-07-20 17:19:40,888 : INFO : collecting all words and their counts
2021-07-20 17:19:40,889 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-07-20 17:19:41,010 : INFO : PROGRESS: at sentence #10000, processed 193795 words, keeping 12692 word types
2021-07-20 17:19:41,020 : INFO : collected 13069 word types from a corpus of 207984 raw words and 11096 sentences
2021-07-20 17:19:41,021 : INFO : Creating a fresh vocabulary
2021-07-20 17:19:41,055 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 2747 unique words (21.019205754074527%% of original 13069, drops 10322)', 'datetime': '2021-07-20T17:19:41.055655', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2021-07-20 17:19:41,056 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 192252 word corpus (92.43595661204708%% of original 207984, drops 15732)',

saved data/cofea_full/decades/data_1640.wv.npy
saved data/cofea_full/decades/data_1640.vocab
1650


2021-07-20 17:19:41,106 : INFO : sample=0.001 downsamples 60 most-common words
2021-07-20 17:19:41,114 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 126729.27259809534 word corpus (65.9%% of prior 192252)', 'datetime': '2021-07-20T17:19:41.114694', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2021-07-20 17:19:41,174 : INFO : estimated required memory for 2747 words and 300 dimensions: 7966300 bytes
2021-07-20 17:19:41,175 : INFO : resetting layer weights
2021-07-20 17:19:41,182 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2021-07-20T17:19:41.182869', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  4 2020, 02:22:02) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'build_vocab'}
2021-07-20 17:19:41,184 : INFO : Word2Vec lifecycle event {'msg': 'training model with 12 workers on 2

saved data/cofea_full/decades/data_1650.wv.npy
saved data/cofea_full/decades/data_1650.vocab
