In [2]:
from gensim.models import FastText
import gensim
import pandas as pd

# Base directory; the model and caption directories below are derived from it.
DATA_PATH = '/data/'
MODEL_PATH = f'{DATA_PATH}models/ft4_ru/'
SOURCE_PATH = f'{DATA_PATH}captions/lem/'

# Cities and years that make up the corpus ('nyc' and 'london' are currently disabled).
cities = ['moscow', 'spb']
years = ['2016', '2017', '2018', '2019', '2020']

# Every [city, year] pair, grouped by city and ordered by year within a city.
files = []
for city in cities:
    for year in years:
        files += [[city, year]]

def csv_path(path, city, year):
    """Return the CSV file name for one city/year: ``<path><city>_posts_<year>.csv``.

    ``path`` is used as a plain string prefix (no separator is inserted), so it
    must already end with the directory separator. All three arguments must be
    strings.
    """
    pieces = (path, city, '_posts_', year, '.csv')
    return ''.join(pieces)

# Rows are kept only when their `lang` label is a member of this set.
valid_langs = {'__label__ru'}

# False: build a new FastText model; True: start from the 'wiki.ru.bin' binary.
USE_PRETRAINED_MODEL = False

import logging

# Send INFO-level messages (gensim's progress reports) to the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
!mkdir {MODEL_PATH}
class MyCorpus(object):
    """Restartable stream of tokenised captions.

    Every pass reads one CSV per (city, year) pair, located with
    ``csv_path(SOURCE_PATH, city, year)``, keeps the rows whose ``lang`` value is
    in ``valid_langs`` and yields each caption split on whitespace. Files are
    loaded one at a time. ``cities``, ``years``, ``SOURCE_PATH``, ``valid_langs``
    and ``csv_path`` are module-level names looked up when the corpus is iterated.
    """

    # Number of passes started so far; incremented at the start of every
    # __iter__ call and used only in the progress messages. The name shadows the
    # builtin `iter` inside the class namespace only and is kept for
    # backward compatibility.
    iter = 0

    def __iter__(self):
        """Yield one list of tokens per non-empty caption in a valid language."""
        self.iter += 1
        for city in cities:
            for year in years:
                df = pd.read_csv(
                    csv_path(SOURCE_PATH, city, year),
                    # Only these two columns are used; skip parsing the rest.
                    usecols=['lang', 'caption'],
                    # Keep captions as text even when they look numeric.
                    dtype={'lang': str, 'caption': str},
                )
                captions = df.loc[df['lang'].isin(valid_langs), 'caption']
                # Empty cells are read as NaN (a float with no .split());
                # drop them instead of crashing in the middle of a pass.
                for s in captions.dropna():
                    yield s.split()

            print(f'iter {self.iter} for {city} completed')

        print(f'\niter {self.iter} completed\n')

sentences = MyCorpus()

# Reference recipe quoted from https://fasttext.cc/docs/en/crawl-vectors.html:
# dimension 300, with character n-grams of length 5, a window of size 5 and
# 10 negatives. The from-scratch model below keeps the dimension but uses a
# wider window (15) and more negative samples (20).
# NOTE(review): gensim documents word_ngrams as 0 or 1 - confirm that 2 is intended.
if not USE_PRETRAINED_MODEL:
    model = FastText(
        size=300,
        window=15,
        min_count=10,
        word_ngrams=2,
        negative=20,
        workers=20,
    )
else:
    # Start from the Facebook-format binary stored in the model directory.
    model = gensim.models.fasttext.load_facebook_model(MODEL_PATH + 'wiki.ru.bin')

2020-09-01 10:51:01,580 : INFO : resetting layer weights


success, vocab: 0


In [5]:
print('building vocab\n')

# When starting from the pretrained model the call passes update=True;
# otherwise the vocabulary is built with the default arguments. A fresh
# MyCorpus() instance is scanned in both cases.
build_vocab_kwargs = {'update': True} if USE_PRETRAINED_MODEL else {}
model.build_vocab(sentences=MyCorpus(), **build_vocab_kwargs)

# Number of sentences seen during the vocabulary scan, as reported by the model.
total_examples = model.corpus_count
print(f'\ncorpus count: {total_examples}; vocab: {len(model.wv.vocab)}\nend of building')

building vocab



2020-09-01 11:07:18,739 : INFO : collecting all words and their counts
2020-09-01 11:07:56,356 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-09-01 11:07:56,516 : INFO : PROGRESS: at sentence #10000, processed 492015 words, keeping 78321 word types
2020-09-01 11:07:56,657 : INFO : PROGRESS: at sentence #20000, processed 931761 words, keeping 122939 word types
2020-09-01 11:07:56,788 : INFO : PROGRESS: at sentence #30000, processed 1348618 words, keeping 157574 word types
2020-09-01 11:07:56,955 : INFO : PROGRESS: at sentence #40000, processed 1854711 words, keeping 197087 word types
2020-09-01 11:07:57,116 : INFO : PROGRESS: at sentence #50000, processed 2359849 words, keeping 232293 word types
2020-09-01 11:07:57,267 : INFO : PROGRESS: at sentence #60000, processed 2833623 words, keeping 263109 word types
2020-09-01 11:07:57,393 : INFO : PROGRESS: at sentence #70000, processed 3223635 words, keeping 287124 word types
2020-09-01 11:07:57,528 : INFO : PR

2020-09-01 11:08:07,558 : INFO : PROGRESS: at sentence #710000, processed 34293799 words, keeping 1358192 word types
2020-09-01 11:08:07,703 : INFO : PROGRESS: at sentence #720000, processed 34731072 words, keeping 1370473 word types
2020-09-01 11:08:07,841 : INFO : PROGRESS: at sentence #730000, processed 35142413 words, keeping 1381997 word types
2020-09-01 11:08:07,978 : INFO : PROGRESS: at sentence #740000, processed 35553613 words, keeping 1392708 word types
2020-09-01 11:08:08,165 : INFO : PROGRESS: at sentence #750000, processed 35957603 words, keeping 1403117 word types
2020-09-01 11:08:08,314 : INFO : PROGRESS: at sentence #760000, processed 36404756 words, keeping 1413835 word types
2020-09-01 11:08:08,449 : INFO : PROGRESS: at sentence #770000, processed 36817435 words, keeping 1424543 word types
2020-09-01 11:08:08,599 : INFO : PROGRESS: at sentence #780000, processed 37270893 words, keeping 1435990 word types
2020-09-01 11:08:08,762 : INFO : PROGRESS: at sentence #790000, 

2020-09-01 11:08:18,479 : INFO : PROGRESS: at sentence #1410000, processed 67365192 words, keeping 2099162 word types
2020-09-01 11:08:18,672 : INFO : PROGRESS: at sentence #1420000, processed 67945905 words, keeping 2108901 word types
2020-09-01 11:08:18,855 : INFO : PROGRESS: at sentence #1430000, processed 68497467 words, keeping 2118973 word types
2020-09-01 11:08:19,007 : INFO : PROGRESS: at sentence #1440000, processed 68947497 words, keeping 2128346 word types
2020-09-01 11:08:19,163 : INFO : PROGRESS: at sentence #1450000, processed 69415690 words, keeping 2137161 word types
2020-09-01 11:08:19,319 : INFO : PROGRESS: at sentence #1460000, processed 69878937 words, keeping 2146997 word types
2020-09-01 11:08:19,479 : INFO : PROGRESS: at sentence #1470000, processed 70352513 words, keeping 2157020 word types
2020-09-01 11:08:19,638 : INFO : PROGRESS: at sentence #1480000, processed 70830601 words, keeping 2166270 word types
2020-09-01 11:08:19,788 : INFO : PROGRESS: at sentence #

2020-09-01 11:08:29,814 : INFO : PROGRESS: at sentence #2110000, processed 101164204 words, keeping 2724486 word types
2020-09-01 11:08:29,969 : INFO : PROGRESS: at sentence #2120000, processed 101628950 words, keeping 2732237 word types
2020-09-01 11:08:30,107 : INFO : PROGRESS: at sentence #2130000, processed 102033342 words, keeping 2740486 word types
2020-09-01 11:08:30,275 : INFO : PROGRESS: at sentence #2140000, processed 102538418 words, keeping 2749294 word types
2020-09-01 11:08:30,433 : INFO : PROGRESS: at sentence #2150000, processed 103008763 words, keeping 2757414 word types
2020-09-01 11:08:30,592 : INFO : PROGRESS: at sentence #2160000, processed 103488707 words, keeping 2765186 word types
2020-09-01 11:08:30,778 : INFO : PROGRESS: at sentence #2170000, processed 104053197 words, keeping 2774369 word types
2020-09-01 11:08:30,957 : INFO : PROGRESS: at sentence #2180000, processed 104600444 words, keeping 2783747 word types
2020-09-01 11:08:31,138 : INFO : PROGRESS: at se

2020-09-01 11:08:41,016 : INFO : PROGRESS: at sentence #2800000, processed 134527107 words, keeping 3267922 word types
2020-09-01 11:08:41,193 : INFO : PROGRESS: at sentence #2810000, processed 135055950 words, keeping 3275558 word types
2020-09-01 11:08:41,353 : INFO : PROGRESS: at sentence #2820000, processed 135537019 words, keeping 3282697 word types
2020-09-01 11:08:41,490 : INFO : PROGRESS: at sentence #2830000, processed 135937539 words, keeping 3289406 word types
2020-09-01 11:08:41,656 : INFO : PROGRESS: at sentence #2840000, processed 136429274 words, keeping 3296493 word types
2020-09-01 11:08:41,840 : INFO : PROGRESS: at sentence #2850000, processed 136975941 words, keeping 3304139 word types
2020-09-01 11:08:42,013 : INFO : PROGRESS: at sentence #2860000, processed 137485793 words, keeping 3311678 word types
2020-09-01 11:08:42,175 : INFO : PROGRESS: at sentence #2870000, processed 137962018 words, keeping 3319573 word types
2020-09-01 11:08:42,346 : INFO : PROGRESS: at se

2020-09-01 11:08:52,336 : INFO : PROGRESS: at sentence #3490000, processed 168175169 words, keeping 3764790 word types
2020-09-01 11:08:52,507 : INFO : PROGRESS: at sentence #3500000, processed 168682012 words, keeping 3771792 word types
2020-09-01 11:08:52,679 : INFO : PROGRESS: at sentence #3510000, processed 169192252 words, keeping 3778883 word types
2020-09-01 11:08:52,863 : INFO : PROGRESS: at sentence #3520000, processed 169741092 words, keeping 3786480 word types
2020-09-01 11:08:53,049 : INFO : PROGRESS: at sentence #3530000, processed 170300344 words, keeping 3793580 word types
2020-09-01 11:08:53,231 : INFO : PROGRESS: at sentence #3540000, processed 170843968 words, keeping 3801083 word types
2020-09-01 11:08:53,399 : INFO : PROGRESS: at sentence #3550000, processed 171333046 words, keeping 3808344 word types
2020-09-01 11:08:53,560 : INFO : PROGRESS: at sentence #3560000, processed 171807953 words, keeping 3815345 word types
2020-09-01 11:08:53,707 : INFO : PROGRESS: at se

2020-09-01 11:09:04,109 : INFO : PROGRESS: at sentence #4180000, processed 202781873 words, keeping 4227400 word types
2020-09-01 11:09:04,276 : INFO : PROGRESS: at sentence #4190000, processed 203264384 words, keeping 4233811 word types
2020-09-01 11:09:04,434 : INFO : PROGRESS: at sentence #4200000, processed 203722691 words, keeping 4240409 word types
2020-09-01 11:09:04,563 : INFO : PROGRESS: at sentence #4210000, processed 204088841 words, keeping 4246294 word types
2020-09-01 11:09:04,721 : INFO : PROGRESS: at sentence #4220000, processed 204546257 words, keeping 4252924 word types
2020-09-01 11:09:04,886 : INFO : PROGRESS: at sentence #4230000, processed 205026366 words, keeping 4259321 word types
2020-09-01 11:09:05,059 : INFO : PROGRESS: at sentence #4240000, processed 205528574 words, keeping 4265804 word types
2020-09-01 11:09:05,230 : INFO : PROGRESS: at sentence #4250000, processed 206034341 words, keeping 4272058 word types
2020-09-01 11:09:05,404 : INFO : PROGRESS: at se

2020-09-01 11:09:15,623 : INFO : PROGRESS: at sentence #4870000, processed 236080823 words, keeping 4662331 word types
2020-09-01 11:09:15,781 : INFO : PROGRESS: at sentence #4880000, processed 236519226 words, keeping 4667938 word types
2020-09-01 11:09:15,956 : INFO : PROGRESS: at sentence #4890000, processed 237016807 words, keeping 4673879 word types
2020-09-01 11:09:16,160 : INFO : PROGRESS: at sentence #4900000, processed 237606807 words, keeping 4680492 word types
2020-09-01 11:10:23,830 : INFO : PROGRESS: at sentence #4910000, processed 238189240 words, keeping 4687838 word types
2020-09-01 11:10:24,046 : INFO : PROGRESS: at sentence #4920000, processed 238800910 words, keeping 4700297 word types
2020-09-01 11:10:24,254 : INFO : PROGRESS: at sentence #4930000, processed 239408604 words, keeping 4712345 word types
2020-09-01 11:10:24,426 : INFO : PROGRESS: at sentence #4940000, processed 239909734 words, keeping 4723388 word types
2020-09-01 11:10:24,593 : INFO : PROGRESS: at se

2020-09-01 11:10:36,615 : INFO : PROGRESS: at sentence #5560000, processed 275880460 words, keeping 5297381 word types
2020-09-01 11:10:36,808 : INFO : PROGRESS: at sentence #5570000, processed 276451527 words, keeping 5305132 word types
2020-09-01 11:10:37,016 : INFO : PROGRESS: at sentence #5580000, processed 277061488 words, keeping 5313368 word types
2020-09-01 11:10:37,216 : INFO : PROGRESS: at sentence #5590000, processed 277652626 words, keeping 5321098 word types
2020-09-01 11:10:37,424 : INFO : PROGRESS: at sentence #5600000, processed 278274906 words, keeping 5329773 word types
2020-09-01 11:10:37,624 : INFO : PROGRESS: at sentence #5610000, processed 278873385 words, keeping 5338247 word types
2020-09-01 11:10:37,830 : INFO : PROGRESS: at sentence #5620000, processed 279484284 words, keeping 5346702 word types
2020-09-01 11:10:38,025 : INFO : PROGRESS: at sentence #5630000, processed 280060073 words, keeping 5354474 word types
2020-09-01 11:10:38,229 : INFO : PROGRESS: at se

2020-09-01 11:10:50,048 : INFO : PROGRESS: at sentence #6250000, processed 315240821 words, keeping 5828555 word types
2020-09-01 11:10:50,245 : INFO : PROGRESS: at sentence #6260000, processed 315833237 words, keeping 5836037 word types
2020-09-01 11:10:50,437 : INFO : PROGRESS: at sentence #6270000, processed 316408176 words, keeping 5843220 word types
2020-09-01 11:10:50,642 : INFO : PROGRESS: at sentence #6280000, processed 317009482 words, keeping 5850467 word types
2020-09-01 11:10:50,838 : INFO : PROGRESS: at sentence #6290000, processed 317593227 words, keeping 5857085 word types
2020-09-01 11:10:51,019 : INFO : PROGRESS: at sentence #6300000, processed 318131463 words, keeping 5864269 word types
2020-09-01 11:10:51,195 : INFO : PROGRESS: at sentence #6310000, processed 318659556 words, keeping 5871670 word types
2020-09-01 11:10:51,372 : INFO : PROGRESS: at sentence #6320000, processed 319179859 words, keeping 5878138 word types
2020-09-01 11:10:51,541 : INFO : PROGRESS: at se

2020-09-01 11:11:02,990 : INFO : PROGRESS: at sentence #6940000, processed 353904633 words, keeping 6297533 word types
2020-09-01 11:11:03,189 : INFO : PROGRESS: at sentence #6950000, processed 354500193 words, keeping 6304998 word types
2020-09-01 11:11:03,377 : INFO : PROGRESS: at sentence #6960000, processed 355044884 words, keeping 6312181 word types
2020-09-01 11:11:03,550 : INFO : PROGRESS: at sentence #6970000, processed 355547595 words, keeping 6318303 word types
2020-09-01 11:11:03,750 : INFO : PROGRESS: at sentence #6980000, processed 356147265 words, keeping 6325278 word types
2020-09-01 11:11:03,953 : INFO : PROGRESS: at sentence #6990000, processed 356759149 words, keeping 6331778 word types
2020-09-01 11:11:04,159 : INFO : PROGRESS: at sentence #7000000, processed 357378690 words, keeping 6338715 word types
2020-09-01 11:11:04,367 : INFO : PROGRESS: at sentence #7010000, processed 357997552 words, keeping 6345361 word types
2020-09-01 11:11:04,565 : INFO : PROGRESS: at se

2020-09-01 11:11:16,858 : INFO : PROGRESS: at sentence #7630000, processed 395177398 words, keeping 6757827 word types
2020-09-01 11:11:17,062 : INFO : PROGRESS: at sentence #7640000, processed 395778883 words, keeping 6763980 word types
2020-09-01 11:11:17,263 : INFO : PROGRESS: at sentence #7650000, processed 396373522 words, keeping 6770478 word types
2020-09-01 11:11:17,469 : INFO : PROGRESS: at sentence #7660000, processed 396993275 words, keeping 6777141 word types
2020-09-01 11:11:17,665 : INFO : PROGRESS: at sentence #7670000, processed 397580274 words, keeping 6783405 word types
2020-09-01 11:11:17,871 : INFO : PROGRESS: at sentence #7680000, processed 398190164 words, keeping 6790166 word types
2020-09-01 11:11:18,064 : INFO : PROGRESS: at sentence #7690000, processed 398766015 words, keeping 6796540 word types
2020-09-01 11:11:18,247 : INFO : PROGRESS: at sentence #7700000, processed 399308175 words, keeping 6802881 word types
2020-09-01 11:11:18,408 : INFO : PROGRESS: at se

2020-09-01 11:11:29,710 : INFO : PROGRESS: at sentence #8320000, processed 432562163 words, keeping 7169854 word types
2020-09-01 11:11:29,902 : INFO : PROGRESS: at sentence #8330000, processed 433128742 words, keeping 7175605 word types
2020-09-01 11:11:30,115 : INFO : PROGRESS: at sentence #8340000, processed 433758791 words, keeping 7181954 word types
2020-09-01 11:11:30,319 : INFO : PROGRESS: at sentence #8350000, processed 434355425 words, keeping 7188074 word types
2020-09-01 11:11:30,506 : INFO : PROGRESS: at sentence #8360000, processed 434897603 words, keeping 7194021 word types
2020-09-01 11:11:30,677 : INFO : PROGRESS: at sentence #8370000, processed 435398414 words, keeping 7199705 word types
2020-09-01 11:11:30,873 : INFO : PROGRESS: at sentence #8380000, processed 435971034 words, keeping 7205824 word types
2020-09-01 11:11:31,079 : INFO : PROGRESS: at sentence #8390000, processed 436574832 words, keeping 7211605 word types
2020-09-01 11:11:31,293 : INFO : PROGRESS: at se

2020-09-01 11:11:43,334 : INFO : PROGRESS: at sentence #9010000, processed 472612707 words, keeping 7572498 word types
2020-09-01 11:11:43,509 : INFO : PROGRESS: at sentence #9020000, processed 473121861 words, keeping 7577803 word types
2020-09-01 11:11:43,691 : INFO : PROGRESS: at sentence #9030000, processed 473647477 words, keeping 7583433 word types
2020-09-01 11:11:43,887 : INFO : PROGRESS: at sentence #9040000, processed 474227033 words, keeping 7589060 word types
2020-09-01 11:11:44,081 : INFO : PROGRESS: at sentence #9050000, processed 474801782 words, keeping 7594652 word types
2020-09-01 11:11:44,260 : INFO : PROGRESS: at sentence #9060000, processed 475323473 words, keeping 7599907 word types
2020-09-01 11:11:44,432 : INFO : PROGRESS: at sentence #9070000, processed 475823859 words, keeping 7604981 word types
2020-09-01 11:11:44,639 : INFO : PROGRESS: at sentence #9080000, processed 476423710 words, keeping 7610636 word types
2020-09-01 11:11:44,844 : INFO : PROGRESS: at se

2020-09-01 11:11:56,123 : INFO : PROGRESS: at sentence #9700000, processed 510409162 words, keeping 7947937 word types
2020-09-01 11:11:56,328 : INFO : PROGRESS: at sentence #9710000, processed 511017304 words, keeping 7953767 word types
2020-09-01 11:11:56,517 : INFO : PROGRESS: at sentence #9720000, processed 511575819 words, keeping 7959136 word types
2020-09-01 11:11:56,674 : INFO : PROGRESS: at sentence #9730000, processed 512033329 words, keeping 7964134 word types
2020-09-01 11:11:56,851 : INFO : PROGRESS: at sentence #9740000, processed 512551832 words, keeping 7969317 word types
2020-09-01 11:11:57,060 : INFO : PROGRESS: at sentence #9750000, processed 513163030 words, keeping 7974547 word types
2020-09-01 11:11:57,267 : INFO : PROGRESS: at sentence #9760000, processed 513772725 words, keeping 7980166 word types
2020-09-01 11:11:57,460 : INFO : PROGRESS: at sentence #9770000, processed 514351364 words, keeping 7985434 word types
2020-09-01 11:11:57,660 : INFO : PROGRESS: at se

2020-09-01 11:12:09,344 : INFO : PROGRESS: at sentence #10390000, processed 549466679 words, keeping 8311031 word types
2020-09-01 11:12:09,512 : INFO : PROGRESS: at sentence #10400000, processed 549961530 words, keeping 8315836 word types
2020-09-01 11:12:09,713 : INFO : PROGRESS: at sentence #10410000, processed 550556980 words, keeping 8321072 word types
2020-09-01 11:12:09,915 : INFO : PROGRESS: at sentence #10420000, processed 551165186 words, keeping 8326411 word types
2020-09-01 11:12:10,126 : INFO : PROGRESS: at sentence #10430000, processed 551795169 words, keeping 8332040 word types
2020-09-01 11:12:10,328 : INFO : PROGRESS: at sentence #10440000, processed 552391429 words, keeping 8337566 word types
2020-09-01 11:12:10,528 : INFO : PROGRESS: at sentence #10450000, processed 552982121 words, keeping 8342916 word types
2020-09-01 11:12:10,715 : INFO : PROGRESS: at sentence #10460000, processed 553534922 words, keeping 8347648 word types
2020-09-01 11:12:10,917 : INFO : PROGRES

2020-09-01 11:12:22,584 : INFO : PROGRESS: at sentence #11080000, processed 587530017 words, keeping 8659835 word types
2020-09-01 11:12:22,797 : INFO : PROGRESS: at sentence #11090000, processed 588154755 words, keeping 8665326 word types
2020-09-01 11:12:23,004 : INFO : PROGRESS: at sentence #11100000, processed 588762650 words, keeping 8670740 word types
2020-09-01 11:12:23,196 : INFO : PROGRESS: at sentence #11110000, processed 589327786 words, keeping 8675828 word types
2020-09-01 11:12:23,374 : INFO : PROGRESS: at sentence #11120000, processed 589847701 words, keeping 8680833 word types
2020-09-01 11:12:23,568 : INFO : PROGRESS: at sentence #11130000, processed 590417612 words, keeping 8685924 word types
2020-09-01 11:12:23,767 : INFO : PROGRESS: at sentence #11140000, processed 590992329 words, keeping 8690866 word types
2020-09-01 11:12:23,966 : INFO : PROGRESS: at sentence #11150000, processed 591573178 words, keeping 8696286 word types
2020-09-01 11:12:24,172 : INFO : PROGRES

2020-09-01 11:12:35,430 : INFO : PROGRESS: at sentence #11770000, processed 625185794 words, keeping 8998574 word types
2020-09-01 11:12:35,600 : INFO : PROGRESS: at sentence #11780000, processed 625706964 words, keeping 9003263 word types
2020-09-01 11:12:35,799 : INFO : PROGRESS: at sentence #11790000, processed 626297283 words, keeping 9008278 word types
2020-09-01 11:12:36,006 : INFO : PROGRESS: at sentence #11800000, processed 626902917 words, keeping 9013622 word types
2020-09-01 11:12:36,219 : INFO : PROGRESS: at sentence #11810000, processed 627539805 words, keeping 9018912 word types
2020-09-01 11:12:36,418 : INFO : PROGRESS: at sentence #11820000, processed 628133609 words, keeping 9023836 word types
2020-09-01 11:12:36,607 : INFO : PROGRESS: at sentence #11830000, processed 628690778 words, keeping 9028770 word types
2020-09-01 11:12:36,783 : INFO : PROGRESS: at sentence #11840000, processed 629192465 words, keeping 9033594 word types
2020-09-01 11:12:36,992 : INFO : PROGRES

2020-09-01 11:12:48,514 : INFO : PROGRESS: at sentence #12460000, processed 663497259 words, keeping 9333390 word types
2020-09-01 11:12:48,704 : INFO : PROGRESS: at sentence #12470000, processed 664049266 words, keeping 9338375 word types
2020-09-01 11:12:48,881 : INFO : PROGRESS: at sentence #12480000, processed 664565950 words, keeping 9343002 word types
2020-09-01 11:13:22,129 : INFO : PROGRESS: at sentence #12490000, processed 665095005 words, keeping 9349264 word types
2020-09-01 11:13:22,316 : INFO : PROGRESS: at sentence #12500000, processed 665597204 words, keeping 9355830 word types
2020-09-01 11:13:22,500 : INFO : PROGRESS: at sentence #12510000, processed 666109199 words, keeping 9362088 word types
2020-09-01 11:13:22,666 : INFO : PROGRESS: at sentence #12520000, processed 666567643 words, keeping 9367024 word types
2020-09-01 11:13:22,843 : INFO : PROGRESS: at sentence #12530000, processed 667064008 words, keeping 9372009 word types
2020-09-01 11:13:23,075 : INFO : PROGRES

2020-09-01 11:13:35,957 : INFO : PROGRESS: at sentence #13150000, processed 706292077 words, keeping 9781768 word types
2020-09-01 11:13:36,113 : INFO : PROGRESS: at sentence #13160000, processed 706755014 words, keeping 9786580 word types
2020-09-01 11:13:36,288 : INFO : PROGRESS: at sentence #13170000, processed 707279329 words, keeping 9791656 word types
2020-09-01 11:13:36,522 : INFO : PROGRESS: at sentence #13180000, processed 707994640 words, keeping 9798877 word types
2020-09-01 11:13:36,744 : INFO : PROGRESS: at sentence #13190000, processed 708682642 words, keeping 9805670 word types
2020-09-01 11:13:36,959 : INFO : PROGRESS: at sentence #13200000, processed 709341623 words, keeping 9812066 word types
2020-09-01 11:13:37,165 : INFO : PROGRESS: at sentence #13210000, processed 709963816 words, keeping 9819147 word types
2020-09-01 11:13:37,350 : INFO : PROGRESS: at sentence #13220000, processed 710504229 words, keeping 9824453 word types
2020-09-01 11:13:37,540 : INFO : PROGRES

2020-09-01 11:13:50,214 : INFO : PROGRESS: at sentence #13840000, processed 749149846 words, keeping 10177280 word types
2020-09-01 11:13:50,428 : INFO : PROGRESS: at sentence #13850000, processed 749794127 words, keeping 10183329 word types
2020-09-01 11:13:50,655 : INFO : PROGRESS: at sentence #13860000, processed 750478182 words, keeping 10190599 word types
2020-09-01 11:13:50,843 : INFO : PROGRESS: at sentence #13870000, processed 751017995 words, keeping 10196124 word types
2020-09-01 11:13:51,020 : INFO : PROGRESS: at sentence #13880000, processed 751533299 words, keeping 10200891 word types
2020-09-01 11:13:51,237 : INFO : PROGRESS: at sentence #13890000, processed 752157515 words, keeping 10206411 word types
2020-09-01 11:13:51,452 : INFO : PROGRESS: at sentence #13900000, processed 752782131 words, keeping 10211962 word types
2020-09-01 11:13:51,661 : INFO : PROGRESS: at sentence #13910000, processed 753399720 words, keeping 10217128 word types
2020-09-01 11:13:51,847 : INFO :

2020-09-01 11:14:04,095 : INFO : PROGRESS: at sentence #14520000, processed 789963920 words, keeping 10526692 word types
2020-09-01 11:14:04,262 : INFO : PROGRESS: at sentence #14530000, processed 790451484 words, keeping 10530900 word types
2020-09-01 11:14:04,468 : INFO : PROGRESS: at sentence #14540000, processed 791049236 words, keeping 10535748 word types
2020-09-01 11:14:04,687 : INFO : PROGRESS: at sentence #14550000, processed 791702289 words, keeping 10540825 word types
2020-09-01 11:14:04,899 : INFO : PROGRESS: at sentence #14560000, processed 792340914 words, keeping 10545715 word types
2020-09-01 11:14:05,121 : INFO : PROGRESS: at sentence #14570000, processed 793009608 words, keeping 10551166 word types
2020-09-01 11:14:05,359 : INFO : PROGRESS: at sentence #14580000, processed 793727593 words, keeping 10556902 word types
2020-09-01 11:14:05,583 : INFO : PROGRESS: at sentence #14590000, processed 794413162 words, keeping 10562714 word types
2020-09-01 11:14:05,805 : INFO :

2020-09-01 11:14:18,426 : INFO : PROGRESS: at sentence #15200000, processed 832587919 words, keeping 10866894 word types
2020-09-01 11:14:18,659 : INFO : PROGRESS: at sentence #15210000, processed 833284033 words, keeping 10872446 word types
2020-09-01 11:14:18,870 : INFO : PROGRESS: at sentence #15220000, processed 833911633 words, keeping 10878472 word types
2020-09-01 11:14:19,031 : INFO : PROGRESS: at sentence #15230000, processed 834385838 words, keeping 10882546 word types
2020-09-01 11:14:19,216 : INFO : PROGRESS: at sentence #15240000, processed 834929815 words, keeping 10886892 word types
2020-09-01 11:14:19,414 : INFO : PROGRESS: at sentence #15250000, processed 835517736 words, keeping 10891193 word types
2020-09-01 11:14:19,620 : INFO : PROGRESS: at sentence #15260000, processed 836126714 words, keeping 10895777 word types
2020-09-01 11:14:19,843 : INFO : PROGRESS: at sentence #15270000, processed 836789289 words, keeping 10900763 word types
2020-09-01 11:14:20,059 : INFO :

iter 1 for moscow completed


2020-09-01 11:14:30,817 : INFO : PROGRESS: at sentence #15740000, processed 864723126 words, keeping 11118597 word types
2020-09-01 11:14:30,918 : INFO : PROGRESS: at sentence #15750000, processed 865004151 words, keeping 11126498 word types
2020-09-01 11:14:31,021 : INFO : PROGRESS: at sentence #15760000, processed 865276327 words, keeping 11134780 word types
2020-09-01 11:14:31,127 : INFO : PROGRESS: at sentence #15770000, processed 865557812 words, keeping 11142122 word types
2020-09-01 11:14:31,234 : INFO : PROGRESS: at sentence #15780000, processed 865844865 words, keeping 11149092 word types
2020-09-01 11:14:31,341 : INFO : PROGRESS: at sentence #15790000, processed 866133704 words, keeping 11156121 word types
2020-09-01 11:14:31,449 : INFO : PROGRESS: at sentence #15800000, processed 866423165 words, keeping 11162940 word types
2020-09-01 11:14:45,208 : INFO : PROGRESS: at sentence #15810000, processed 866739613 words, keeping 11170739 word types
2020-09-01 11:14:45,344 : INFO :

2020-09-01 11:14:53,334 : INFO : PROGRESS: at sentence #16420000, processed 887497297 words, keeping 11536056 word types
2020-09-01 11:14:53,451 : INFO : PROGRESS: at sentence #16430000, processed 887816980 words, keeping 11541013 word types
2020-09-01 11:14:53,592 : INFO : PROGRESS: at sentence #16440000, processed 888207618 words, keeping 11546439 word types
2020-09-01 11:14:53,706 : INFO : PROGRESS: at sentence #16450000, processed 888522905 words, keeping 11550879 word types
2020-09-01 11:14:53,839 : INFO : PROGRESS: at sentence #16460000, processed 888889882 words, keeping 11556269 word types
2020-09-01 11:14:53,936 : INFO : PROGRESS: at sentence #16470000, processed 889154495 words, keeping 11560928 word types
2020-09-01 11:14:54,048 : INFO : PROGRESS: at sentence #16480000, processed 889449280 words, keeping 11565562 word types
2020-09-01 11:14:54,159 : INFO : PROGRESS: at sentence #16490000, processed 889752716 words, keeping 11570124 word types
2020-09-01 11:14:54,291 : INFO :

2020-09-01 11:15:01,759 : INFO : PROGRESS: at sentence #17100000, processed 910515237 words, keeping 11857544 word types
2020-09-01 11:15:01,874 : INFO : PROGRESS: at sentence #17110000, processed 910825920 words, keeping 11861676 word types
2020-09-01 11:15:01,990 : INFO : PROGRESS: at sentence #17120000, processed 911137921 words, keeping 11865983 word types
2020-09-01 11:15:02,117 : INFO : PROGRESS: at sentence #17130000, processed 911486186 words, keeping 11870490 word types
2020-09-01 11:15:02,243 : INFO : PROGRESS: at sentence #17140000, processed 911835897 words, keeping 11874883 word types
2020-09-01 11:15:02,356 : INFO : PROGRESS: at sentence #17150000, processed 912139912 words, keeping 11879098 word types
2020-09-01 11:15:02,455 : INFO : PROGRESS: at sentence #17160000, processed 912408297 words, keeping 11882777 word types
2020-09-01 11:15:02,585 : INFO : PROGRESS: at sentence #17170000, processed 912762509 words, keeping 11887198 word types
2020-09-01 11:15:02,719 : INFO :

2020-09-01 11:15:10,116 : INFO : PROGRESS: at sentence #17780000, processed 933376612 words, keeping 12144161 word types
2020-09-01 11:15:10,258 : INFO : PROGRESS: at sentence #17790000, processed 933775026 words, keeping 12148366 word types
2020-09-01 11:15:10,379 : INFO : PROGRESS: at sentence #17800000, processed 934104315 words, keeping 12152516 word types
2020-09-01 11:15:10,491 : INFO : PROGRESS: at sentence #17810000, processed 934401176 words, keeping 12156378 word types
2020-09-01 11:15:10,612 : INFO : PROGRESS: at sentence #17820000, processed 934730859 words, keeping 12160343 word types
2020-09-01 11:15:10,744 : INFO : PROGRESS: at sentence #17830000, processed 935087380 words, keeping 12164612 word types
2020-09-01 11:15:10,851 : INFO : PROGRESS: at sentence #17840000, processed 935366643 words, keeping 12168386 word types
2020-09-01 11:15:10,966 : INFO : PROGRESS: at sentence #17850000, processed 935681770 words, keeping 12172318 word types
2020-09-01 11:15:11,090 : INFO :

2020-09-01 11:15:43,826 : INFO : PROGRESS: at sentence #18460000, processed 959511782 words, keeping 12464448 word types
2020-09-01 11:15:43,987 : INFO : PROGRESS: at sentence #18470000, processed 959970192 words, keeping 12469650 word types
2020-09-01 11:15:44,159 : INFO : PROGRESS: at sentence #18480000, processed 960456619 words, keeping 12475429 word types
2020-09-01 11:15:44,320 : INFO : PROGRESS: at sentence #18490000, processed 960912163 words, keeping 12480613 word types
2020-09-01 11:15:44,494 : INFO : PROGRESS: at sentence #18500000, processed 961408627 words, keeping 12486240 word types
2020-09-01 11:15:44,690 : INFO : PROGRESS: at sentence #18510000, processed 961958646 words, keeping 12492449 word types
2020-09-01 11:15:44,886 : INFO : PROGRESS: at sentence #18520000, processed 962523753 words, keeping 12498600 word types
2020-09-01 11:15:45,073 : INFO : PROGRESS: at sentence #18530000, processed 963056906 words, keeping 12504128 word types
2020-09-01 11:15:45,236 : INFO :

2020-09-01 11:15:54,786 : INFO : PROGRESS: at sentence #19140000, processed 990549143 words, keeping 12794677 word types
2020-09-01 11:15:54,954 : INFO : PROGRESS: at sentence #19150000, processed 991020438 words, keeping 12799207 word types
2020-09-01 11:15:55,101 : INFO : PROGRESS: at sentence #19160000, processed 991432116 words, keeping 12803656 word types
2020-09-01 11:15:55,262 : INFO : PROGRESS: at sentence #19170000, processed 991890984 words, keeping 12808149 word types
2020-09-01 11:15:55,457 : INFO : PROGRESS: at sentence #19180000, processed 992458934 words, keeping 12813287 word types
2020-09-01 11:15:55,622 : INFO : PROGRESS: at sentence #19190000, processed 992931485 words, keeping 12817646 word types
2020-09-01 11:15:55,788 : INFO : PROGRESS: at sentence #19200000, processed 993387209 words, keeping 12822402 word types
2020-09-01 11:15:55,931 : INFO : PROGRESS: at sentence #19210000, processed 993780938 words, keeping 12826636 word types
2020-09-01 11:15:56,081 : INFO :

2020-09-01 11:16:05,771 : INFO : PROGRESS: at sentence #19820000, processed 1021768133 words, keeping 13091205 word types
2020-09-01 11:16:05,939 : INFO : PROGRESS: at sentence #19830000, processed 1022248514 words, keeping 13095469 word types
2020-09-01 11:16:06,086 : INFO : PROGRESS: at sentence #19840000, processed 1022666466 words, keeping 13099233 word types
2020-09-01 11:16:06,244 : INFO : PROGRESS: at sentence #19850000, processed 1023118308 words, keeping 13103322 word types
2020-09-01 11:16:06,410 : INFO : PROGRESS: at sentence #19860000, processed 1023589361 words, keeping 13107473 word types
2020-09-01 11:16:06,595 : INFO : PROGRESS: at sentence #19870000, processed 1024129801 words, keeping 13112058 word types
2020-09-01 11:16:06,736 : INFO : PROGRESS: at sentence #19880000, processed 1024537371 words, keeping 13115735 word types
2020-09-01 11:16:06,879 : INFO : PROGRESS: at sentence #19890000, processed 1024940424 words, keeping 13119587 word types
2020-09-01 11:16:07,044 

2020-09-01 11:16:16,609 : INFO : PROGRESS: at sentence #20500000, processed 1052772157 words, keeping 13366502 word types
2020-09-01 11:16:16,793 : INFO : PROGRESS: at sentence #20510000, processed 1053308148 words, keeping 13370784 word types
2020-09-01 11:16:16,935 : INFO : PROGRESS: at sentence #20520000, processed 1053697974 words, keeping 13374527 word types
2020-09-01 11:16:17,094 : INFO : PROGRESS: at sentence #20530000, processed 1054153090 words, keeping 13378411 word types
2020-09-01 11:16:17,245 : INFO : PROGRESS: at sentence #20540000, processed 1054582382 words, keeping 13382179 word types
2020-09-01 11:16:17,416 : INFO : PROGRESS: at sentence #20550000, processed 1055071524 words, keeping 13386407 word types
2020-09-01 11:16:17,605 : INFO : PROGRESS: at sentence #20560000, processed 1055621956 words, keeping 13390722 word types
2020-09-01 11:16:17,765 : INFO : PROGRESS: at sentence #20570000, processed 1056080403 words, keeping 13394587 word types
2020-09-01 11:16:17,903 

2020-09-01 11:16:27,739 : INFO : PROGRESS: at sentence #21180000, processed 1083934377 words, keeping 13628931 word types
2020-09-01 11:16:27,888 : INFO : PROGRESS: at sentence #21190000, processed 1084354485 words, keeping 13632723 word types
2020-09-01 11:16:28,030 : INFO : PROGRESS: at sentence #21200000, processed 1084755506 words, keeping 13636213 word types
2020-09-01 11:16:28,194 : INFO : PROGRESS: at sentence #21210000, processed 1085211497 words, keeping 13640107 word types
2020-09-01 11:16:28,358 : INFO : PROGRESS: at sentence #21220000, processed 1085667627 words, keeping 13643790 word types
2020-09-01 11:16:28,496 : INFO : PROGRESS: at sentence #21230000, processed 1086047976 words, keeping 13647532 word types
2020-09-01 11:16:28,650 : INFO : PROGRESS: at sentence #21240000, processed 1086479929 words, keeping 13651049 word types
2020-09-01 11:16:28,788 : INFO : PROGRESS: at sentence #21250000, processed 1086861056 words, keeping 13654729 word types
2020-09-01 11:16:28,918 

2020-09-01 11:17:20,329 : INFO : PROGRESS: at sentence #21860000, processed 1117834389 words, keeping 13934494 word types
2020-09-01 11:17:20,494 : INFO : PROGRESS: at sentence #21870000, processed 1118297095 words, keeping 13938883 word types
2020-09-01 11:17:20,654 : INFO : PROGRESS: at sentence #21880000, processed 1118755107 words, keeping 13943565 word types
2020-09-01 11:17:20,853 : INFO : PROGRESS: at sentence #21890000, processed 1119324029 words, keeping 13948251 word types
2020-09-01 11:17:21,062 : INFO : PROGRESS: at sentence #21900000, processed 1119928536 words, keeping 13953826 word types
2020-09-01 11:17:21,248 : INFO : PROGRESS: at sentence #21910000, processed 1120466151 words, keeping 13959356 word types
2020-09-01 11:17:21,412 : INFO : PROGRESS: at sentence #21920000, processed 1120921861 words, keeping 13964035 word types
2020-09-01 11:17:21,561 : INFO : PROGRESS: at sentence #21930000, processed 1121341238 words, keeping 13967606 word types
2020-09-01 11:17:21,780 

2020-09-01 11:17:33,358 : INFO : PROGRESS: at sentence #22540000, processed 1155350863 words, keeping 14252752 word types
2020-09-01 11:17:33,559 : INFO : PROGRESS: at sentence #22550000, processed 1155933865 words, keeping 14257184 word types
2020-09-01 11:17:33,761 : INFO : PROGRESS: at sentence #22560000, processed 1156527771 words, keeping 14261807 word types
2020-09-01 11:17:33,938 : INFO : PROGRESS: at sentence #22570000, processed 1157044903 words, keeping 14266141 word types
2020-09-01 11:17:34,141 : INFO : PROGRESS: at sentence #22580000, processed 1157627357 words, keeping 14270901 word types
2020-09-01 11:17:34,332 : INFO : PROGRESS: at sentence #22590000, processed 1158181780 words, keeping 14275309 word types
2020-09-01 11:17:34,528 : INFO : PROGRESS: at sentence #22600000, processed 1158742251 words, keeping 14279694 word types
2020-09-01 11:17:34,725 : INFO : PROGRESS: at sentence #22610000, processed 1159313583 words, keeping 14284113 word types
2020-09-01 11:17:34,895 

2020-09-01 11:17:46,243 : INFO : PROGRESS: at sentence #23220000, processed 1192743163 words, keeping 14542354 word types
2020-09-01 11:17:46,448 : INFO : PROGRESS: at sentence #23230000, processed 1193338237 words, keeping 14546418 word types
2020-09-01 11:17:46,647 : INFO : PROGRESS: at sentence #23240000, processed 1193920919 words, keeping 14550711 word types
2020-09-01 11:17:46,819 : INFO : PROGRESS: at sentence #23250000, processed 1194426762 words, keeping 14554518 word types
2020-09-01 11:17:46,968 : INFO : PROGRESS: at sentence #23260000, processed 1194852037 words, keeping 14558147 word types
2020-09-01 11:17:47,163 : INFO : PROGRESS: at sentence #23270000, processed 1195423126 words, keeping 14562157 word types
2020-09-01 11:17:47,352 : INFO : PROGRESS: at sentence #23280000, processed 1195976110 words, keeping 14566085 word types
2020-09-01 11:17:47,542 : INFO : PROGRESS: at sentence #23290000, processed 1196522765 words, keeping 14569959 word types
2020-09-01 11:17:47,698 

2020-09-01 11:17:58,485 : INFO : PROGRESS: at sentence #23900000, processed 1228282023 words, keeping 14803212 word types
2020-09-01 11:17:58,667 : INFO : PROGRESS: at sentence #23910000, processed 1228806249 words, keeping 14807240 word types
2020-09-01 11:17:58,866 : INFO : PROGRESS: at sentence #23920000, processed 1229378224 words, keeping 14811396 word types
2020-09-01 11:17:59,058 : INFO : PROGRESS: at sentence #23930000, processed 1229927250 words, keeping 14815594 word types
2020-09-01 11:17:59,209 : INFO : PROGRESS: at sentence #23940000, processed 1230353665 words, keeping 14818971 word types
2020-09-01 11:17:59,366 : INFO : PROGRESS: at sentence #23950000, processed 1230783695 words, keeping 14822478 word types
2020-09-01 11:17:59,559 : INFO : PROGRESS: at sentence #23960000, processed 1231330573 words, keeping 14826228 word types
2020-09-01 11:17:59,762 : INFO : PROGRESS: at sentence #23970000, processed 1231909969 words, keeping 14830246 word types
2020-09-01 11:17:59,956 

2020-09-01 11:18:11,622 : INFO : PROGRESS: at sentence #24580000, processed 1265532250 words, keeping 15061140 word types
2020-09-01 11:18:11,816 : INFO : PROGRESS: at sentence #24590000, processed 1266094840 words, keeping 15064909 word types
2020-09-01 11:18:11,998 : INFO : PROGRESS: at sentence #24600000, processed 1266611516 words, keeping 15068368 word types
2020-09-01 11:18:12,202 : INFO : PROGRESS: at sentence #24610000, processed 1267195974 words, keeping 15071999 word types
2020-09-01 11:18:12,364 : INFO : PROGRESS: at sentence #24620000, processed 1267645808 words, keeping 15075162 word types
2020-09-01 11:18:12,565 : INFO : PROGRESS: at sentence #24630000, processed 1268221349 words, keeping 15078964 word types
2020-09-01 11:18:12,758 : INFO : PROGRESS: at sentence #24640000, processed 1268782887 words, keeping 15082776 word types
2020-09-01 11:18:12,943 : INFO : PROGRESS: at sentence #24650000, processed 1269316051 words, keeping 15086331 word types
2020-09-01 11:18:13,127 

2020-09-01 11:18:24,745 : INFO : PROGRESS: at sentence #25260000, processed 1303236053 words, keeping 15314386 word types
2020-09-01 11:18:24,958 : INFO : PROGRESS: at sentence #25270000, processed 1303853268 words, keeping 15318240 word types
2020-09-01 11:18:25,181 : INFO : PROGRESS: at sentence #25280000, processed 1304497359 words, keeping 15322545 word types
2020-09-01 11:18:25,382 : INFO : PROGRESS: at sentence #25290000, processed 1305084363 words, keeping 15326379 word types
2020-09-01 11:18:25,593 : INFO : PROGRESS: at sentence #25300000, processed 1305690512 words, keeping 15330529 word types
2020-09-01 11:18:25,800 : INFO : PROGRESS: at sentence #25310000, processed 1306281437 words, keeping 15334315 word types
2020-09-01 11:18:25,991 : INFO : PROGRESS: at sentence #25320000, processed 1306839014 words, keeping 15338014 word types
2020-09-01 11:18:26,194 : INFO : PROGRESS: at sentence #25330000, processed 1307427321 words, keeping 15341727 word types
2020-09-01 11:18:26,391 

2020-09-01 11:18:37,909 : INFO : PROGRESS: at sentence #25940000, processed 1341180318 words, keeping 15559876 word types
2020-09-01 11:18:38,096 : INFO : PROGRESS: at sentence #25950000, processed 1341711835 words, keeping 15563510 word types
2020-09-01 11:18:38,294 : INFO : PROGRESS: at sentence #25960000, processed 1342277434 words, keeping 15566940 word types
2020-09-01 11:18:38,492 : INFO : PROGRESS: at sentence #25970000, processed 1342843002 words, keeping 15570388 word types
2020-09-01 11:18:38,693 : INFO : PROGRESS: at sentence #25980000, processed 1343420680 words, keeping 15574012 word types
2020-09-01 11:18:38,894 : INFO : PROGRESS: at sentence #25990000, processed 1344012996 words, keeping 15577835 word types
2020-09-01 11:18:39,074 : INFO : PROGRESS: at sentence #26000000, processed 1344537139 words, keeping 15581160 word types
2020-09-01 11:18:39,263 : INFO : PROGRESS: at sentence #26010000, processed 1345079343 words, keeping 15584711 word types
2020-09-01 11:18:39,414 

2020-09-01 11:19:10,434 : INFO : PROGRESS: at sentence #26620000, processed 1379579122 words, keeping 15826200 word types
2020-09-01 11:19:10,626 : INFO : PROGRESS: at sentence #26630000, processed 1380129544 words, keeping 15830633 word types
2020-09-01 11:19:10,851 : INFO : PROGRESS: at sentence #26640000, processed 1380764457 words, keeping 15834561 word types
2020-09-01 11:19:11,085 : INFO : PROGRESS: at sentence #26650000, processed 1381450969 words, keeping 15839130 word types
2020-09-01 11:19:11,292 : INFO : PROGRESS: at sentence #26660000, processed 1382050004 words, keeping 15843266 word types
2020-09-01 11:19:11,516 : INFO : PROGRESS: at sentence #26670000, processed 1382712521 words, keeping 15848246 word types
2020-09-01 11:19:11,780 : INFO : PROGRESS: at sentence #26680000, processed 1383484696 words, keeping 15853225 word types
2020-09-01 11:19:11,975 : INFO : PROGRESS: at sentence #26690000, processed 1384044869 words, keeping 15857100 word types
2020-09-01 11:19:12,191 

2020-09-01 11:19:24,861 : INFO : PROGRESS: at sentence #27300000, processed 1422212776 words, keeping 16121057 word types
2020-09-01 11:19:25,090 : INFO : PROGRESS: at sentence #27310000, processed 1422908399 words, keeping 16125653 word types
2020-09-01 11:19:25,306 : INFO : PROGRESS: at sentence #27320000, processed 1423561842 words, keeping 16130322 word types
2020-09-01 11:19:25,524 : INFO : PROGRESS: at sentence #27330000, processed 1424221896 words, keeping 16135106 word types
2020-09-01 11:19:25,719 : INFO : PROGRESS: at sentence #27340000, processed 1424814259 words, keeping 16139488 word types
2020-09-01 11:19:25,937 : INFO : PROGRESS: at sentence #27350000, processed 1425443417 words, keeping 16143341 word types
2020-09-01 11:19:26,157 : INFO : PROGRESS: at sentence #27360000, processed 1426096946 words, keeping 16147684 word types
2020-09-01 11:19:26,363 : INFO : PROGRESS: at sentence #27370000, processed 1426697450 words, keeping 16151531 word types
2020-09-01 11:19:26,584 

2020-09-01 11:19:39,210 : INFO : PROGRESS: at sentence #27980000, processed 1464136603 words, keeping 16384673 word types
2020-09-01 11:19:39,436 : INFO : PROGRESS: at sentence #27990000, processed 1464793429 words, keeping 16388496 word types
2020-09-01 11:19:39,656 : INFO : PROGRESS: at sentence #28000000, processed 1465430248 words, keeping 16392048 word types
2020-09-01 11:19:39,839 : INFO : PROGRESS: at sentence #28010000, processed 1465952018 words, keeping 16395372 word types
2020-09-01 11:19:40,067 : INFO : PROGRESS: at sentence #28020000, processed 1466611380 words, keeping 16399327 word types
2020-09-01 11:19:40,296 : INFO : PROGRESS: at sentence #28030000, processed 1467283131 words, keeping 16403186 word types
2020-09-01 11:19:40,534 : INFO : PROGRESS: at sentence #28040000, processed 1467997527 words, keeping 16407711 word types
2020-09-01 11:19:40,764 : INFO : PROGRESS: at sentence #28050000, processed 1468664009 words, keeping 16411544 word types
2020-09-01 11:19:40,998 

iter 1 for spb completed

iter 1 completed



2020-09-01 11:19:57,807 : INFO : effective_min_count=10 retains 1705201 unique words (10% of original 16493356, drops 14788155)
2020-09-01 11:19:57,808 : INFO : effective_min_count=10 leaves 1455500579 word corpus (98% of original 1482128485, drops 26627906)
2020-09-01 11:20:02,663 : INFO : deleting the raw counts dictionary of 16493356 items
2020-09-01 11:20:03,225 : INFO : sample=0.001 downsamples 33 most-common words
2020-09-01 11:20:03,226 : INFO : downsampling leaves estimated 1287349383 word corpus (88.4% of prior 1455500579)
2020-09-01 11:20:41,237 : INFO : estimated required memory for 1705201 words, 1896347 buckets and 300 dimensions: 7897480828 bytes
2020-09-01 11:20:41,343 : INFO : resetting layer weights



corpus count: 28276058; vocab: 1705201
end of building


In [6]:
# Sanity check: number of word types kept in the model vocabulary after build_vocab.
print(f'{len(model.wv.vocab)}')

1705201


In [None]:
print('start training\n')
# Stream the captions through a fresh MyCorpus(): its __iter__ re-reads the
# per-city/per-year CSVs on every pass, so the corpus can be iterated repeatedly.
# total_examples was taken from model.corpus_count right after build_vocab and
# lets the training log report progress as a percentage of examples.
model.train(sentences=MyCorpus(), total_examples=total_examples, epochs=40)
print('\nend of training')

2020-09-01 11:30:56,036 : INFO : training model with 20 workers on 1705201 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=20 window=15


start training



2020-09-01 11:31:36,440 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 213 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:31:37,532 : INFO : EPOCH 1 - PROGRESS: at 0.01% examples, 2712 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:31:38,701 : INFO : EPOCH 1 - PROGRESS: at 0.02% examples, 5699 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:31:39,769 : INFO : EPOCH 1 - PROGRESS: at 0.03% examples, 7747 words/s, in_qsize 38, out_qsize 3
2020-09-01 11:31:40,814 : INFO : EPOCH 1 - PROGRESS: at 0.04% examples, 10661 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:31:42,026 : INFO : EPOCH 1 - PROGRESS: at 0.05% examples, 12451 words/s, in_qsize 37, out_qsize 4
2020-09-01 11:31:43,118 : INFO : EPOCH 1 - PROGRESS: at 0.06% examples, 15470 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:31:44,262 : INFO : EPOCH 1 - PROGRESS: at 0.07% examples, 16912 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:31:45,305 : INFO : EPOCH 1 - PROGRESS: at 0.08% examples, 19372 words/s, in_qsize 39, out_qsize

2020-09-01 11:32:56,751 : INFO : EPOCH 1 - PROGRESS: at 0.75% examples, 72828 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:32:57,880 : INFO : EPOCH 1 - PROGRESS: at 0.76% examples, 73006 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:32:58,888 : INFO : EPOCH 1 - PROGRESS: at 0.77% examples, 73534 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:32:59,933 : INFO : EPOCH 1 - PROGRESS: at 0.78% examples, 73681 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:33:00,962 : INFO : EPOCH 1 - PROGRESS: at 0.79% examples, 74045 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:33:02,241 : INFO : EPOCH 1 - PROGRESS: at 0.80% examples, 74255 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:33:03,363 : INFO : EPOCH 1 - PROGRESS: at 0.82% examples, 74822 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:33:04,399 : INFO : EPOCH 1 - PROGRESS: at 0.82% examples, 75031 words/s, in_qsize 40, out_qsize 1
2020-09-01 11:33:05,437 : INFO : EPOCH 1 - PROGRESS: at 0.84% examples, 75506 words/s, in_qsize 39, out_

2020-09-01 11:34:17,208 : INFO : EPOCH 1 - PROGRESS: at 1.53% examples, 88888 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:34:18,276 : INFO : EPOCH 1 - PROGRESS: at 1.54% examples, 89061 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:34:19,343 : INFO : EPOCH 1 - PROGRESS: at 1.55% examples, 89106 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:34:20,386 : INFO : EPOCH 1 - PROGRESS: at 1.56% examples, 89288 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:34:21,411 : INFO : EPOCH 1 - PROGRESS: at 1.57% examples, 89306 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:34:22,504 : INFO : EPOCH 1 - PROGRESS: at 1.58% examples, 89544 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:34:23,571 : INFO : EPOCH 1 - PROGRESS: at 1.59% examples, 89543 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:34:24,600 : INFO : EPOCH 1 - PROGRESS: at 1.60% examples, 89727 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:34:25,642 : INFO : EPOCH 1 - PROGRESS: at 1.61% examples, 89819 words/s, in_qsize 39, out_

2020-09-01 11:35:37,541 : INFO : EPOCH 1 - PROGRESS: at 2.22% examples, 93765 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:35:38,555 : INFO : EPOCH 1 - PROGRESS: at 2.22% examples, 93736 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:35:39,680 : INFO : EPOCH 1 - PROGRESS: at 2.23% examples, 93702 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:35:40,693 : INFO : EPOCH 1 - PROGRESS: at 2.23% examples, 93582 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:35:41,722 : INFO : EPOCH 1 - PROGRESS: at 2.24% examples, 93610 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:35:42,807 : INFO : EPOCH 1 - PROGRESS: at 2.25% examples, 93558 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:35:44,079 : INFO : EPOCH 1 - PROGRESS: at 2.26% examples, 93507 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:35:45,098 : INFO : EPOCH 1 - PROGRESS: at 2.26% examples, 93417 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:35:46,191 : INFO : EPOCH 1 - PROGRESS: at 2.27% examples, 93422 words/s, in_qsize 40, out_

2020-09-01 11:36:59,694 : INFO : EPOCH 1 - PROGRESS: at 2.81% examples, 90933 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:37:00,977 : INFO : EPOCH 1 - PROGRESS: at 2.81% examples, 90920 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:37:02,135 : INFO : EPOCH 1 - PROGRESS: at 2.82% examples, 90823 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:37:03,249 : INFO : EPOCH 1 - PROGRESS: at 2.83% examples, 90855 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:37:04,343 : INFO : EPOCH 1 - PROGRESS: at 2.83% examples, 90774 words/s, in_qsize 39, out_qsize 1
2020-09-01 11:37:05,350 : INFO : EPOCH 1 - PROGRESS: at 2.84% examples, 90810 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:37:06,410 : INFO : EPOCH 1 - PROGRESS: at 2.85% examples, 90736 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:37:08,055 : INFO : EPOCH 1 - PROGRESS: at 2.86% examples, 90637 words/s, in_qsize 35, out_qsize 4
2020-09-01 11:37:09,247 : INFO : EPOCH 1 - PROGRESS: at 2.87% examples, 90790 words/s, in_qsize 38, out_

2020-09-01 11:38:19,747 : INFO : EPOCH 1 - PROGRESS: at 3.58% examples, 94584 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:38:20,779 : INFO : EPOCH 1 - PROGRESS: at 3.59% examples, 94676 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:38:21,794 : INFO : EPOCH 1 - PROGRESS: at 3.60% examples, 94634 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:38:22,997 : INFO : EPOCH 1 - PROGRESS: at 3.61% examples, 94709 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:38:24,230 : INFO : EPOCH 1 - PROGRESS: at 3.62% examples, 94739 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:38:25,514 : INFO : EPOCH 1 - PROGRESS: at 3.63% examples, 94815 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:38:26,658 : INFO : EPOCH 1 - PROGRESS: at 3.64% examples, 94825 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:38:27,905 : INFO : EPOCH 1 - PROGRESS: at 3.65% examples, 94928 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:38:28,949 : INFO : EPOCH 1 - PROGRESS: at 3.66% examples, 94958 words/s, in_qsize 39, out_

2020-09-01 11:39:40,873 : INFO : EPOCH 1 - PROGRESS: at 4.32% examples, 97170 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:39:41,890 : INFO : EPOCH 1 - PROGRESS: at 4.33% examples, 97230 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:39:42,979 : INFO : EPOCH 1 - PROGRESS: at 4.35% examples, 97243 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:39:43,999 : INFO : EPOCH 1 - PROGRESS: at 4.36% examples, 97302 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:39:45,024 : INFO : EPOCH 1 - PROGRESS: at 4.37% examples, 97311 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:39:46,056 : INFO : EPOCH 1 - PROGRESS: at 4.38% examples, 97351 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:39:47,132 : INFO : EPOCH 1 - PROGRESS: at 4.39% examples, 97367 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:39:48,298 : INFO : EPOCH 1 - PROGRESS: at 4.40% examples, 97414 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:39:49,346 : INFO : EPOCH 1 - PROGRESS: at 4.41% examples, 97419 words/s, in_qsize 39, out_

2020-09-01 11:41:01,170 : INFO : EPOCH 1 - PROGRESS: at 5.08% examples, 99074 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:41:02,192 : INFO : EPOCH 1 - PROGRESS: at 5.09% examples, 99079 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:41:03,200 : INFO : EPOCH 1 - PROGRESS: at 5.10% examples, 99099 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:41:04,346 : INFO : EPOCH 1 - PROGRESS: at 5.11% examples, 99126 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:41:05,387 : INFO : EPOCH 1 - PROGRESS: at 5.12% examples, 99156 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:41:06,441 : INFO : EPOCH 1 - PROGRESS: at 5.14% examples, 99197 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:41:07,487 : INFO : EPOCH 1 - PROGRESS: at 5.14% examples, 99183 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:41:08,598 : INFO : EPOCH 1 - PROGRESS: at 5.15% examples, 99215 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:41:09,652 : INFO : EPOCH 1 - PROGRESS: at 5.17% examples, 99257 words/s, in_qsize 40, out_

2020-09-01 11:42:19,254 : INFO : EPOCH 1 - PROGRESS: at 5.84% examples, 100749 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:42:20,324 : INFO : EPOCH 1 - PROGRESS: at 5.85% examples, 100782 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:42:21,345 : INFO : EPOCH 1 - PROGRESS: at 5.86% examples, 100771 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:42:22,551 : INFO : EPOCH 1 - PROGRESS: at 5.87% examples, 100796 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:42:23,635 : INFO : EPOCH 1 - PROGRESS: at 5.88% examples, 100813 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:42:24,801 : INFO : EPOCH 1 - PROGRESS: at 5.89% examples, 100819 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:42:25,831 : INFO : EPOCH 1 - PROGRESS: at 5.89% examples, 100819 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:42:27,041 : INFO : EPOCH 1 - PROGRESS: at 5.90% examples, 100817 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:42:28,054 : INFO : EPOCH 1 - PROGRESS: at 5.91% examples, 100870 words/s, in_qsize

2020-09-01 11:43:37,905 : INFO : EPOCH 1 - PROGRESS: at 6.59% examples, 102085 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:43:38,968 : INFO : EPOCH 1 - PROGRESS: at 6.60% examples, 102080 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:43:39,976 : INFO : EPOCH 1 - PROGRESS: at 6.61% examples, 102036 words/s, in_qsize 37, out_qsize 3
2020-09-01 11:43:41,264 : INFO : EPOCH 1 - PROGRESS: at 6.62% examples, 102068 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:43:42,281 : INFO : EPOCH 1 - PROGRESS: at 6.63% examples, 102126 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:43:43,449 : INFO : EPOCH 1 - PROGRESS: at 6.64% examples, 102119 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:43:44,504 : INFO : EPOCH 1 - PROGRESS: at 6.65% examples, 102147 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:43:45,509 : INFO : EPOCH 1 - PROGRESS: at 6.66% examples, 102150 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:43:46,610 : INFO : EPOCH 1 - PROGRESS: at 6.67% examples, 102173 words/s, in_qsize

2020-09-01 11:44:56,977 : INFO : EPOCH 1 - PROGRESS: at 7.35% examples, 103107 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:44:58,070 : INFO : EPOCH 1 - PROGRESS: at 7.36% examples, 103076 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:44:59,080 : INFO : EPOCH 1 - PROGRESS: at 7.37% examples, 103127 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:45:00,267 : INFO : EPOCH 1 - PROGRESS: at 7.38% examples, 103105 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:45:01,291 : INFO : EPOCH 1 - PROGRESS: at 7.39% examples, 103155 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:45:02,440 : INFO : EPOCH 1 - PROGRESS: at 7.40% examples, 103127 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:45:03,478 : INFO : EPOCH 1 - PROGRESS: at 7.41% examples, 103164 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:45:04,522 : INFO : EPOCH 1 - PROGRESS: at 7.41% examples, 103150 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:45:05,595 : INFO : EPOCH 1 - PROGRESS: at 7.42% examples, 103182 words/s, in_qsize

2020-09-01 11:46:16,478 : INFO : EPOCH 1 - PROGRESS: at 8.07% examples, 103911 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:46:17,520 : INFO : EPOCH 1 - PROGRESS: at 8.08% examples, 103908 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:46:18,771 : INFO : EPOCH 1 - PROGRESS: at 8.09% examples, 103946 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:46:19,788 : INFO : EPOCH 1 - PROGRESS: at 8.10% examples, 103953 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:46:20,889 : INFO : EPOCH 1 - PROGRESS: at 8.11% examples, 103980 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:46:22,027 : INFO : EPOCH 1 - PROGRESS: at 8.12% examples, 103974 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:46:23,115 : INFO : EPOCH 1 - PROGRESS: at 8.13% examples, 104002 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:46:24,131 : INFO : EPOCH 1 - PROGRESS: at 8.14% examples, 104000 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:46:25,181 : INFO : EPOCH 1 - PROGRESS: at 8.15% examples, 104031 words/s, in_qsize

2020-09-01 11:47:35,055 : INFO : EPOCH 1 - PROGRESS: at 8.81% examples, 104821 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:47:36,149 : INFO : EPOCH 1 - PROGRESS: at 8.82% examples, 104836 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:47:37,190 : INFO : EPOCH 1 - PROGRESS: at 8.84% examples, 104875 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:47:38,204 : INFO : EPOCH 1 - PROGRESS: at 8.84% examples, 104873 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:47:39,229 : INFO : EPOCH 1 - PROGRESS: at 8.86% examples, 104904 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:47:40,266 : INFO : EPOCH 1 - PROGRESS: at 8.86% examples, 104891 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:47:41,393 : INFO : EPOCH 1 - PROGRESS: at 8.87% examples, 104912 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:47:42,417 : INFO : EPOCH 1 - PROGRESS: at 8.88% examples, 104917 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:47:43,422 : INFO : EPOCH 1 - PROGRESS: at 8.89% examples, 104933 words/s, in_qsize

2020-09-01 11:48:52,885 : INFO : EPOCH 1 - PROGRESS: at 9.59% examples, 105641 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:48:53,959 : INFO : EPOCH 1 - PROGRESS: at 9.60% examples, 105656 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:48:55,127 : INFO : EPOCH 1 - PROGRESS: at 9.61% examples, 105662 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:48:56,128 : INFO : EPOCH 1 - PROGRESS: at 9.62% examples, 105660 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:48:57,200 : INFO : EPOCH 1 - PROGRESS: at 9.63% examples, 105675 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:48:58,242 : INFO : EPOCH 1 - PROGRESS: at 9.64% examples, 105685 words/s, in_qsize 40, out_qsize 1
2020-09-01 11:48:59,272 : INFO : EPOCH 1 - PROGRESS: at 9.65% examples, 105697 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:49:00,342 : INFO : EPOCH 1 - PROGRESS: at 9.66% examples, 105729 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:49:01,378 : INFO : EPOCH 1 - PROGRESS: at 9.67% examples, 105700 words/s, in_qsize

2020-09-01 11:50:11,316 : INFO : EPOCH 1 - PROGRESS: at 10.38% examples, 106365 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:50:12,352 : INFO : EPOCH 1 - PROGRESS: at 10.40% examples, 106397 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:50:13,399 : INFO : EPOCH 1 - PROGRESS: at 10.41% examples, 106390 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:50:14,422 : INFO : EPOCH 1 - PROGRESS: at 10.42% examples, 106393 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:50:15,521 : INFO : EPOCH 1 - PROGRESS: at 10.43% examples, 106405 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:50:16,577 : INFO : EPOCH 1 - PROGRESS: at 10.44% examples, 106413 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:50:17,624 : INFO : EPOCH 1 - PROGRESS: at 10.44% examples, 106414 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:50:18,774 : INFO : EPOCH 1 - PROGRESS: at 10.45% examples, 106405 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:50:19,834 : INFO : EPOCH 1 - PROGRESS: at 10.46% examples, 106435 words/s,

2020-09-01 11:51:28,911 : INFO : EPOCH 1 - PROGRESS: at 11.10% examples, 106879 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:51:29,927 : INFO : EPOCH 1 - PROGRESS: at 11.11% examples, 106917 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:51:31,105 : INFO : EPOCH 1 - PROGRESS: at 11.12% examples, 106892 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:51:32,129 : INFO : EPOCH 1 - PROGRESS: at 11.13% examples, 106888 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:51:33,138 : INFO : EPOCH 1 - PROGRESS: at 11.14% examples, 106927 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:51:34,151 : INFO : EPOCH 1 - PROGRESS: at 11.15% examples, 106910 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:51:35,202 : INFO : EPOCH 1 - PROGRESS: at 11.16% examples, 106923 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:51:36,228 : INFO : EPOCH 1 - PROGRESS: at 11.17% examples, 106919 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:51:37,257 : INFO : EPOCH 1 - PROGRESS: at 11.18% examples, 106942 words/s,

2020-09-01 11:52:45,842 : INFO : EPOCH 1 - PROGRESS: at 11.83% examples, 107364 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:52:46,845 : INFO : EPOCH 1 - PROGRESS: at 11.84% examples, 107387 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:52:47,854 : INFO : EPOCH 1 - PROGRESS: at 11.84% examples, 107364 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:52:48,888 : INFO : EPOCH 1 - PROGRESS: at 11.86% examples, 107397 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:52:49,925 : INFO : EPOCH 1 - PROGRESS: at 11.87% examples, 107378 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:52:50,952 : INFO : EPOCH 1 - PROGRESS: at 11.88% examples, 107411 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:52:51,954 : INFO : EPOCH 1 - PROGRESS: at 11.89% examples, 107414 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:52:52,997 : INFO : EPOCH 1 - PROGRESS: at 11.90% examples, 107421 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:52:54,047 : INFO : EPOCH 1 - PROGRESS: at 11.91% examples, 107415 words/s,

2020-09-01 11:54:03,299 : INFO : EPOCH 1 - PROGRESS: at 12.57% examples, 107733 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:54:04,344 : INFO : EPOCH 1 - PROGRESS: at 12.58% examples, 107752 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:54:05,386 : INFO : EPOCH 1 - PROGRESS: at 12.59% examples, 107740 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:54:06,499 : INFO : EPOCH 1 - PROGRESS: at 12.60% examples, 107753 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:54:07,510 : INFO : EPOCH 1 - PROGRESS: at 12.61% examples, 107749 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:54:08,521 : INFO : EPOCH 1 - PROGRESS: at 12.62% examples, 107777 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:54:09,530 : INFO : EPOCH 1 - PROGRESS: at 12.63% examples, 107767 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:54:10,579 : INFO : EPOCH 1 - PROGRESS: at 12.64% examples, 107792 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:54:11,625 : INFO : EPOCH 1 - PROGRESS: at 12.65% examples, 107785 words/s,

2020-09-01 11:55:22,650 : INFO : EPOCH 1 - PROGRESS: at 13.29% examples, 108104 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:55:23,729 : INFO : EPOCH 1 - PROGRESS: at 13.31% examples, 108112 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:55:24,768 : INFO : EPOCH 1 - PROGRESS: at 13.32% examples, 108112 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:55:25,802 : INFO : EPOCH 1 - PROGRESS: at 13.32% examples, 108119 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:55:26,810 : INFO : EPOCH 1 - PROGRESS: at 13.33% examples, 108127 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:55:27,883 : INFO : EPOCH 1 - PROGRESS: at 13.34% examples, 108119 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:55:28,886 : INFO : EPOCH 1 - PROGRESS: at 13.35% examples, 108139 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:55:29,970 : INFO : EPOCH 1 - PROGRESS: at 13.35% examples, 108107 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:55:31,175 : INFO : EPOCH 1 - PROGRESS: at 13.37% examples, 108125 words/s,

2020-09-01 11:56:40,137 : INFO : EPOCH 1 - PROGRESS: at 14.02% examples, 108484 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:56:41,235 : INFO : EPOCH 1 - PROGRESS: at 14.03% examples, 108480 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:56:42,247 : INFO : EPOCH 1 - PROGRESS: at 14.04% examples, 108498 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:56:43,254 : INFO : EPOCH 1 - PROGRESS: at 14.04% examples, 108489 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:56:44,331 : INFO : EPOCH 1 - PROGRESS: at 14.06% examples, 108503 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:56:45,351 : INFO : EPOCH 1 - PROGRESS: at 14.06% examples, 108498 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:56:46,374 : INFO : EPOCH 1 - PROGRESS: at 14.07% examples, 108517 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:56:47,425 : INFO : EPOCH 1 - PROGRESS: at 14.08% examples, 108516 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:56:48,447 : INFO : EPOCH 1 - PROGRESS: at 14.09% examples, 108528 words/s,

2020-09-01 11:57:59,019 : INFO : EPOCH 1 - PROGRESS: at 14.78% examples, 108858 words/s, in_qsize 38, out_qsize 1
2020-09-01 11:58:00,036 : INFO : EPOCH 1 - PROGRESS: at 14.78% examples, 108838 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:58:01,205 : INFO : EPOCH 1 - PROGRESS: at 14.80% examples, 108861 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:58:02,242 : INFO : EPOCH 1 - PROGRESS: at 14.81% examples, 108851 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:58:03,384 : INFO : EPOCH 1 - PROGRESS: at 14.82% examples, 108870 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:58:04,468 : INFO : EPOCH 1 - PROGRESS: at 14.83% examples, 108862 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:58:05,481 : INFO : EPOCH 1 - PROGRESS: at 14.84% examples, 108885 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:58:06,523 : INFO : EPOCH 1 - PROGRESS: at 14.85% examples, 108874 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:58:07,545 : INFO : EPOCH 1 - PROGRESS: at 14.87% examples, 108896 words/s,

2020-09-01 11:59:16,882 : INFO : EPOCH 1 - PROGRESS: at 15.55% examples, 109204 words/s, in_qsize 40, out_qsize 0
2020-09-01 11:59:17,909 : INFO : EPOCH 1 - PROGRESS: at 15.56% examples, 109174 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:59:18,915 : INFO : EPOCH 1 - PROGRESS: at 15.57% examples, 109196 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:59:20,009 : INFO : EPOCH 1 - PROGRESS: at 15.58% examples, 109182 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:59:21,020 : INFO : EPOCH 1 - PROGRESS: at 15.59% examples, 109203 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:59:22,027 : INFO : EPOCH 1 - PROGRESS: at 15.60% examples, 109200 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:59:23,033 : INFO : EPOCH 1 - PROGRESS: at 15.61% examples, 109206 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:59:24,129 : INFO : EPOCH 1 - PROGRESS: at 15.62% examples, 109197 words/s, in_qsize 39, out_qsize 0
2020-09-01 11:59:25,194 : INFO : EPOCH 1 - PROGRESS: at 15.63% examples, 109210 words/s,

2020-09-01 12:00:35,293 : INFO : EPOCH 1 - PROGRESS: at 16.31% examples, 109533 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:00:36,444 : INFO : EPOCH 1 - PROGRESS: at 16.32% examples, 109530 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:00:37,450 : INFO : EPOCH 1 - PROGRESS: at 16.33% examples, 109551 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:00:38,524 : INFO : EPOCH 1 - PROGRESS: at 16.33% examples, 109538 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:00:39,539 : INFO : EPOCH 1 - PROGRESS: at 16.34% examples, 109549 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:00:40,612 : INFO : EPOCH 1 - PROGRESS: at 16.35% examples, 109541 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:00:41,645 : INFO : EPOCH 1 - PROGRESS: at 16.36% examples, 109556 words/s, in_qsize 38, out_qsize 1
2020-09-01 12:00:42,709 : INFO : EPOCH 1 - PROGRESS: at 16.37% examples, 109563 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:00:43,776 : INFO : EPOCH 1 - PROGRESS: at 16.39% examples, 109575 words/s,

2020-09-01 12:01:52,062 : INFO : EPOCH 1 - PROGRESS: at 17.06% examples, 109826 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:01:53,091 : INFO : EPOCH 1 - PROGRESS: at 17.06% examples, 109802 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:01:54,147 : INFO : EPOCH 1 - PROGRESS: at 17.07% examples, 109815 words/s, in_qsize 40, out_qsize 1
2020-09-01 12:01:55,274 : INFO : EPOCH 1 - PROGRESS: at 17.08% examples, 109804 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:01:56,349 : INFO : EPOCH 1 - PROGRESS: at 17.10% examples, 109829 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:01:57,395 : INFO : EPOCH 1 - PROGRESS: at 17.10% examples, 109824 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:01:58,432 : INFO : EPOCH 1 - PROGRESS: at 17.12% examples, 109842 words/s, in_qsize 40, out_qsize 1
2020-09-01 12:01:59,485 : INFO : EPOCH 1 - PROGRESS: at 17.13% examples, 109821 words/s, in_qsize 40, out_qsize 1
2020-09-01 12:02:00,692 : INFO : EPOCH 1 - PROGRESS: at 17.15% examples, 109839 words/s,

2020-09-01 12:04:12,562 : INFO : EPOCH 1 - PROGRESS: at 17.72% examples, 106526 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:04:13,686 : INFO : EPOCH 1 - PROGRESS: at 17.73% examples, 106522 words/s, in_qsize 40, out_qsize 0
2020-09-01 12:04:14,733 : INFO : EPOCH 1 - PROGRESS: at 17.74% examples, 106522 words/s, in_qsize 39, out_qsize 3
2020-09-01 12:04:15,785 : INFO : EPOCH 1 - PROGRESS: at 17.75% examples, 106531 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:04:16,837 : INFO : EPOCH 1 - PROGRESS: at 17.76% examples, 106540 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:04:17,860 : INFO : EPOCH 1 - PROGRESS: at 17.76% examples, 106533 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:04:18,967 : INFO : EPOCH 1 - PROGRESS: at 17.77% examples, 106547 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:04:20,049 : INFO : EPOCH 1 - PROGRESS: at 17.78% examples, 106550 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:04:21,099 : INFO : EPOCH 1 - PROGRESS: at 17.79% examples, 106554 words/s,

2020-09-01 12:05:29,457 : INFO : EPOCH 1 - PROGRESS: at 18.35% examples, 106814 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:05:30,488 : INFO : EPOCH 1 - PROGRESS: at 18.36% examples, 106803 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:05:31,553 : INFO : EPOCH 1 - PROGRESS: at 18.36% examples, 106811 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:05:32,582 : INFO : EPOCH 1 - PROGRESS: at 18.37% examples, 106803 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:05:33,841 : INFO : EPOCH 1 - PROGRESS: at 18.38% examples, 106822 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:05:35,063 : INFO : EPOCH 1 - PROGRESS: at 18.39% examples, 106809 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:05:36,518 : INFO : EPOCH 1 - PROGRESS: at 18.40% examples, 106817 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:05:37,559 : INFO : EPOCH 1 - PROGRESS: at 18.41% examples, 106834 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:05:39,046 : INFO : EPOCH 1 - PROGRESS: at 18.42% examples, 106820 words/s,

2020-09-01 12:06:47,875 : INFO : EPOCH 1 - PROGRESS: at 18.97% examples, 107071 words/s, in_qsize 38, out_qsize 1
2020-09-01 12:06:48,889 : INFO : EPOCH 1 - PROGRESS: at 18.98% examples, 107088 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:06:49,893 : INFO : EPOCH 1 - PROGRESS: at 18.99% examples, 107082 words/s, in_qsize 37, out_qsize 2
2020-09-01 12:06:50,932 : INFO : EPOCH 1 - PROGRESS: at 19.00% examples, 107094 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:06:52,044 : INFO : EPOCH 1 - PROGRESS: at 19.01% examples, 107091 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:06:53,067 : INFO : EPOCH 1 - PROGRESS: at 19.02% examples, 107104 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:06:54,167 : INFO : EPOCH 1 - PROGRESS: at 19.02% examples, 107098 words/s, in_qsize 38, out_qsize 1
2020-09-01 12:06:55,212 : INFO : EPOCH 1 - PROGRESS: at 19.03% examples, 107114 words/s, in_qsize 40, out_qsize 1
2020-09-01 12:06:56,226 : INFO : EPOCH 1 - PROGRESS: at 19.04% examples, 107095 words/s,

2020-09-01 12:08:06,117 : INFO : EPOCH 1 - PROGRESS: at 19.60% examples, 107321 words/s, in_qsize 38, out_qsize 1
2020-09-01 12:08:07,132 : INFO : EPOCH 1 - PROGRESS: at 19.61% examples, 107330 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:08:08,242 : INFO : EPOCH 1 - PROGRESS: at 19.61% examples, 107331 words/s, in_qsize 38, out_qsize 1
2020-09-01 12:08:09,284 : INFO : EPOCH 1 - PROGRESS: at 19.62% examples, 107346 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:08:10,376 : INFO : EPOCH 1 - PROGRESS: at 19.63% examples, 107336 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:08:11,405 : INFO : EPOCH 1 - PROGRESS: at 19.64% examples, 107337 words/s, in_qsize 39, out_qsize 1
2020-09-01 12:08:12,548 : INFO : EPOCH 1 - PROGRESS: at 19.65% examples, 107344 words/s, in_qsize 39, out_qsize 0
2020-09-01 12:08:13,673 : INFO : EPOCH 1 - PROGRESS: at 19.65% examples, 107344 words/s, in_qsize 39, out_qsize 2
2020-09-01 12:08:14,712 : INFO : EPOCH 1 - PROGRESS: at 19.66% examples, 107352 words/s,

In [None]:
# Persist the trained model next to a small human-readable description file.
model.save(MODEL_PATH + 'mdl')

# NOTE(review): the hyperparameters and date recorded below are hard-coded and
# do not match the FastText(...) constructor used earlier in this notebook
# (size=300, window=15, min_count=10, word_ngrams=2, negative=20), nor the
# 2020-09-01 timestamps in the training log. Confirm the real values
# (including epochs) and update these strings before relying on this file.
description_lines = [
    'date: 08.05\n',
    'params: size=300, min_count=1, window=5, negative=10, word_ngrams=5, epochs=5\n',
    f'sourse: {SOURCE_PATH}\n',
    'cities: moscow, spb; only russian\n',
    'without deleting hastags \n',
]

# The context manager guarantees the file is flushed and closed even if a
# write fails part-way through.
with open(MODEL_PATH + 'description.txt', 'w', encoding='utf-8') as f:
    f.writelines(description_lines)

print('success')

In [7]:
# Sanity-check the embeddings: each line of tests.txt holds a pair of
# whitespace-separated words; print their cosine similarity under the model.
# The file contains Cyrillic tokens, so the encoding is stated explicitly
# rather than relying on the platform default.
with open('tests.txt', encoding='utf-8') as f:
    tests = [line.split() for line in f]

for test in tests:
    # Blank or single-token lines cannot form a pair; skip them instead of
    # failing with IndexError on test[1].
    if len(test) < 2:
        continue
    print(model.wv.similarity(test[0], test[1]), test[0], test[1])
    

0.80805814 8марта 23февраля
0.27433598 8марта выставка
0.48473305 8марта рождество
0.37270707 8марта свадьба
0.53255945 8марта 9мая
0.24352027 8марта сплин
0.49690336 сплин слот
0.57352257 сплин rammstein
0.32789782 сплин рождество
0.2025525 сплин свадьба
0.20832917 сплин маникюр
0.26205045 сплин спб
0.65433633 сплин ддт
0.60260904 сплин концерт
0.5566946 сплин песня
0.5037586 сплин нойз
0.6719568 зенит ска
0.61660373 зенит хоккей
0.6510804 зенит баскетбол
0.39403272 зенит сплин
0.28050265 зенит 8марта
0.30077258 зенит рождество
0.7404321 зенит футбол
0.6728875 зенит арена
0.48191747 зенит мяч
0.6609002 зенит гол
0.79878855 зенит спартак
0.19693191 зенит маникюр
0.16906635 зенит сушь
0.37501115 зенит тенис
0.46583143 тенис волебол
0.22568047 тенис марафон
0.23857808 тенис лыжа
0.38073522 зенит спорт
0.50807 тенис спорт
0.5412748 баскетбол спорт
0.6130132 итмо университет


In [7]:
# Same word-pair similarity check as the earlier evaluation cell; the scores
# printed below differ from that cell's output, so this is presumably a re-run
# against a differently trained model (confirm which run each output belongs to).
# The file contains Cyrillic tokens, so the encoding is stated explicitly
# rather than relying on the platform default.
with open('tests.txt', encoding='utf-8') as f:
    tests = [line.split() for line in f]

for test in tests:
    # Blank or single-token lines cannot form a pair; skip them instead of
    # failing with IndexError on test[1].
    if len(test) < 2:
        continue
    print(model.wv.similarity(test[0], test[1]), test[0], test[1])

0.77465004 8марта 23февраля
0.26400387 8марта выставка
0.44828498 8марта рождество
0.3088644 8марта свадьба
0.47429362 8марта 9мая
0.19591044 8марта сплин
0.3752808 сплин слот
0.5153809 сплин rammstein
0.2440366 сплин рождество
0.17200556 сплин свадьба
0.12906846 сплин маникюр
0.14062892 сплин спб
0.6133099 сплин ддт
0.55933976 сплин концерт
0.56913275 сплин песня
0.50104636 сплин нойз
0.6024536 зенит ска
0.5754436 зенит хоккей
0.5914444 зенит баскетбол
0.32471004 зенит сплин
0.22286703 зенит 8марта
0.24321106 зенит рождество
0.7139603 зенит футбол
0.6243685 зенит арена
0.42467847 зенит мяч
0.6441635 зенит гол
0.7541795 зенит спартак
0.13084683 зенит маникюр
0.11200154 зенит сушь
0.36260748 зенит тенис
0.43142712 тенис волебол
0.16143346 тенис марафон
0.4329789 тенис лыжа
0.29186195 зенит спорт
0.43753514 тенис спорт
0.5064553 баскетбол спорт
0.6249611 итмо университет
