In [1]:
import os
import re
import gensim
import nltk.data
import logging
from os.path import join
from gensim.models import word2vec
from nltk.tokenize import sent_tokenize
from pprint import pprint

# Creating corpora

In [7]:
# Configure the root logger: show INFO and above, prefixing each record with
# its timestamp and level (presumably so library progress messages are visible
# while the models train).
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)

In [4]:
# Collect the names of all files that sit directly inside the corpus directory.
# NOTE: `path` (the corpus directory) is not assigned anywhere in this notebook;
# it must be defined before this cell runs, e.g. path = r'<your corpus folder>'.

# Only the first os.walk() entry (the top-level directory) is needed, so take it
# with next() instead of walking the whole tree. os.walk() silently yields
# nothing for a missing/unreadable directory, hence the explicit check.
top_level = next(os.walk(path), None)
if top_level is None:
    raise FileNotFoundError('Cannot list corpus directory: {!r}'.format(path))

# Each walk entry is (dirpath, dirnames, filenames); index 2 is the file names.
# Sorting makes the file order (and therefore the corpus order) stable between runs.
authors = sorted(top_level[2])
print('Authors collected:', len(authors))

Authors collected: 228635


### 'Base' corpus with no added tags

In [11]:
# Build the 'base' corpus: for every source file, extract the brace-delimited
# lemmas sentence by sentence and write one line of space-separated lemmas per
# sentence to stihi.ru_corpus.txt.

# NOTE(review): 'нур' looks like an artifact token to be dropped - confirm.
# The pattern is unanchored, so it also matches inside longer words.
nur = re.compile('нур ?')
# Strips angle-bracket tags, newlines and question marks before tokenizing.
re_strip = re.compile(r'(<.*>|\n|\?)')
# Captures whatever stands between '{' and '}'.
re_lemma = re.compile(r'{(.*?)}')

with open('stihi.ru_corpus.txt', 'w', encoding='UTF-8') as F:
    for a in authors:
        # Read from the same directory the file list was collected from.
        title = join(path, a)
        with open(title, encoding='UTF-8') as f:
            t = f.read()
        poem = re_strip.sub('', t)
        # NOTE(review): sent_tokenize is called with its default language
        # model - confirm that is intended for this (Russian) text.
        lem_sent = [' '.join(re_lemma.findall(s)) for s in sent_tokenize(poem)]
        corpus = nur.sub(' ', '\n'.join(lem_sent))
        if corpus:
            # Terminate each file's block with a newline; otherwise the last
            # lemma of this file is glued to the first lemma of the next one.
            F.write(corpus + '\n')

### Corpus with /RHYME tag

In [26]:
# Build the corpus in which the last lemma of every line carries a /RHYME tag,
# written as one line of space-separated lemmas per sentence to
# stihi.ru_corpusRHYMED.txt.

# Strips angle-bracket tags, question marks, and everything from a tab to the
# end of its line.
re_meta = re.compile(r'(<.*>|\?|\t(.*))')
# Matches a '{word}' group that is followed only by non-word characters up to
# the end of a line, i.e. the last brace group on that line.
re_zone = re.compile(r'{(\w*)}(\W*)$', re.MULTILINE)
# Re-emits that group with '/RHYME' appended inside the braces.
rhyme = r'{\1/RHYME}\2'
# Captures whatever stands between '{' and '}'.
re_lemmas = re.compile('{(.*?)}')

with open('stihi.ru_corpusRHYMED.txt', 'w', encoding='UTF-8') as F:
    for a in authors:
        # Read from the same directory the file list was collected from.
        title = join(path, a)
        with open(title, encoding='UTF-8') as f:
            t = f.read()
        poem = re_meta.sub('', t)
        poem = re_zone.sub(rhyme, poem)
        # NOTE(review): sent_tokenize is called with its default language
        # model - confirm that is intended for this (Russian) text.
        lem_sent = [' '.join(re_lemmas.findall(s)) for s in sent_tokenize(poem)]
        corpus = '\n'.join(lem_sent)
        if corpus:
            # Terminate each file's block with a newline; otherwise the last
            # lemma of this file is glued to the first lemma of the next one.
            F.write(corpus + '\n')

Wall time: 1h 36min 17s


### Only-rhyme corpus

In [None]:
%%time
re_meta = re.compile('(<.*>|\?|\t(.*))')
re_rhyme = re.compile(r'{(\w*)}\W*$', re.MULTILINE)
with open('stihi.ru_rhyme_corpus.txt', 'w', encoding='UTF-8') as F:
    for a in authors:
        title = r'C:\Users\User\Desktop\универ\курсовая\stihi_ru_makup_lemmed\stihi_ru_makup_lemmed\\' + a
        with open(title, encoding='UTF-8') as f:
            t = f.read()
        poem = re.sub(re_meta, '', t)
        stanza = poem.split('\n\n')
        poem_rhymes = []
        for s in stanza:
            st_rhymes = re.findall(re_rhyme, s)
            poem_rhymes.append(' '.join(st_rhymes))
        corpus = '\n'.join(poem_rhymes)
        c = F.write(corpus)

# Creating vector models

### 'Base' corpus

In [None]:
# Point gensim at the base corpus file; LineSentence iterates over the file
# line by line instead of loading it into memory.
f = 'stihi.ru_corpus.txt'
data = word2vec.LineSentence(source=f)

In [None]:
# Train one Word2Vec model per context-window size on the base corpus and save
# each as a binary word2vec file named model_stihi.ruWIN<window>.bin.
# All other hyper-parameters are shared across the runs.
# NOTE(review): `size` and `iter` are the pre-4.0 gensim parameter names;
# gensim 4.x expects `vector_size` and `epochs` - confirm the installed version.
for window in (2, 5, 8, 15):
    model = gensim.models.Word2Vec(data, size=300, window=window, min_count=1000, workers=4, iter=5)
    # replace=True keeps only the normalized vectors (the model cannot be
    # trained further afterwards, which is fine since it is only exported).
    model.init_sims(replace=True)
    model_path = "model_stihi.ruWIN{}.bin".format(window)
    model.wv.save_word2vec_format(model_path, binary=True)

### Corpus with /RHYME tag

In [None]:
# Point gensim at the /RHYME-tagged corpus file; LineSentence iterates over the
# file line by line instead of loading it into memory.
f = 'stihi.ru_corpusRHYMED.txt'
data = word2vec.LineSentence(source=f)

In [None]:
# Train one Word2Vec model per context-window size on the /RHYME-tagged corpus
# and save each as a binary word2vec file named
# model_stihi.ruRHYMEDWIN<window>.bin.
# All other hyper-parameters are shared across the runs.
# NOTE(review): `size` and `iter` are the pre-4.0 gensim parameter names;
# gensim 4.x expects `vector_size` and `epochs` - confirm the installed version.
for window in (2, 5, 8, 15):
    model = gensim.models.Word2Vec(data, size=300, window=window, min_count=1000, workers=4, iter=5)
    # replace=True keeps only the normalized vectors (the model cannot be
    # trained further afterwards, which is fine since it is only exported).
    model.init_sims(replace=True)
    model_path = "model_stihi.ruRHYMEDWIN{}.bin".format(window)
    model.wv.save_word2vec_format(model_path, binary=True)

### Only-rhyme corpus

In [None]:
# Point gensim at the rhyme-only corpus file; LineSentence iterates over the
# file line by line instead of loading it into memory.
f = 'stihi.ru_rhyme_corpus.txt'
data = word2vec.LineSentence(source=f)

In [None]:
# Train one Word2Vec model per context-window size on the rhyme-only corpus and
# save each as a binary word2vec file named model_stihi.rurhymesWIN<window>.bin.
# min_count=1 here (unlike the other corpora), so every word is kept in the
# vocabulary regardless of frequency.
# NOTE(review): `size` and `iter` are the pre-4.0 gensim parameter names;
# gensim 4.x expects `vector_size` and `epochs` - confirm the installed version.
for window in (2, 5, 7):
    model = gensim.models.Word2Vec(data, size=300, window=window, min_count=1, workers=4, iter=5)
    # replace=True keeps only the normalized vectors (the model cannot be
    # trained further afterwards, which is fine since it is only exported).
    model.init_sims(replace=True)
    model_path = "model_stihi.rurhymesWIN{}.bin".format(window)
    model.wv.save_word2vec_format(model_path, binary=True)