In [1]:
import os.path

directory_with_texts = 'texts-in-subdirs/'

# Walk the directory tree and keep the joined path of every file whose
# name ends with '.txt'.
paths_to_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk(directory_with_texts)
    for name in names
    if name.endswith('.txt')
]

print('Searching in directory:\n\t{}'.format(directory_with_texts))
print('Number of TXT files found:\n\t{}'.format(len(paths_to_files)))

Searching in directory:
	texts-in-subdirs/
Number of TXT files found:
	40


In [2]:
# Read every collected file into `corpus` as one whitespace-normalised string.
# `paths_to_files` is kept index-aligned with `corpus`: a path whose file is
# empty or cannot be decoded is removed from the list.
corpus = []

# Iterate over a copy because unusable paths are removed from the original
# list inside the loop.
for path in paths_to_files[:]: # easy to limit as [:5] for example
    # Handle decoding problems per file, so that one badly encoded file is
    # reported and skipped instead of silently ending the whole read loop.
    try:
        with open(file = path, mode = 'r', encoding = 'utf-8') as text_file:
            raw_text = text_file.read()
    except UnicodeDecodeError:
        print('ERROR! Make sure files are encoded in UTF-8 without BOM')
        print('Skipped since not readable as UTF-8:\n\t{}'.format(path))
        paths_to_files.remove(path)
        continue

    # split() without arguments splits on any whitespace run, so joining with
    # a single space collapses newlines, tabs and repeated spaces.
    document = ' '.join(raw_text.split())
    if document != '':
        corpus.append(document)
    else:
        print('Skipped since no text found within:\n\t{}'.format(path))
        paths_to_files.remove(path)

if len(corpus) != 0:
    print('Number of documents going to corpus:\n\t{}'.format(len(corpus)))
else:
    print('ERROR! No texts found in directory:\n\t{}'.format(directory_with_texts))

Number of documents going to corpus:
	40


In [3]:
from tqdm import tqdm

from mosestokenizer import MosesPunctuationNormalizer, MosesTokenizer

# Normalise punctuation and tokenize every document of `corpus`.
corpus_tokenized = []

# Open the normalizer and the tokenizer once and reuse them for all documents
# (the detokenizer is used the same way later in this notebook) instead of
# re-creating both context managers for every single document.
with MosesPunctuationNormalizer('ru') as normalize, MosesTokenizer('ru') as tokenize:
    for document in tqdm(corpus, desc='Tokenization with mosestokenizer', leave=True):
        corpus_tokenized.append(tokenize(normalize(document)))

Tokenization with mosestokenizer: 100%|██████████| 40/40 [00:03<00:00, 14.22it/s]


In [4]:
# fixing dashed tokens like 'кое-где' and 'ку-ка-ре-ку'
def _merge_dashed_tokens(tokens):
    """Return a new token list with every '@-@' marker glued to its neighbours.

    ['кое', '@-@', 'где'] becomes ['кое-где']; chains such as
    ['ку', '@-@', 'ка', '@-@', 'ре', '@-@', 'ку'] become ['ку-ка-ре-ку'].
    A marker with no left or right neighbour simply contributes a '-'.

    The list is rebuilt in one pass. Removing neighbours with
    list.remove(value) while enumerating would delete the first token that is
    *equal* to the neighbour, which is the wrong one whenever the same word
    occurs earlier in the document.
    """
    merged = []
    glue_next = False  # True right after a marker: next token joins merged[-1]
    for token in tokens:
        if token == '@-@':
            if merged:
                merged[-1] += '-'
            else:
                merged.append('-')
            glue_next = True
        elif glue_next:
            merged[-1] += token
            glue_next = False
        else:
            merged.append(token)
    return merged


for document_tokenized in corpus_tokenized:
    # Slice assignment updates the existing list object in place, so
    # `corpus_tokenized` keeps holding the same (now fixed) lists.
    document_tokenized[:] = _merge_dashed_tokens(document_tokenized)

In [5]:
import pymorphy2
pymorphy = pymorphy2.MorphAnalyzer()

# For every token keep the normal form of the first parse returned by
# pymorphy; the result mirrors `corpus_tokenized` (one list per document).
corpus_lemma = [
    [pymorphy.parse(token)[0].normal_form for token in document_tokenized]
    for document_tokenized in tqdm(corpus_tokenized, desc='Lemmatization by pymorphy', leave=True)
]

Lemmatization by pymorphy: 100%|██████████| 40/40 [00:17<00:00,  1.36it/s]


In [6]:
# Flat list of strings that are treated as punctuation.
# Use ord('<symbol>') to look up the code point of any further symbol to add.
def _chars(first, last):
    """Return the characters with code points first..last, both ends included."""
    return [chr(code) for code in range(first, last + 1)]


punctuation_soup = (
    ['&apos;', '&quot;']        # from mosestokenizer's normalization
    + ['..', '...', '....']     # several symbols
    + _chars(33, 47)
    + _chars(58, 64)
    + _chars(91, 96)
    + _chars(123, 192)
    + _chars(215, 216)
    + _chars(690, 879)
    + _chars(8190, 8471)
)

In [7]:
file_with_stop_words = 'stop_words.txt'

# Lemmatized stop words, one entry per non-empty line of the file.
# Stays empty when the file is missing or cannot be decoded.
stop_words = []

try:
    with open(file_with_stop_words, 'r', encoding = 'utf-8') as stop_words_file:
        for raw_line in stop_words_file.readlines():
            # collapse any whitespace (including the trailing newline)
            collapsed = ' '.join(raw_line.split())
            normal_form = pymorphy.parse(collapsed)[0].normal_form
            if normal_form:
                stop_words.append(normal_form)
except FileNotFoundError:
    print('Note! File "{}" not found.'.format(file_with_stop_words))
except UnicodeDecodeError:
    print('ERROR! Make sure file "{}" is encoded in UTF-8 without BOM'.format(file_with_stop_words))

# de-duplicate and sort in one step
stop_words = sorted(set(stop_words))
print('Number of stop words predefined in file "{}":\n\t{}'.format(file_with_stop_words, len(stop_words)))

Number of stop words predefined in file "stop_words.txt":
	193


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer for documents that are already lists of lemmas: the
# preprocessor and tokenizer hand their input through unchanged.
tfidf = TfidfVectorizer(
    preprocessor=lambda tokens: tokens,  # done before for better control
    tokenizer=lambda tokens: tokens,     # done before for better control
    token_pattern=None,                  # for sure
    lowercase=False,                     # for sure
    min_df=1,                            # no cut-off
    # strip_accents='unicode',           # maybe not needed
    stop_words=stop_words + punctuation_soup,
    norm='l2',
)

In [9]:
# Report a few of the vectorizer's settings, reading the parameters once.
tfidf_params = tfidf.get_params()

print('Some of TFIDF parameters:')
for line_format, param_name in (('\t max_df:\t{}', 'max_df'),
                                ('\t min_df:\t{}', 'min_df'),
                                ('\t norm:\t\t{}', 'norm'),
                                ('\t smooth_idf:\t{}', 'smooth_idf'),
                                ('\t sublinear_tf:\t{}', 'sublinear_tf')):
    print(line_format.format(tfidf_params.get(param_name)))

Some of TFIDF parameters:
	 max_df:	1.0
	 min_df:	1
	 norm:		l2
	 smooth_idf:	True
	 sublinear_tf:	False


In [10]:
# Fit the vectorizer on the lemmatized corpus and get the document-term matrix.
tfidf_matrix = tfidf.fit_transform(corpus_lemma)

# Largest TF-IDF weight in the whole matrix.
max_tfidf = tfidf_matrix.max()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when the installed version has it, and fall back to
# the old method otherwise. Either way `terms` is a plain list of str.
if hasattr(tfidf, 'get_feature_names_out'):
    terms = tfidf.get_feature_names_out().tolist()
else:
    terms = tfidf.get_feature_names()

# Dense copy of the matrix as nested Python lists (one inner list per document).
tfidf_lists = tfidf_matrix.toarray().tolist()

In [11]:
# let's create list of dictionaries where keys are words, and values are TFIDF
from tqdm import trange

# One {term: tfidf weight} dictionary per row of `tfidf_lists`.
tfidf_dicts = [
    dict(zip(terms, tfidf_lists[row_number]))
    for row_number in trange(len(tfidf_lists), desc='Bag of words and TFIDF', leave=True)
]

Bag of words and TFIDF: 100%|██████████| 40/40 [00:00<00:00, 330.59it/s]


In [12]:
# Look-up table: term -> IDF value taken from the fitted `tfidf` vectorizer.
idf_dict = {term: idf for term, idf in zip(terms, tfidf.idf_)}

In [13]:
# Same configuration as the `tfidf` vectorizer but with IDF weighting turned
# off, so the resulting weights are L2-normalized term frequencies.
tf = TfidfVectorizer(
    preprocessor=lambda tokens: tokens,  # done before for better control
    tokenizer=lambda tokens: tokens,     # done before for better control
    token_pattern=None,                  # for sure
    lowercase=False,                     # for sure
    min_df=1,                            # no cut-off
    # strip_accents='unicode',           # maybe not needed
    stop_words=stop_words + punctuation_soup,
    use_idf=False,
    norm='l2',
)

In [14]:
# Fit the IDF-free vectorizer on the lemmatized corpus.
tf_matrix = tf.fit_transform(corpus_lemma)

# Dense copy as nested Python lists, converted row by row.
tf_lists = [dense_row.tolist() for dense_row in tf_matrix.toarray()]

In [15]:
# One {term: tf weight} dictionary per document, in corpus order.
tf_dicts = [dict(zip(terms, tf_row)) for tf_row in tf_lists]

In [16]:
# Dictionary term -> sum of that term's TF weight over all documents.
# Every dictionary in `tf_dicts` is built from the same `terms`, so the keys
# of the first one cover the whole vocabulary.
terms_counts = [
    sum([doc_weights.get(each_lemm) for doc_weights in tf_dicts])
    for each_lemm in tf_dicts[0]
]

term_global_freq = dict(zip(tf_dicts[0], terms_counts))

In [17]:
# Dictionary term -> number of documents whose TF weight for it is not 0.
# It does not matter which dictionary the vocabulary is taken from: all of
# them are built from the same `terms`.
docs_counts = [
    sum(1 for doc_weights in tf_dicts if doc_weights.get(each_lemm) != 0)
    for each_lemm in tf_dicts[0]
]

diff_docs = dict(zip(tf_dicts[0], docs_counts))

In [18]:
# Wrap every token of every document in HTML:
#   * punctuation is passed through untouched,
#   * stop words are wrapped in <stopword>…</stopword>,
#   * all other words become a <span> whose font size grows with the word's
#     TF-IDF weight (100% … 300%) and whose tooltip shows the numbers.
corpus_tagged = []

number_of_docs = len(corpus)

# Sets make the two per-token membership tests constant-time instead of a
# linear scan over the lists.
punctuation_lookup = set(punctuation_soup)
stop_words_lookup = set(stop_words)

# Tooltip template; it does not change between tokens, so build it once.
info_template = ('word {tab} {tab} {tab} {tab} {token} {carriage}'
                 'calculations for lemma {tab} {lemma} {carriage}'
                 'tfidf {tab} {tab} {tab} {tab} {tab} {tfidf_value} {carriage}'
                 'idf {tab} {tab} {tab} {tab} {tab} {idf_value} {carriage}'
                 #'# in this doc (tf) {tab} {tab} {tf_value} {carriage}'
                 #'# in whole corpus {tab} {tab} {global_freq_value} {carriage}'
                 '# in different docs {tab} {tab} {diff_value} {carriage}'
                 '# docs in corpus {tab} {tab} {number_of_docs}')

for index, document_tokenized in enumerate(tqdm(corpus_tokenized, desc='Corpus HTML tagging', leave=True)):
    document_tagged = []

    for token in document_tokenized:
        if token in punctuation_lookup:
            document_tagged.append(token)
            continue

        # Only lemmatize tokens that are not punctuation.
        lemma = pymorphy.parse(token)[0].normal_form
        if lemma in stop_words_lookup:
            document_tagged.append('<stopword>'+ token + '</stopword>')
            continue

        tfidf_value = tfidf_dicts[index].get(lemma)
        tf_value = tf_dicts[index].get(lemma)
        idf_value = idf_dict.get(lemma)
        global_freq_value = term_global_freq.get(lemma)
        diff_value = diff_docs.get(lemma)

        # A lemma missing from the vocabulary has no statistics. Report it,
        # but keep the token in the output so the rendered text loses no words.
        if any(value is None for value in (tfidf_value, tf_value, idf_value, global_freq_value)):
            print("Probably some punctuation problem with rare symbol.")
            document_tagged.append(token)
            continue

        # 100% for weight 0, up to 300% for the largest weight in the corpus.
        word_size = '%.2f' % (100 + tfidf_value*200/max_tfidf)
        info_soup = info_template.format(tab = '&#009;',
                                         carriage = '&#013;',
                                         token = token,
                                         lemma = lemma,
                                         tfidf_value = '%.2f' % tfidf_value,
                                         tf_value = int(tf_value),
                                         idf_value = '%.2f' % idf_value,
                                         global_freq_value = int(global_freq_value),
                                         diff_value = diff_value,
                                         number_of_docs = number_of_docs)
        # The ';' belongs inside the style attribute's quotes; placed after the
        # closing quote it is malformed markup.
        document_tagged.append('<span style = "font-size: ' + word_size + '%;" title = "' + info_soup + '">' + token + '</span>')

    corpus_tagged.append(document_tagged)

Corpus HTML tagging: 100%|██████████| 40/40 [00:18<00:00,  1.30it/s]


In [19]:
from mosestokenizer import MosesDetokenizer

# Join every tagged token list back into a single string per document.
corpus_tagged_detoken = []

with MosesDetokenizer('ru') as detokenize:
    tagged_documents = tqdm(corpus_tagged, desc='Detekonizing with mosestokeniser', leave=True)
    corpus_tagged_detoken.extend(detokenize(tagged_tokens) for tagged_tokens in tagged_documents)

Detekonizing with mosestokeniser: 100%|██████████| 40/40 [00:51<00:00,  2.00s/it]


In [20]:
# Skeleton of the output HTML page, filled in later with str.format():
#   * the single '{}' inside <p>…</p> receives the tagged corpus text;
#   * the doubled braces '{{' / '}}' in the <style> section are str.format()
#     escapes that come out as literal '{' / '}' in the CSS rules.
# The CSS rules target <stopword>, <hr> and <filename> elements.
template_for_html = '''
<!doctype html>
<html lang = 'ru'>

  <head>
    <meta charset = 'utf-8'>
      <title>tfidf mapped to size</title>
    <style>
      stopword {{color: rgb(128, 128, 128)}}
      hr {{border-top: 2px solid black}}
      filename {{text-align: center; display: block}}
    </style>
  </head>

  <body>
    <p>{}</p>
  </body>

</html>
'''

#p {{line-height: 50px}}

In [21]:
# Put the source file's path in front of each tagged document and a
# horizontal rule after it.
corpus_tagged_titled = [
    ''.join(['<filename>', paths_to_files[position], '</filename>', tagged_text, '<hr>'])
    for position, tagged_text in enumerate(corpus_tagged_detoken)
]

In [22]:
# Joining with '</p><p>' closes one paragraph and opens the next between
# documents; the template supplies the outermost <p> and </p>.
paragraph_break = '</p><p>'
html_body = paragraph_break.join(corpus_tagged_titled)
html_output = template_for_html.format(html_body)

In [23]:
dir_for_output = 'output'
filename = 'tfidf_mapped_to_size.html'

# Build the target path once, make sure its directory exists, then write
# the page as UTF-8.
output_path = os.path.join(dir_for_output, filename)
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, mode = 'w', encoding = 'utf-8') as html_file:
    html_file.write(html_output)

In [24]:
import webbrowser

from pathlib import Path

# Hand the browser an absolute file:// URI. Passing a bare relative file
# name to webbrowser is described by the Python docs as neither supported
# nor portable.
webbrowser.open_new_tab(Path(dir_for_output, filename).resolve().as_uri())

True