TEXT

In [37]:
# Read the whole input document into `text`; the tokenizer cells below consume it.
with open('plain_text1.txt', mode='r', encoding='utf-8') as source_file:
    text = source_file.read()

NLTK

In [38]:
from nltk.tokenize import word_tokenize

# The other tokenizers in this notebook are all configured for Russian
# ('ru'), so select the Russian Punkt model for NLTK's sentence-splitting
# step instead of relying on the default (English) model.
# NOTE(review): requires the Russian Punkt resource to be present in nltk_data.
tokens = word_tokenize(text, language='russian')

In [39]:
# Dump the NLTK tokens to disk, one token per line.
with open('token_NLTK.txt', mode='w', encoding='utf-8') as nltk_out:
    nltk_out.writelines(f'{piece}\n' for piece in tokens)

STANZA

In [40]:
import stanza

# Russian pipeline with only the `tokenize` processor loaded; the
# `tokenize_no_ssplit=True` flag is forwarded to that processor as-is.
nlp = stanza.Pipeline(lang='ru', processors='tokenize', tokenize_no_ssplit=True)
doc = nlp(text)

# Walk every sentence of the document and write its tokens one per line.
with open('token_STANZA.txt', mode='w', encoding='utf-8') as stanza_out:
    stanza_out.writelines(
        f'{tok.text}\n'
        for sent in doc.sentences
        for tok in sent.tokens
    )

2024-09-18 13:41:50 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-09-18 13:41:50 INFO: Downloaded file to /Users/sergey/stanza_resources/resources.json
2024-09-18 13:41:50 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |

2024-09-18 13:41:50 INFO: Using device: cpu
2024-09-18 13:41:50 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-09-18 13:41:50 INFO: Done loading processors!


RAZDEL

In [41]:
from razdel import tokenize

# Materialise razdel's result into a list so it can be iterated when writing.
tokens = list(tokenize(text))

# Each razdel item exposes its surface form as `.text`; write one per line.
with open('token_RAZDEL.txt', mode='w', encoding='utf-8') as razdel_out:
    razdel_out.writelines(f'{item.text}\n' for item in tokens)

SPACY

In [42]:
import spacy

# A blank Russian pipeline: no trained components are loaded here.
nlp = spacy.blank("ru")
doc = nlp(text)

# Write one token per line, skipping tokens that consist only of whitespace.
with open('token_SPACY.txt', mode='w', encoding='utf-8') as spacy_out:
    for tok in doc:
        surface = tok.text
        if not surface.strip():
            continue
        spacy_out.write(surface + '\n')

MOSES

In [43]:
from mosestokenizer import MosesTokenizer

# Flatten line breaks into a Moses-specific copy of the input instead of
# rebinding the shared `text`: rebinding silently changed what the cells
# after this one tokenize, so the tokenizers were not compared on the same input.
# NOTE(review): presumably the Moses wrapper expects single-line input — confirm.
moses_input = text.replace('\n', ' ')

# Use the tokenizer as a context manager so the wrapped tokenizer process is
# shut down when done. A dedicated name avoids shadowing razdel's `tokenize`.
with MosesTokenizer('ru') as moses_tokenize:
    tokens = moses_tokenize(moses_input)

# Persist the Moses tokens, one per line.
with open('token_MOSES.txt', 'w', encoding='utf-8') as out_f:
    out_f.writelines(token + '\n' for token in tokens)

stdbuf was not found; communication with perl may hang due to stdio buffering.


PYMORPHY3

In [44]:
from pymorphy3.tokenizers import simple_word_tokenize

# Tokenize with pymorphy3's simple word tokenizer.
tokens = simple_word_tokenize(text)

# Save the result, one token per line.
with open('token_PYMORPHY.txt', mode='w', encoding='utf-8') as pymorphy_out:
    pymorphy_out.writelines(f'{piece}\n' for piece in tokens)

SEGTOK

In [45]:
from segtok.tokenizer import word_tokenizer

# Tokenize with segtok's word tokenizer.
tokens = word_tokenizer(text)

# Save the result, one token per line.
with open('token_SEGTOK.txt', mode='w', encoding='utf-8') as segtok_out:
    segtok_out.writelines(f'{piece}\n' for piece in tokens)

UDPIPE

In [49]:
import ufal
from ufal import udpipe

model_path = 'russian-syntagrus-ud-2.0-170801.udpipe'

# Fail loudly when the model cannot be loaded instead of crashing later
# with an opaque error inside the Pipeline constructor.
models = udpipe.Model.load(model_path)
if models is None:
    raise RuntimeError(f"Cannot load UDPipe model from '{model_path}'")

# Tokenizer input; tagger, parser and output format are left at their defaults.
pipeline = udpipe.Pipeline(
    models,
    'tokenize',
    udpipe.Pipeline.DEFAULT,
    udpipe.Pipeline.DEFAULT,
    udpipe.Pipeline.DEFAULT
)

# Surface processing errors rather than silently writing an empty file.
error = udpipe.ProcessingError()
processed_text = pipeline.process(text, error)
if error.occurred():
    raise RuntimeError(f"UDPipe processing failed: {error.message}")

# The output is parsed as tab-separated rows whose second column is the
# word form. Comment lines ('# ...') are skipped explicitly so that a tab
# inside a comment can never be mistaken for a token row.
array_ufal = []
for line in processed_text.split('\n'):
    if line.startswith('#'):
        continue
    parts = line.split('\t')
    if len(parts) > 1:
        array_ufal.append(parts[1])

# Write as UTF-8 explicitly: the results cell reads this file back as UTF-8,
# and the platform default encoding is not guaranteed to match.
with open('token_UFAL.txt', 'w', encoding='utf-8') as f_ufal:
    f_ufal.writelines(f'{item}\n' for item in array_ufal)

RESULTS

In [53]:
import pandas as pd
import os
from openpyxl import load_workbook

def process_text_file(file_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one stripped string per line of the file, in file order.
        Blank lines are kept as empty strings.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

def create_excel_from_txt(file_paths, column_names, output_file):
    """Collect token files into one Excel sheet, one column per file.

    Each existing file in ``file_paths`` is read with ``process_text_file``
    and becomes a column named by the entry of ``column_names`` at the same
    position. Missing files are reported and skipped. After writing, the
    column widths of the sheet are adjusted to the longest cell value.

    Args:
        file_paths: Paths of the token files, one token per line.
        column_names: Column headers, positionally matching ``file_paths``.
        output_file: Path of the ``.xlsx`` workbook to create.
    """
    columns = {}
    for i, file_path in enumerate(file_paths):
        if os.path.exists(file_path):
            tokens = process_text_file(file_path)
            # dtype=object keeps an empty token list from being inferred as float.
            columns[column_names[i]] = pd.Series(tokens, dtype=object)
        else:
            print(f"Файл {file_path} не найден.")

    # Build the frame from all columns at once. Assigning Series one by one
    # to an initially empty DataFrame pins the index to the FIRST column's
    # length, so every longer token list would be silently truncated.
    # Constructing from a dict of Series uses the union of the indexes and
    # pads shorter columns with missing values instead.
    df = pd.DataFrame(columns)
    df.to_excel(output_file, index=False)

    # Re-open the written workbook to adjust the column widths.
    wb = load_workbook(output_file)
    ws = wb.active

    for col in ws.columns:
        # Empty (padding) cells have value None and do not count.
        max_length = max(
            (len(str(cell.value)) for cell in col if cell.value is not None),
            default=0,
        )
        # Same sizing rule as before, capped at 255, which is the widest
        # column Excel accepts.
        adjusted_width = min((max_length + 2) * 2, 255)
        ws.column_dimensions[col[0].column_letter].width = adjusted_width

    wb.save(output_file)
    print(f"The file is saved in the directory {output_file}")

if __name__ == "__main__":
    # Directory holding the per-tokenizer token files. The workbook is
    # written to the same place.
    base_dir = '/Users/sergey/PycharmProjects/Tokenization'

    # One column per tokenizer; the token file for column X is token_X.txt.
    column_names = [
        'NLTK',
        'MOSES',
        'PYMORPHY',
        'RAZDEL',
        'SEGTOK',
        'SPACY',
        'STANZA',
        'UFAL'
    ]

    file_paths = [f'{base_dir}/token_{name}.txt' for name in column_names]

    # Previously '/Tokenization/output_tokens.xlsx', i.e. a directory at the
    # filesystem root. The recorded run of this cell saved the workbook next
    # to the token files, so the path is built from the same base directory.
    output_file = f'{base_dir}/output_tokens.xlsx'

    create_excel_from_txt(file_paths, column_names, output_file)

The file is saved in the directory /Users/sergey/PycharmProjects/Tokenization/output_tokens.xlsx
