In [9]:
import re
import os
from tqdm import tqdm

In [91]:
def slurp(path, encoding='utf-8', fallback_encoding='cp1252'):
    """Read a text file and return its whole contents as one string.

    The file is decoded with ``encoding`` first. If that raises
    ``UnicodeDecodeError``, the path is printed (so problem files show up in
    the cell output) and the file is read again with ``fallback_encoding``.

    Args:
        path: Path of the file to read.
        encoding: Primary codec. Passed explicitly so the result does not
            depend on the locale's default encoding.
        fallback_encoding: Codec used when the primary one cannot decode
            the file.

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the fallback codec cannot decode the file
            either.
    """
    try:
        with open(path, 'r', encoding=encoding) as fo:
            return fo.read()
    except UnicodeDecodeError:
        # Report the offending file, then retry with the legacy codec.
        print(path)
        # NOTE(review): cp1252 is a Western-European code page; if the legacy
        # files hold Cyrillic text, cp1251 may be the intended codec - confirm.
        with open(path, 'r', encoding=fallback_encoding) as fo:
            return fo.read()

def spit(texts, file_names, encoding='utf-8'):
    """Write each text to the file at the corresponding path.

    Texts and paths are paired positionally with ``zip``: the i-th text goes
    to the i-th path, and surplus items of the longer input are ignored.
    Every target is opened in ``'w'`` mode, so existing files are overwritten.

    Args:
        texts: Iterable of strings to write.
        file_names: Iterable of destination paths, in the same order as
            ``texts``.
        encoding: Codec used for writing. Passed explicitly so that non-ASCII
            text is written the same way regardless of the locale's default
            encoding.

    Raises:
        OSError: If a destination file cannot be opened for writing.
    """
    for text, file_name in tqdm(zip(texts, file_names)):
        with open(file_name, 'w', encoding=encoding) as fo:
            fo.write(text)

def read_dir(input_path):
    """Recursively read every non-notebook file below ``input_path``.

    Any file whose full path contains ``'.ipynb'`` is skipped; this covers
    notebooks as well as everything inside ``.ipynb_checkpoints`` folders.

    Args:
        input_path: Root directory to walk.

    Returns:
        A ``(texts, files)`` tuple of two lists of equal length: ``texts[i]``
        is the content of the file whose bare name (without its directory) is
        ``files[i]``. Because only bare names are kept, same-named files from
        different sub-directories cannot be told apart in ``files``.
    """
    texts = []
    files = []
    print('Reading files...')
    for root, _dirs, filenames in os.walk(input_path):
        for filename in tqdm(filenames):
            file_path = os.path.join(root, filename)
            if '.ipynb' in file_path:
                continue
            texts.append(slurp(file_path))
            # Record the name only when its text was read, so both lists stay
            # index-aligned even when some files are skipped.
            files.append(filename)
    print('Number of texts: ', len(texts))
    return texts, files

def preprocess(input_path, output_path):
    """Clean every text found under ``input_path`` and write it to ``output_path``.

    Cleaning keeps only runs made of Cyrillic letters, digits, dots and
    hyphens (see the regular expression below) and joins those runs with
    single spaces; everything else is dropped. Each cleaned text is written
    to ``output_path`` under the bare name of its source file.

    Args:
        input_path: Root directory of the raw texts (walked by ``read_dir``).
        output_path: Directory that receives the cleaned files. It is created
            if it does not exist yet.

    Returns:
        None.

    Raises:
        ValueError: If the number of output paths differs from the number of
            texts, since the texts would otherwise be written under the wrong
            file names.
    """
    texts, filenames = read_dir(input_path)
    pattern = re.compile(r'[А-ЯЁа-яё\.\-\d]+')
    print('Preprocessing files...')
    preprocessed = [' '.join(pattern.findall(text)) for text in tqdm(texts)]
    # os.path.join works whether or not output_path ends with a separator.
    paths = [os.path.join(output_path, name)
             for name in filenames if 'ipynb' not in name]
    print('Number of texts: ', len(texts))
    print('Number of paths:', len(paths))
    if len(paths) != len(preprocessed):
        # Writing would pair texts and names positionally, so a count
        # mismatch means at least some texts would land in the wrong file.
        raise ValueError(
            'Got {} texts but {} output paths; refusing to write misaligned '
            'files.'.format(len(preprocessed), len(paths))
        )
    if output_path:
        os.makedirs(output_path, exist_ok=True)
    print('Writing to files...')
    spit(preprocessed, paths)
    print('All done, Buddy!')

## Preprocess texts before parsing

In [81]:
# Output directory for the cleaned texts; passed to preprocess() as output_path.
# It ends with a path separator, so a bare file name can be appended directly.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
spit_dir = '/home/nst/mount/data/linguistics_hse/popular-science-research/Tomita_Parser/tomita-parser/build/bin/sci_corpus/'
# Root of the raw corpus; passed to preprocess() as input_path and walked recursively.
slurp_dir = '/home/nst/mount/data/share/yd/popular_science_texts_store_copy'

In [92]:
# Run the whole pipeline: read the files under slurp_dir, clean each text and
# write the results into spit_dir.
# preprocess() has no return statement, so chrdk is bound to None.
chrdk = preprocess(slurp_dir, spit_dir)


0it [00:00, ?it/s][A
[A
0it [00:00, ?it/s][A
[A
  0%|          | 0/707 [00:00<?, ?it/s][A

Reading files...



  1%|          | 6/707 [00:00<00:12, 54.75it/s][A
  2%|▏         | 14/707 [00:00<00:11, 61.65it/s][A
  3%|▎         | 20/707 [00:00<00:11, 59.24it/s][A
  4%|▍         | 29/707 [00:00<00:10, 65.30it/s][A
  5%|▌         | 38/707 [00:00<00:10, 64.70it/s][A
  7%|▋         | 47/707 [00:00<00:09, 68.41it/s][A
  8%|▊         | 57/707 [00:00<00:09, 71.64it/s][A
  9%|▉         | 65/707 [00:00<00:08, 72.38it/s][A
 10%|█         | 73/707 [00:01<00:08, 72.22it/s][A
 11%|█▏        | 81/707 [00:01<00:08, 71.68it/s][A
 13%|█▎        | 90/707 [00:01<00:08, 73.06it/s][A
 14%|█▍        | 98/707 [00:01<00:08, 72.23it/s][A
 15%|█▌        | 108/707 [00:01<00:08, 74.06it/s][A
 17%|█▋        | 117/707 [00:01<00:07, 74.23it/s]Exception in thread Thread-131:
Traceback (most recent call last):
  File "/home/nst/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/nst/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance

Number of texts:  31052
Preprocessing files...


100%|██████████| 31052/31052 [00:13<00:00, 2247.23it/s]
0it [00:00, ?it/s]

Number of texts:  31052
Number of paths: 31052
Writing to files...


31052it [06:37, 78.14it/s] 


All done, Buddy!
