# Автоматическая обработка текстов

По ссылке находится произведение русской классики. Ваша задача — применить RNNMorph для анализа произведения.

In [1]:
import warnings
# Suppress every warning so library noise does not clutter the cell output.
warnings.filterwarnings('ignore')

# IPython shell escape: quietly (-q) install the third-party packages used by this notebook.
!pip -q install bs4 nltk rnnmorph

import nltk
# Fetch the NLTK resources used later on: the "punkt" tokenizer data and the stop-word corpus.
nltk.download('punkt')
nltk.download("stopwords")

  Preparing metadata (setup.py) ... done
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 19.7/19.7 MB 46.0 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 55.5/55.5 kB 4.1 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.2/8.2 MB 67.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
  Building wheel for bs4 (setup.py) ... done
  Building wheel for rnnmorph (setup.py) ... done
  Building wheel for russian-tagsets (setup.py) ... done
  Building wheel for docopt (setup.py) ... done


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Address of the HTML page that is downloaded and analysed.
DATA_URL = "http://az.lib.ru/g/gogolx_n_w/text_0050.shtml"
# How many of the most frequent tokens are checked against the stop-word list.
TOP_BATCH_SIZE = 150
# Frequency threshold named in the last reported statistic
# (tokens occurring strictly more often than this many times).
MIN_COUNT_THRESHOLD = 10

In [3]:
import urllib.request
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from rnnmorph.predictor import RNNMorphPredictor


# --- 1. Download the page and reduce it to plain text ----------------------
with urllib.request.urlopen(DATA_URL) as response:
    payload = response.read()
    declared_charset = response.headers.get_content_charset()

# Decode with the charset announced in the Content-Type header when there is
# one.  If the server does not announce a charset, get_content_charset()
# returns None and bytes.decode(None) would raise TypeError, so in that case
# the raw bytes are passed on and BeautifulSoup detects the encoding itself.
raw_html = payload.decode(declared_charset) if declared_charset else payload
soup = BeautifulSoup(raw_html, features="html.parser")
# Drop <script>/<style> elements: their contents are not part of the text.
for tag in soup(["script", "style"]):
    tag.extract()
text_data = soup.get_text()

# --- 2. Tokenize and lemmatize ----------------------------------------------
# One list of word tokens per sentence, as expected by predict_sentences().
tokenized = [word_tokenize(sentence) for sentence in sent_tokenize(text_data)]
predictor = RNNMorphPredictor(language="ru")
# Keep the normal form of purely alphabetic tokens only, so punctuation and
# numbers are excluded from the statistics below.
normalized = [
    [token.normal_form for token in sentence if token.word.isalpha()]
    for sentence in predictor.predict_sentences(tokenized)
]
tokens = [token for sentence in normalized for token in sentence]
print('Количество предложений: ', len(normalized))
print('Количество токенов, состоящих только из букв:', len(tokens))

# --- 3. Frequency statistics -------------------------------------------------
tokens_count = nltk.FreqDist(tokens)
stopwords = set(nltk.corpus.stopwords.words("russian"))

# Share of non-stop-words among the TOP_BATCH_SIZE most frequent tokens.
non_stop_in_top = sum(
    token not in stopwords
    for token, _ in tokens_count.most_common(TOP_BATCH_SIZE)
)
print(f'Доля слов, не входящих в стоп-лист, среди {TOP_BATCH_SIZE} самых частотных:',
      non_stop_in_top / TOP_BATCH_SIZE)

# Compare against MIN_COUNT_THRESHOLD (not a literal) so that the printed
# label and the actual comparison can never drift apart.
frequent_tokens = sum(
    count > MIN_COUNT_THRESHOLD for count in tokens_count.values()
)
print(f'Количество токенов, встречающихся в тексте строго больше {MIN_COUNT_THRESHOLD} раз: ',
      frequent_tokens)

Количество предложений:  934
Количество токенов, состоящих только из букв: 11732
Доля слов, не входящих в стоп-лист, среди 150 самых частотных: 0.5733333333333334
Количество токенов, встречающихся в тексте строго больше 10 раз:  161
