# Обработка исходных файлов. Удаление и парсинг

In [None]:
import glob
from xml.dom import minidom
import os
from nlp import nlp
import torch


def get_fb2_filenames(root='/Users/kukuruku/Documents/librusec/Libruks/Архивы Либрусек'):
    """Lazily yield the paths of all ``.fb2`` files below ``root``.

    Args:
        root: Directory that is searched recursively. Defaults to the
            library location this notebook was originally written for.

    Returns:
        An iterator of path strings (see ``glob.iglob``).
    """
    # Escape the root so that characters such as "[" in a directory name are
    # matched literally instead of being interpreted as glob wildcards.
    pattern = os.path.join(glob.escape(root), '**', '*.fb2')

    return glob.iglob(pattern, recursive=True)


def get_title_info(node):
    """Return the single ``<title-info>`` element found under ``node``.

    Asserts that exactly one such element exists.
    """
    matches = node.getElementsByTagName('title-info')
    assert len(matches) == 1, 'только одно описание'

    title_info = matches[0]
    return title_info


def get_genres(node):
    """Return the set of genre codes declared under ``node``.

    For every ``<genre>`` element that has at least one child node, the
    ``data`` of its first child is collected; empty elements are ignored.
    """
    found = set()

    for element in node.getElementsByTagName('genre'):
        children = element.childNodes
        if len(children):
            found.add(children[0].data)

    return found


def find_lang(node):
    """Return the content of the ``<lang>`` element under ``node``, if any.

    Asserts that at most one ``<lang>`` element exists. Returns ``None`` when
    the element is missing or has no child nodes.
    """
    candidates = node.getElementsByTagName('lang')
    assert len(candidates) <= 1, 'указано небольше одного языка'

    if not len(candidates):
        return None

    children = candidates[0].childNodes
    if not len(children):
        return None

    return children[0].data

def get_title(node):
    """Return the book title stored in the ``<book-title>`` element.

    Asserts that there is exactly one ``<book-title>`` element under ``node``
    and that it has exactly one child node.
    """
    titles = node.getElementsByTagName('book-title')
    assert len(titles) == 1 and len(titles[0].childNodes) == 1, 'указано название книги'

    text_node = titles[0].childNodes[0]
    return text_node.data

def has_children_genre(genres):
    """Return True if any genre code contains the substring ``'child'``."""
    return any('child' in genre for genre in genres)


def get_text(doc):
    """Return the readable text of a parsed FB2 document as one string.

    Within each ``<body>``, text is gathered first from all ``<v>`` elements
    and then from those ``<p>`` elements whose direct parent is a
    ``<section>`` or ``<cite>``. The collected fragments are joined with
    single spaces.
    """
    allowed_parents = ('section', 'cite')
    pieces = []

    for body in doc.getElementsByTagName('body'):
        for verse_line in body.getElementsByTagName('v'):
            collect_text(verse_line, pieces)

        for para in body.getElementsByTagName('p'):
            # Paragraphs nested in any other container are left out.
            if para.parentNode.tagName in allowed_parents:
                collect_text(para, pieces)

    return ' '.join(pieces)


def collect_text(node, result):
    """Append the data of every text node in ``node``'s subtree to ``result``.

    Nodes are visited in document (pre-order) sequence, starting with
    ``node`` itself. ``result`` is modified in place; nothing is returned.
    """
    pending = [node]

    while pending:
        current = pending.pop()

        if current.nodeType == current.TEXT_NODE:
            result.append(current.data)

        # Push children reversed so the leftmost child is popped first.
        pending.extend(reversed(current.childNodes))


def process_all():
    """Run the NLP pipeline over every FB2 book and cache the result on disk.

    For each path yielded by ``get_fb2_filenames()`` the serialized output of
    ``nlp(get_text(book))`` is written to a sibling file with the ``.stanza``
    extension. A book is skipped when that file already exists, when none of
    its genres contains ``'child'``, or when its language is not ``ru``.

    Any exception raised while handling one book is printed together with the
    file name, and processing continues with the next book. A progress line
    is printed for every 100th file.
    """
    count = 0

    for filename in get_fb2_filenames():
        stanza_filename = f"{os.path.splitext(filename)[0]}.stanza"

        try:
            count += 1

            if count % 100 == 0:
                print('№', count)

            if os.path.isfile(stanza_filename):
                continue

            book = minidom.parse(filename)
            info = get_title_info(book)

            genres = get_genres(info)

            if not has_children_genre(genres):
                # Deleting the source file is deliberately disabled; report
                # the book and skip it. (Raising a plain string is a
                # TypeError in Python 3, so the message is printed instead.)
                print(filename, "Удаляет что не нужно")
                continue

            lang = find_lang(info)

            if not lang or lang.lower() != 'ru':
                # Same as above: report and skip instead of deleting.
                print(filename, "Удаляет что не нужно")
                continue

            # Build the payload before touching the output file so that a
            # failure in parsing does not leave an empty .stanza file, which
            # the isfile() check above would treat as "already processed".
            with torch.no_grad():
                serialized = nlp(get_text(book)).to_serialized()

            # Write to a temporary name and rename, so an interrupted write
            # never leaves a truncated .stanza file behind.
            tmp_filename = f"{stanza_filename}.tmp"
            with open(tmp_filename, 'wb') as stanza_file:
                stanza_file.write(serialized)
            os.replace(tmp_filename, stanza_filename)

        except Exception as e:
            # Best-effort batch job: log the failing file and keep going.
            print(filename, e)
            
# Run the batch over every .fb2 file returned by get_fb2_filenames().
process_all()

In [77]:

# Scratch check of str.lower() on a single letter.
'l'.lower()

# Parse one sample book so its structure can be inspected.
doc = minidom.parse('/Users/kukuruku/Documents/librusec/Libruks/Архивы Либрусек/182800-183232/183031.fb2')

# Dump the parsed document as pretty-printed XML.
print(doc.toprettyxml())


def get_text(doc):
    """Return the readable text of a parsed FB2 document as one string.

    Within each ``<body>``, text is gathered first from all ``<v>`` elements
    and then from those ``<p>`` elements whose direct parent is a
    ``<section>`` or ``<cite>``. The collected fragments are joined with
    single spaces.
    """
    allowed_parents = ('section', 'cite')
    pieces = []

    for body in doc.getElementsByTagName('body'):
        for verse_line in body.getElementsByTagName('v'):
            collect_text(verse_line, pieces)

        for para in body.getElementsByTagName('p'):
            # Paragraphs nested in any other container are left out.
            if para.parentNode.tagName in allowed_parents:
                collect_text(para, pieces)

    return ' '.join(pieces)


def collect_text(node, result):
    """Append the data of every text node in ``node``'s subtree to ``result``.

    Nodes are visited in document (pre-order) sequence, starting with
    ``node`` itself. ``result`` is modified in place; nothing is returned.
    """
    pending = [node]

    while pending:
        current = pending.pop()

        if current.nodeType == current.TEXT_NODE:
            result.append(current.data)

        # Push children reversed so the leftmost child is popped first.
        pending.extend(reversed(current.childNodes))


# Run the NLP pipeline on the text extracted from the sample book.
nd = nlp(get_text(doc))



<?xml version="1.0" ?>
<FictionBook xmlns:l="http://www.w3.org/1999/xlink" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.gribuser.ru/xml/fictionbook/2.0">
	
	
	<description>
		
		
		<title-info>
			
			
			<genre>child_verse</genre>
			
			
			<author>
				
				
				<first-name>Доктор</first-name>
				
				
				<last-name>Сьюз</last-name>
				
			
			</author>
			
			
			<book-title>Слон Хортон слышит кого-то</book-title>
			
			
			<date/>
			
			
			<lang>ru</lang>
			
			
			<sequence name="Сказки про слона Хортона"/>
			
		
		</title-info>
		
		
		<document-info>
			
			
			<author>
				
				
				<first-name/>
				
				
				<last-name/>
				
			
			</author>
			
			
			<program-used>, Fiction Book Investigator</program-used>
			
			
			<date>24.01.2010</date>
			
			
			<src-url/>
			
			
			<src-ocr/>
			
			
			<id>FBD-337E4E-6BB4-C743-9CA6-3005-45E8-84A74F</id>
			
			
			<version>1.02</version>
			
		
		</document-info>
		
		
		<publish-info>
		</publish-info>
		

In [78]:
# Display the parsed sample document as the cell output.
nd

[
  [
    {
      "id": 1,
      "text": "Слон",
      "lemma": "слон",
      "upos": "NOUN",
      "feats": "Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing",
      "head": 3,
      "deprel": "nsubj",
      "misc": "start_char=0|end_char=4"
    },
    {
      "id": 2,
      "text": "Хортон",
      "lemma": "Хортон",
      "upos": "PROPN",
      "feats": "Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing",
      "head": 1,
      "deprel": "appos",
      "misc": "start_char=5|end_char=11"
    },
    {
      "id": 3,
      "text": "слышит",
      "lemma": "слышать",
      "upos": "VERB",
      "feats": "Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act",
      "head": 0,
      "deprel": "root",
      "misc": "start_char=12|end_char=18"
    },
    {
      "id": 4,
      "text": "кого-то",
      "lemma": "кто-то",
      "upos": "PRON",
      "feats": "Case=Acc",
      "head": 3,
      "deprel": "obj",
      "misc": "start_char=19|end_char=26"
    },
    {
      "id": 5,


True