## Извлекаем слова и словосочетания, которые похожи на термины.

### Извлекаем все шрифты и размеры шрифтов, задействованные в работе.

In [None]:
# Notebook-only cell: the guard is true when the code runs as the main module
# and no `__file__` global is defined.
# NOTE(review): presumably this keeps the cell from firing when the notebook is
# executed from elsewhere (e.g. via %run) - confirm.
if __name__ == '__main__' and '__file__' not in globals():
    # IPython magic: clears the names defined in the interactive namespace.
    %reset

In [None]:
# Notebook-only cell (same guard as the other cells: main module, no `__file__`).
if __name__ == '__main__' and '__file__' not in globals():
    # IPython magic: executes common_info.ipynb.
    # NOTE(review): presumably this is where graduationWorkXMLFile and
    # graduationWorkPDFFile (used by later cells) are defined - confirm.
    %run common_info.ipynb

#### Здесь генерируется xml-файл, но в Jupyter Notebook это почему-то не работает.

Можно заменить командой в командной строке:
<code>pdf2txt.py -o *graduationWorkXMLFile* *graduationWorkPDFFile*</code>. Генерировать нужно в папку с исходным файлом.

In [None]:
import subprocess

# Notebook-only cell: run pdf2txt.py as a child process with
# "-o graduationWorkXMLFile graduationWorkPDFFile" as its arguments.
# check_call raises subprocess.CalledProcessError on a non-zero exit status.
if __name__ == '__main__' and '__file__' not in globals():
    subprocess.check_call(["pdf2txt.py", "-o", graduationWorkXMLFile, graduationWorkPDFFile])

In [None]:
import xml.etree.ElementTree as etree
from decimal import Decimal, getcontext
from collections import Counter
 
def getTextFontsAndSizes(graduationWorkXMLFile):
    """Count how often each font and each rounded font size occurs in the XML.

    The file is walked four levels below the root (page / textbox / textline /
    text). For every innermost element the ``font`` attribute (if present) is
    tallied as-is, and the ``size`` attribute (if present) is tallied after
    rounding half-up:

    * to one decimal place when that yields exactly ``x.5`` (kept as a float);
    * to the nearest integer otherwise (kept as an int).

    The most frequent font and size are presumably those of the body text.

    Args:
        graduationWorkXMLFile: path of the XML file (or an open file object)
            to parse.

    Returns:
        A ``(textFonts, textSizes)`` tuple of ``collections.Counter`` objects.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        decimal.InvalidOperation: if a ``size`` attribute is not a number.
    """
    textFonts = Counter()
    textSizes = Counter()

    # Let ElementTree open the file itself: it reads bytes and honours the
    # encoding from the XML declaration, instead of decoding the file with the
    # platform's default text encoding.
    xmlTree = etree.parse(graduationWorkXMLFile)

    for page in xmlTree.getroot():
        for textbox in page:
            for textline in textbox:
                for text in textline:
                    attributes = text.attrib
                    if 'font' in attributes:
                        textFonts[attributes['font']] += 1
                    if 'size' in attributes:
                        size = Decimal(attributes['size'])
                        # The rounding mode is passed explicitly so the
                        # thread-wide decimal context is left untouched.
                        toTenth = float(size.quantize(Decimal('.0'), rounding='ROUND_HALF_UP'))
                        if toTenth - int(toTenth) == 0.5:
                            # Half sizes (e.g. 10.5) are kept as they are.
                            textSizes[toTenth] += 1
                        else:
                            toInteger = int(size.quantize(Decimal('1'), rounding='ROUND_HALF_UP'))
                            textSizes[toInteger] += 1

    return textFonts, textSizes

In [None]:
# Tally fonts and rounded sizes over the whole XML file.
textFonts, textSizes = getTextFontsAndSizes(graduationWorkXMLFile)

# most_common(1) yields a one-item list of (key, count); [0][0] picks the key.
# An empty counter makes this raise IndexError.
mostCommonTextFont = textFonts.most_common(1)[0][0]
mostCommonTextSize = textSizes.most_common(1)[0][0]

# Print the result only in the notebook-only case (main module, no `__file__`).
if __name__ == '__main__' and '__file__' not in globals():
    print(mostCommonTextFont, mostCommonTextSize)

### Извлекаем все слова и словосочетания, которые как-то выделены в работе среди основного текста.

In [None]:
import xml.etree.ElementTree as etree
from decimal import Decimal, getcontext
from collections import Counter
from string import punctuation

def extractTerms(graduationWorkXMLFile, mostCommonTextSize, mostCommonTextFont):
    """Collect runs of highlighted (Bold/Italic) characters as candidate terms.

    The file is walked four levels below the root (page / textbox / textline /
    text). Only innermost elements that carry both a ``font`` and a ``size``
    attribute are considered. A character extends the current candidate when

    * its rounded size equals ``mostCommonTextSize`` and its font name contains
      ``mostCommonTextFont`` together with ``'Italic'`` or ``'Bold'``, or
    * it is an ASCII punctuation character or a space.

    A ``'\\n'`` character adds a space; any other character ends the candidate.
    A finished candidate is stripped and kept unless it is empty or starts with
    a punctuation character.

    Sizes are rounded half-up: to one decimal place when that gives exactly
    ``x.5``, to the nearest integer otherwise.

    NOTE(review): elements without font/size attributes are skipped entirely,
    so the newline branch only fires for elements that have both attributes -
    confirm whether attribute-less line-break elements should add a space too.
    NOTE(review): a candidate preceded by punctuation (e.g. an opening bracket)
    is discarded by the "starts with punctuation" filter - confirm intent.

    Args:
        graduationWorkXMLFile: path of the XML file (or an open file object).
        mostCommonTextSize: rounded size that a highlighted character must have.
        mostCommonTextFont: substring that a highlighted font name must contain.

    Returns:
        List of candidate term strings in document order.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        decimal.InvalidOperation: if a ``size`` attribute is not a number.
    """
    def normalizedSize(rawSize):
        # The rounding mode is passed explicitly so the thread-wide decimal
        # context is left untouched.
        size = Decimal(rawSize)
        toTenth = float(size.quantize(Decimal('.0'), rounding='ROUND_HALF_UP'))
        if toTenth - int(toTenth) == 0.5:
            return toTenth
        return int(size.quantize(Decimal('1'), rounding='ROUND_HALF_UP'))

    def flush(candidate):
        # Keep a finished candidate unless it is blank or starts with punctuation.
        candidate = candidate.strip()
        if candidate != '' and candidate[0] not in punctuation:
            terms.append(candidate)

    separators = punctuation + ' '
    terms = []
    term = ''

    # Let ElementTree open the file itself: it reads bytes and honours the
    # encoding from the XML declaration, instead of decoding the file with the
    # platform's default text encoding.
    xmlTree = etree.parse(graduationWorkXMLFile)

    for page in xmlTree.getroot():
        for textbox in page:
            for textline in textbox:
                for text in textline:
                    attributes = text.attrib
                    if 'font' not in attributes or 'size' not in attributes:
                        continue

                    symbolSize = normalizedSize(attributes['size'])
                    symbol = text.text
                    if not symbol:
                        # An element with no text contributes nothing; without
                        # this guard None would raise TypeError below.
                        continue

                    font = attributes['font']
                    isHighlighted = (
                        symbolSize == mostCommonTextSize
                        and ('Italic' in font or 'Bold' in font)
                        and mostCommonTextFont in font
                    )

                    if isHighlighted or symbol in separators:
                        term += symbol
                    elif symbol == '\n':
                        term += ' '
                    else:
                        flush(term)
                        term = ''

    # A candidate still open when the document ends must not be lost.
    flush(term)
    return terms

In [None]:
# Extract candidate terms using the most frequent size and font found earlier.
terms = extractTerms(graduationWorkXMLFile, mostCommonTextSize, mostCommonTextFont)

# Print the result only in the notebook-only case (main module, no `__file__`).
if __name__ == '__main__' and '__file__' not in globals():
    print(terms)