In [1]:
import pandas as pd
import os
import io
import nltk
from nltk.corpus import stopwords
import pdfminer
import re
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
import pymorphy2

In [2]:
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF to read; the file is opened in binary mode.

    Returns:
        str: The text pdfminer's TextConverter produced for all pages, in the
        order PDFPage.get_pages yields them. An empty string is returned when
        no text was produced.

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
        Exceptions raised by pdfminer while reading the pages propagate
        unchanged.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)

        # Must be read before the buffer is closed in the finally clause.
        text = fake_file_handle.getvalue()
    finally:
        # Release the converter and the in-memory buffer even when a page
        # fails to parse, so a bad PDF does not leak resources.
        converter.close()
        fake_file_handle.close()

    return text

In [3]:
def create_array_from_file(file):
    """Collect the first whitespace-delimited field of every line.

    Args:
        file: Iterable of text lines, e.g. an open text file or a list of str.

    Returns:
        list[str]: One entry per input line, holding the text that precedes
        the first run of whitespace. A line that is empty or that starts with
        whitespace contributes an empty string.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    # maxsplit=1: only the first field is kept, so there is no need to split
    # the remainder of the line.
    return [re.split(r'\s+', line, maxsplit=1)[0] for line in file]

In [4]:
def is_in_dictionary(norm, rus_dict_array):
    """Binary-search a sorted word list for an exact match.

    Args:
        norm: The word to look up.
        rus_dict_array: Indexable sequence of words sorted in ascending order
            under Python's default comparison. The search halves the range on
            every step, so the answer is unreliable for unsorted input.

    Returns:
        bool: True when ``norm`` equals an element of ``rus_dict_array``,
        False otherwise (including when the sequence is empty).
    """
    # An empty dictionary contains nothing; without this guard the final
    # lookup below would index position 0 and raise IndexError.
    if len(rus_dict_array) == 0:
        return False

    low = 0
    high = len(rus_dict_array)
    # Invariant: if norm is present, it sits in the half-open range [low, high).
    while low < high - 1:
        mid = (low + high) // 2
        if rus_dict_array[mid] <= norm:
            low = mid
        else:
            high = mid
    return rus_dict_array[low] == norm

In [5]:
def filtration(words):
    """Drop noise tokens and reduce the remaining ones to their normal form.

    A token is kept when it is not an NLTK Russian stop word, is at most 35
    characters long and is not one of the bracket/punctuation symbols listed
    below. Kept tokens are replaced by the ``normal_form`` of the first
    pymorphy2 parse. The '.' token is passed through verbatim.

    Args:
        words: Iterable of string tokens.

    Returns:
        list[str]: The filtered, normalized tokens in their original order.

    Raises:
        OSError: If the dictionary file cannot be opened.
    """
    stop_symbols = {'{', '}', '[', ']', ',', ':', ';'}

    # NOTE(review): the dictionary is loaded but never consulted below; the
    # load is kept so the function's I/O behaviour is unchanged. TODO: either
    # filter tokens with is_in_dictionary or drop this read.
    # The path is relative, so it resolves against the current working
    # directory at call time.
    with open("../../../dictionaries/dictionary.txt", 'r', encoding='utf-8') as rus_dict:
        rus_dict_array = create_array_from_file(rus_dict)

    # Fetch the stop-word list once and keep it as a set: the membership test
    # runs for every token.
    russian_stopwords = set(stopwords.words("russian"))
    morph = pymorphy2.MorphAnalyzer()

    filtered = []
    for w in words:
        if w == '.':
            # Full stops are kept as-is.
            filtered.append(w)
        elif (len(w) <= 35 and w not in russian_stopwords
              and w not in stop_symbols):
            filtered.append(morph.parse(w)[0].normal_form)
    return filtered

In [6]:
def text_to_file(text, file):
    """Tokenize ``text``, filter it and append it to ``file`` as one line.

    The text is split with ``nltk.word_tokenize``, passed through
    ``filtration`` and written as space-separated tokens followed by a
    newline.

    Args:
        text: The raw text to process.
        file: A writable text stream.
    """
    words = nltk.word_tokenize(text)
    for token in filtration(words):
        try:
            file.write(token + " ")
        except UnicodeEncodeError:
            # Best effort: skip a token the stream's encoding cannot
            # represent instead of aborting the whole document. Any other
            # failure (e.g. an I/O error or Ctrl-C) is allowed to propagate.
            continue
    file.write("\n")

In [7]:
# Make the PDF corpus folder the process working directory and remember its
# absolute path. Relative paths used after this point resolve against it.
# NOTE(review): the target is itself a relative path, so running this cell a
# second time in the same kernel resolves it from the new location — restart
# the kernel before re-running.
os.chdir("../dataset/fit_language_models/pdf")
directory = os.getcwd()

In [8]:
# Show the absolute path the working directory resolved to.
print(directory)

C:\Users\anna_\python_source\вкр\bachelor_thesis\dataset\fit_language_models\pdf


In [9]:
# Names of the PDF files in the corpus folder. os.listdir returns entries in
# arbitrary order, so sort them (ignoring case) to make the processing order,
# and therefore the generated corpus file, reproducible between runs.
files = sorted(
    (name for name in os.listdir(directory) if name.endswith('.pdf')),
    key=str.lower,
)

In [10]:
# Show the PDF file names selected for processing.
print(files)

['1.pdf', '10149-32546-2-PB.pdf', '10296-32679-2-PB.pdf', '10333-32678-2-PB.pdf', '10357-32680-2-PB.pdf', '10364-32417-2-PB.pdf', '107_BNI.pdf', '11825-33574-2-PB.pdf', '12200-32418-2-PB.pdf', '12212-32416-2-PB.pdf', '12223-32450-2-PB.pdf', '12233-32547-3-PB.pdf', '12273-33580-2-PB.pdf', '12277-33576-2-PB.pdf', '12285-33570-2-PB.pdf', '12310-33578-2-PB.pdf', '12359-32781-2-PB.pdf', '12360-33572-2-PB.pdf', '12376-32681-2-PB.pdf', '12399-33573-2-PB.pdf', '12413-34474-2-PB.pdf', '12416-33571-2-PB.pdf', '12446-33579-2-PB.pdf', '12458-32677-2-PB.pdf', '2.pdf', '3.pdf', '4.pdf', '5.pdf', '6.pdf', '7.pdf', '8.pdf', '9.pdf', 'avidreaders.ru__diabet-mify-i-realnost.pdf', 'Endokrinologia_Potemkin_V_V_1986.pdf', 'Внутричерепная_гипертензия_Ошоров_А_В_,_Савин_И_А_,_Горячев_А_С.pdf', 'Патофизиология_Том_2_5_е_издание_Новицкий_В_В_,_Уразова_О_И_2020.pdf']


In [11]:
# Build the training corpus: one line of normalized tokens per PDF.
# The file names are relative, so this relies on the working directory being
# the folder that was listed into `files`.
# - `with` guarantees the output file is closed even if a PDF fails to parse.
# - UTF-8 is set explicitly so the result does not depend on the platform's
#   default encoding; read the file back with encoding='utf-8'.
with open('text_for_embedding.txt', 'w', encoding='utf-8') as f:
    for file_name in files:
        text = extract_text_from_pdf(file_name)
        text_to_file(text, f)