In [1]:
%pip install elasticsearch==7.17.2 PyMuPDF==1.19.1 streamlit

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pprint
from datetime import datetime
from elasticsearch import Elasticsearch


# Client for the Elasticsearch service; later cells reuse the name `es`.
es = Elasticsearch('http://elasticsearch-master:9200')
print(es)
cluster_info = es.info()
print(cluster_info)

# Smoke test: store one small document under id 1 and show the reported outcome.
doc = dict(
    author='author_name',
    text='Interesting content...',
    timestamp=datetime.now(),
)
resp = es.index(index="test-index", id=1, document=doc)
print(resp['result'])

<Elasticsearch([{'host': 'elasticsearch-master', 'port': 9200}])>
{'name': 'elasticsearch-master-2', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'bFpuY_V7RHSbzX5GRz6m6Q', 'version': {'number': '7.13.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'c5f60e894ca0c61cdbae4f5a686d9f08bcefc942', 'build_date': '2021-07-14T18:33:36.673943207Z', 'build_snapshot': False, 'lucene_version': '8.8.2', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}
updated




In [3]:
import requests

# Source record: https://www.projets-environnement.gouv.fr/page/fiche/?q=recordsid:2019396615
url = 'https://osmose.numerique.gouv.fr/front/publicLink/publicDownload.jsp?id=e1b3de34-46d2-4f71-b84e-cb3a081a55a8c2fe4367-d254-42df-8bfa-543f87ee4a91'

# Fetch first and only then open the output file, so that a failed request
# neither truncates an existing copy nor stores an HTML error page as a ".pdf".
response = requests.get(url, timeout=60)  # without a timeout the cell could hang indefinitely
response.raise_for_status()
with open("Etude d'impact Projet PV GUELTAS.pdf", 'wb') as f:
    f.write(response.content)

In [19]:
import fitz
import itertools


def pairwise(iterable):
    "Pair each element with its successor: s -> (s0,s1), (s1,s2), (s2, s3), ..."
    current_items, following_items = itertools.tee(iterable)
    # Move the second iterator one step ahead; an empty input simply has nothing to skip.
    try:
        next(following_items)
    except StopIteration:
        pass
    return zip(current_items, following_items)

def get_text_blocks(doc, title, next_title):
    """
    Yield a tuple (x0, y0, x1, y1, text) for each text block found between a
    table-of-contents entry and the entry that follows it.

    Pages are scanned from ``title['page']`` up to ``next_title['page']`` (or up
    to the last page of the document when ``next_title`` has no ``'page'`` key).
    On every page the blocks are visited in increasing ``y0`` order.

    Args:
        doc: document object providing ``page_count`` and ``load_page(page_no)``
            (a fitz.Document in this notebook)
        title (dict): entry whose text is collected; its ``'page'`` and ``'to'``
            (an object with a ``y`` attribute) keys are read
        next_title (dict): the following entry, or an empty dict for the last one
    """
    first_page = title['page']
    last_page = next_title.get('page', doc.page_count - 1)
    for page_no in range(first_page, last_page + 1):
        blocks = list(doc.load_page(page_no).get_text('blocks'))
        blocks.sort(key=lambda block: block[1])
        for x0, y0, x1, y1, text, _block_no, block_type in blocks:
            # Only blocks of type 0 are kept (presumably text as opposed to image blocks — TODO confirm).
            if block_type != 0:
                continue
            # On the starting page, ignore everything that ends above the title's target position.
            if page_no == first_page and y1 < title['to'].y:
                continue
            # Stop at the first block that reaches the next title's target position.
            if next_title and page_no == next_title['page'] and y1 >= next_title['to'].y:
                return
            yield x0, y0, x1, y1, text
    
def get_toc(pdf, add_text_blocks=True):
    """
    Return the table of contents of a PDF document as a list of dictionaries.

    Each entry contains ``level`` (int) and ``title`` (str), every key of the
    destination dictionary returned by ``doc.get_toc(simple=False)`` (the code
    relies on ``page``, used as an integer page index, and ``to``, a point with
    a ``y`` attribute, being among them), and in addition:

        all_titles (list of str): titles of the enclosing entries followed by
            the entry's own title
        text_blocks (list of tuples): ``(x0, y0, x1, y1, text)`` tuples from
            ``get_text_blocks``; only present when ``add_text_blocks`` is true

    Args:
        pdf (str or fitz.Document): path of the PDF, or an already opened document
        add_text_blocks (bool, default=True): add text blocks

    A document opened here from a path is closed before returning; a
    ``fitz.Document`` supplied by the caller is left open.
    """
    owns_doc = not isinstance(pdf, fitz.Document)
    doc = fitz.open(pdf) if owns_doc else pdf
    try:
        toc = [{'level': item[0], 'title': item[1], **item[3]} for item in doc.get_toc(simple=False)]

        all_titles = []
        # Pair every entry with its successor; the trailing {} stands for "no next title".
        for title, next_title in pairwise(toc + [{}]):
            if add_text_blocks:
                # Materialised eagerly so nothing depends on the document after it is closed.
                title['text_blocks'] = list(get_text_blocks(doc, title, next_title))
            # Keep the titles of the (level - 1) enclosing entries, then append this one.
            all_titles = all_titles[:max(title['level'] - 1, 0)] + [title['title']]
            title['all_titles'] = all_titles
        return toc
    finally:
        if owns_doc:
            doc.close()

# Build the table of contents (with text blocks, the default) for the PDF file
# written by the download cell; the following cells read `toc`.
toc = get_toc("Etude d'impact Projet PV GUELTAS.pdf")

In [20]:
# Preview the first ten entries: the title hierarchy (one tab per level),
# then the entry's text, then a separator line.
separator = '_.' * 60
for entry in toc[:10]:
    for depth, heading in enumerate(entry['all_titles']):
        print('\t' * depth + '*** ' + heading)
    body_text = '\n'.join(block[-1] for block in entry['text_blocks'])
    print(body_text)
    print(separator)

*** 1 RESUME NON TECHNIQUE
1 
RESUME NON TECHNIQUE 

_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.
*** 1 RESUME NON TECHNIQUE
	*** 1.1 Présentation synthétique du projet
1.1 Présentation synthétique du projet 

_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.
*** 1 RESUME NON TECHNIQUE
	*** 1.1 Présentation synthétique du projet
		*** 1.1.1 Localisation
1.1.1 Localisation 

La zone d’implantation du projet se situe sur la commune de Gueltas, dans le département du 
Morbihan (56), en région Bretagne. 
Le site d’implantation se situe sur une zone remaniée du site de stockage des déchets à Gueltas. 
Ce dôme, qui a été exploité par Suez Recyclage et Valorisation (Suez RV) de 1995 à 2006 est 
aujourd’hui en suivi d’exploitation. A noter qu’un nouveau site de stockage est en cours 
d’exploitation dans la continuité de ce dôme, à l’Ouest du site. 


In [21]:
# Delete the 'test-index' index. With ignore=[400, 404] the client returns the
# error body instead of raising when the index does not exist (see the 404 output).
# NOTE(review): the indexing and search cells use "test-index1", which this call
# does not touch — confirm which index is meant to be reset here.
es.indices.delete(index='test-index', ignore=[400, 404])




{'error': {'root_cause': [{'type': 'index_not_found_exception',
    'reason': 'no such index [test-index]',
    'index_uuid': '_na_',
    'resource.type': 'index_or_alias',
    'resource.id': 'test-index',
    'index': 'test-index'}],
  'type': 'index_not_found_exception',
  'reason': 'no such index [test-index]',
  'index_uuid': '_na_',
  'resource.type': 'index_or_alias',
  'resource.id': 'test-index',
  'index': 'test-index'},
 'status': 404}

In [22]:
# Index every table-of-contents entry as one document whose id is its position in `toc`.
for section_id, section in enumerate(toc):
    payload = {key: section[key] for key in ('title', 'all_titles', 'page')}  # all pages for viewing the pdf ?
    # The text field is the entry's text blocks joined by newlines.
    payload['text'] = '\n'.join(block[-1] for block in section['text_blocks'])
    outcome = es.index(index="test-index1", id=section_id, document=payload)
    print(outcome['result'], end=' ')

print()

updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated updated 

In [16]:
# Fuzzy full-text query over title and text.
# NOTE(review): "paneaux" looks like a deliberate misspelling of "panneaux" used
# to exercise the fuzziness setting — confirm before "correcting" it.
fuzzy_query = {
    "multi_match": {
        "fields": ["title^3", "text"],  # multiplies the title score by 3
        "query": "paneaux",
        "fuzziness": "AUTO",
    }
}
text_highlight = {"fields": {"text": {}}}
res = es.search(index="test-index1", query=fuzzy_query, highlight=text_highlight)

hits = res['hits']
print('Total hits:', hits['total']['value'], '\n')
for hit in hits['hits']:
    pprint.pprint(hit)

Total hits: 57 

{'_id': '17',
 '_index': 'test-index1',
 '_score': 7.482191,
 '_source': {'page': 29,
             'text': '3.4.2 \n'
                     'Les panneaux - modules \n'
                     '\n'
                     'Cette centrale de type « centrale au sol connectée au '
                     'réseau » sera équipée de panneaux à \n'
                     'structure fixe. Elle devrait comporter de l’ordre de 27 '
                     '780 modules d’une puissance de 435Wc \n'
                     'unitaire. \n'
                     'Pour ce projet, une technologie de panneaux de type '
                     'Silicium monocristallin (couche épaisse) est \n'
                     'envisagée à ce stade des études. Comparés à des '
                     'technologies moins chères de type couche \n'
                     'mince, les panneaux cristallins présentent un rendement '
                     'surfacique de 20 % supérieur. Pour une \n'
                     'production énergét

In [8]:
# Copy the first 2 pages of the study into a new, empty document and save it as tmp.pdf.
with fitz.open("Etude d'impact Projet PV GUELTAS.pdf") as source_pdf:
    with fitz.open() as excerpt_pdf:
        excerpt_pdf.insert_pdf(source_pdf, from_page=0, to_page=1)
        excerpt_pdf.save("tmp.pdf")