In [31]:
# Install the packages listed in requirements.txt; %pip (rather than !pip)
# targets the environment of the running kernel.
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [32]:
import pprint
from datetime import datetime
from pathlib import Path
import requests
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Index, analyzer
from elasticsearch_dsl.query import MultiMatch

### Download PDF

In [33]:
# Local file name of the impact study; get_toc() is later called with this variable.
pdf = "Etude d'impact Projet PV GUELTAS.pdf"

# Download only when the file is not already present next to the notebook.
if not Path(pdf).exists():
    # https://www.projets-environnement.gouv.fr/page/fiche/?q=recordsid:2019396615
    url = 'https://osmose.numerique.gouv.fr/front/publicLink/publicDownload.jsp?id=e1b3de34-46d2-4f71-b84e-cb3a081a55a8c2fe4367-d254-42df-8bfa-543f87ee4a91'
    # Fetch first and fail loudly on HTTP errors: the file is only created once
    # the download succeeded, so a failed request never leaves an empty or bogus
    # file behind that the exists() check above would accept on the next run.
    download = requests.get(url, timeout=60)
    download.raise_for_status()
    Path(pdf).write_bytes(download.content)

In [34]:
import fitz
import itertools


def pairwise(iterable):
    """Return an iterator over overlapping consecutive pairs of `iterable`.

    s -> (s0,s1), (s1,s2), (s2, s3), ...

    An input with fewer than two elements produces no pairs.
    """
    current_items, following_items = itertools.tee(iterable, 2)
    # Shift the second copy one element ahead; the None default keeps an
    # empty input from raising StopIteration here.
    next(following_items, None)
    return zip(current_items, following_items)

def get_text_blocks(doc, title, next_title):
    """
    Yield a tuple (x0, y0, x1, y1, text) for each text block of one TOC section.

    Pages are scanned from the section's own page up to the page of the next
    TOC entry (or the last page of the document when there is no next entry).
    On each page the blocks are visited in increasing y0 order.

    Args:
        doc: document exposing `page_count` and `load_page(page_no)`
            (a fitz.Document when called from get_toc)
        title (dict): TOC entry of the section; needs 'page' (page index passed
            to load_page) and 'to' (object with a `y` attribute)
        next_title (dict): following TOC entry with the same keys, or an empty
            dict when `title` is the last entry

    Yields:
        tuple: (x0, y0, x1, y1, text) of every block with block_type 0 that is
        not entirely above the section title and comes before the next title.
    """
    first_page = title['page']
    last_page = next_title.get('page', doc.page_count - 1)

    for page_no in range(first_page, last_page + 1):
        raw_blocks = doc.load_page(page_no).get_text('blocks')
        ordered_blocks = sorted(raw_blocks, key=lambda block: block[1])  # top to bottom by y0

        for x0, y0, x1, y1, text, _block_no, block_type in ordered_blocks:
            # Keep only blocks of type 0 (the text blocks).
            if block_type != 0:
                continue
            # On the section's first page, drop blocks that end above the title.
            if page_no == title['page'] and y1 < title['to'].y:
                continue
            # Stop for good at the first block reaching down to the next title.
            if next_title and page_no == next_title['page'] and y1 >= next_title['to'].y:
                return
            yield x0, y0, x1, y1, text
    
def get_toc(pdf, add_text_blocks=True):
    """
    Return the table of contents of a PDF document as a list of dictionaries.

    Each entry contains:
        level (int), title (str), the keys of the destination dict returned by
        doc.get_toc(simple=False) (this notebook relies on `page`, the page
        index passed to load_page, and `to`, a point with a `y` attribute),
        all_titles (list of titles from the top-level ancestor down to this
        entry) and, when requested, text_blocks (list of
        (x0, y0, x1, y1, text) tuples).

    Args:
        pdf (str or fitz.Document): path of a PDF file or an already open
            document. A document opened here from a path is closed before
            returning; a document passed in by the caller is left open.
        add_text_blocks (bool, default=True): add the text blocks of each
            section under the 'text_blocks' key

    Returns:
        list[dict]: one dictionary per TOC entry, in document order.
    """
    opened_here = not isinstance(pdf, fitz.Document)
    doc = fitz.open(pdf) if opened_here else pdf
    try:
        toc = [{'level': item[0], 'title': item[1], **item[3]} for item in doc.get_toc(simple=False)]

        all_titles = []
        for ititle, (title, next_title) in enumerate(pairwise(toc + [{}])): # add empty title for last one
            if add_text_blocks:
                # Materialise the generator now: the document may be closed below.
                toc[ititle]['text_blocks'] = list(get_text_blocks(doc, title, next_title))
            # Keep the ancestors above the current level, then append this title.
            # max() clamps an unexpected level < 1 to "no ancestors".
            all_titles = all_titles[:max(title['level'] - 1, 0)] + [title['title']]
            toc[ititle]['all_titles'] = all_titles
        return toc
    finally:
        # Release the file handle only if this function opened the document.
        if opened_here:
            doc.close()

# Build the table of contents of the PDF once; add_text_blocks defaults to True,
# so every entry also carries the text blocks of its section.
toc = get_toc(pdf)

In [35]:
# Preview the first ten TOC entries: title hierarchy, section text, separator.
for entry in toc[:10]:
    # One line per title of the hierarchy, indented by one tab per depth.
    for depth, heading in enumerate(entry['all_titles']):
        indent = '\t' * depth
        print(indent + '*** ' + heading)
    # The text is the last element of each block tuple.
    section_text = '\n'.join(block[-1] for block in entry['text_blocks'])
    print(section_text)
    print('_.' * 60)

*** 1 RESUME NON TECHNIQUE
1 
RESUME NON TECHNIQUE 

_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.
*** 1 RESUME NON TECHNIQUE
	*** 1.1 Présentation synthétique du projet
1.1 Présentation synthétique du projet 

_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.
*** 1 RESUME NON TECHNIQUE
	*** 1.1 Présentation synthétique du projet
		*** 1.1.1 Localisation
1.1.1 Localisation 

La zone d’implantation du projet se situe sur la commune de Gueltas, dans le département du 
Morbihan (56), en région Bretagne. 
Le site d’implantation se situe sur une zone remaniée du site de stockage des déchets à Gueltas. 
Ce dôme, qui a été exploité par Suez Recyclage et Valorisation (Suez RV) de 1995 à 2006 est 
aujourd’hui en suivi d’exploitation. A noter qu’un nouveau site de stockage est en cours 
d’exploitation dans la continuité de ce dôme, à l’Ouest du site. 


In [None]:
# Create a tiny PDF to test the visualisation

# Copy the first two pages of the source document into a new, empty document and
# save it as tmp.pdf. The source is opened through `pdf` so this cell always uses
# the same file as the rest of the notebook; both documents are closed when the
# with-block exits.
with fitz.open(pdf) as doc1, fitz.open() as doc2:
    doc2.insert_pdf(doc1, from_page=0, to_page=1)  # first 2 pages
    doc2.save("tmp.pdf")

### Elasticsearch client and configuration

In [50]:
# Connect to the Elasticsearch node and print the client and the cluster info
# as a quick connectivity check.
es = Elasticsearch('http://elasticsearch-master:9200')
print(es)
print(es.info())

# Describe the target index and attach an analyzer named 'default' of type 'french'
# (presumably this makes French analysis the default for text fields - confirm
# against the created index settings).
index = Index('pdf-index')
index.analyzer(analyzer('default', 'french'))

# delete the index, ignore if it doesn't exist
# NOTE: this drops any previously indexed documents every time the cell is run.
index.delete(using=es, ignore=404)

# create the index in elasticsearch
index.create(using=es)

<Elasticsearch([{'host': 'elasticsearch-master', 'port': 9200}])>
{'name': 'elasticsearch-master-1', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'bFpuY_V7RHSbzX5GRz6m6Q', 'version': {'number': '7.13.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'c5f60e894ca0c61cdbae4f5a686d9f08bcefc942', 'build_date': '2021-07-14T18:33:36.673943207Z', 'build_snapshot': False, 'lucene_version': '8.8.2', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}





{'acknowledged': True, 'shards_acknowledged': True, 'index': 'pdf-index'}

In [51]:
# Index one Elasticsearch document per TOC entry, using the entry's position in
# `toc` as the document id. The loop variables are deliberately not called
# `index`, so the Index object created in the configuration cell is not
# rebound to an int.
for toc_position, item in enumerate(toc):
    section_doc = {
        'title': item['title'],
        'all_titles': item['all_titles'],
        'page': item['page'], # all pages for viewing the pdf ?
        # The text is the last element of each (x0, y0, x1, y1, text) block.
        'text': '\n'.join(block[-1] for block in item['text_blocks']),
    }
    resp = es.index(index="pdf-index", id=toc_position, document=section_doc)
    print(resp['result'], end=' ')

print()

created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created created 

In [58]:
# Fuzzy full-text query over the section titles and texts.
multi_match = MultiMatch(query="électricité",
                         fields=["title^3", "text"],  # multiplies title score by 3
                         fuzziness="AUTO")
# Ask for highlighted fragments of the 'text' field (fragment_size=200).
search = Search(using=es, index="pdf-index").query(multi_match).highlight("text", fragment_size=200)
response = search.execute()
nhits = response.hits.total.value

print('Total hits:', nhits, '\n')
# Iterating the response yields only the returned hits, which can be fewer
# than the total number of matches.
for hit in response:
    pprint.pprint(hit)

# Show the highlighted fragments of the best hit. Guard against an empty result
# and against a top hit without a 'text' highlight (e.g. matched on the title
# only), either of which would otherwise raise here.
top_hit = next(iter(response), None)
top_highlight = getattr(getattr(top_hit, 'meta', None), 'highlight', None)
if top_highlight is not None and 'text' in top_highlight:
    print(top_highlight['text'])
else:
    print('No highlighted text fragment for the top hit.')

Total hits: 15 

<Hit(pdf-index/7): {'title': '2.1 Présentation générale d’ENGIE', 'all_titles':...}>
<Hit(pdf-index/14): {'title': '3.3 Généralités – Principe de base du photovoltaï...}>
<Hit(pdf-index/16): {'title': '3.4.1 Implantation', 'all_titles': ['3 DESCRIPTIO...}>
<Hit(pdf-index/60): {'title': '5.1.2.3 Le Schéma Régional Climat Air Énergie (SR...}>
<Hit(pdf-index/140): {'title': '8.1.1.2 Phase d’exploitation : effets permanents'...}>
<Hit(pdf-index/47): {'title': '3.9 Réglementation applicable', 'all_titles': ['3...}>
<Hit(pdf-index/56): {'title': '5.1.1 Contexte énergétique et réglementaire à l’é...}>
<Hit(pdf-index/66): {'title': '6 DESCRIPTIONS DES ASPECTS PERTINENTS DE L’ETAT A...}>
<Hit(pdf-index/59): {'title': '5.1.2.2 La programmation pluriannuelle de l’énerg...}>
<Hit(pdf-index/13): {'title': '3.2 Nature et objet de l’opération', 'all_titles'...}>
['2.1 Présentation générale d’ENGIE \n\nENGIE est un acteur mondial de l’énergie et de la transition énergétique, expert da




### Test analyzer

In [55]:
from elasticsearch.client import IndicesClient
# Run the "french" analyzer on a sample sentence to inspect the tokens it produces.
analyze_request = {
    "analyzer": "french",
    # Alternative kept for experimentation (a custom chain instead of "analyzer"):
    # "tokenizer": "standard",
    # "filter":  [ "lowercase", "asciifolding" ],
    "text":     "L''électricité produite par mes panneaux solaires, jusqu'à ce que le soleil tombe'",
}

ic = IndicesClient(es)
# Last expression of the cell, so the analysis result is displayed as its output.
ic.analyze(body=analyze_request)

{'tokens': [{'token': 'electricit',
   'start_offset': 3,
   'end_offset': 14,
   'type': '<ALPHANUM>',
   'position': 1},
  {'token': 'produit',
   'start_offset': 15,
   'end_offset': 23,
   'type': '<ALPHANUM>',
   'position': 2},
  {'token': 'paneau',
   'start_offset': 32,
   'end_offset': 40,
   'type': '<ALPHANUM>',
   'position': 5},
  {'token': 'solair',
   'start_offset': 41,
   'end_offset': 49,
   'type': '<ALPHANUM>',
   'position': 6},
  {'token': 'soleil',
   'start_offset': 69,
   'end_offset': 75,
   'type': '<ALPHANUM>',
   'position': 11},
  {'token': 'tomb',
   'start_offset': 76,
   'end_offset': 81,
   'type': '<ALPHANUM>',
   'position': 12}]}