In [1]:
%pip install elasticsearch==7.17.2 PyMuPDF==1.19.1

Collecting elasticsearch==7.17.2
  Downloading elasticsearch-7.17.2-py2.py3-none-any.whl (385 kB)
     |████████████████████████████████| 385 kB 7.3 MB/s            
[?25hCollecting PyMuPDF==1.19.1
  Downloading PyMuPDF-1.19.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (8.7 MB)
     |████████████████████████████████| 8.7 MB 48.8 MB/s            
Installing collected packages: PyMuPDF, elasticsearch
Successfully installed PyMuPDF-1.19.1 elasticsearch-7.17.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pprint
from datetime import datetime
from elasticsearch import Elasticsearch


es = Elasticsearch('http://elasticsearch-master:9200')
print(es)
print(es.info())

doc = {
    'author': 'author_name',
    'text': 'Interesting content...',
    'timestamp': datetime.now(),
}
resp = es.index(index="test-index", id=1, document=doc)
print(resp['result'])

<Elasticsearch([{'host': 'elasticsearch-master', 'port': 9200}])>
{'name': 'elasticsearch-master-0', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'bFpuY_V7RHSbzX5GRz6m6Q', 'version': {'number': '7.13.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'c5f60e894ca0c61cdbae4f5a686d9f08bcefc942', 'build_date': '2021-07-14T18:33:36.673943207Z', 'build_snapshot': False, 'lucene_version': '8.8.2', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}
updated




In [3]:
import requests

# https://www.projets-environnement.gouv.fr/page/fiche/?q=recordsid:2019396615
url = 'https://osmose.numerique.gouv.fr/front/publicLink/publicDownload.jsp?id=e1b3de34-46d2-4f71-b84e-cb3a081a55a8c2fe4367-d254-42df-8bfa-543f87ee4a91'
with open("Etude d'impact Projet PV GUELTAS.pdf", 'wb') as f:
    f.write(requests.get(url).content)

In [4]:
import fitz
import itertools


def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)

with fitz.open("Etude d'impact Projet PV GUELTAS.pdf") as doc:
    toc = doc.get_toc(False)
    titles = [t for t in toc if t[1][:4] == '3.4.']
    pages = [t[3]['page'] for t in titles]
    for title in titles:
        print(title)
    
    first, last = min(pages), max(pages)
    text = ""
    for page in doc:
        if first <= page.number <= last:
            text += page.get_text()

    # page = doc[26]
    # blocks = page.get_text("dict", flags=11)["blocks"]
    # for block in page.get_text("blocks", sort=False):
    #     if block[6] == 0 and block[4].strip(' \n'):
    #         print(block[4])
    #         print('-'*50)


title_start = [text.find(i[1].split()[0]) for i in titles]
blocks = [text[start:end] for start, end in pairwise(title_start)]
for block in blocks:
    print('-'*50)
    print(block[:200])

[3, '3.4.1 Implantation', 29, {'kind': 1, 'xref': 11838, 'page': 28, 'to': Point(85.0, 518.0), 'zoom': 0.0}]
[3, '3.4.2 Les panneaux modules', 30, {'kind': 1, 'xref': 11842, 'page': 29, 'to': Point(85.0, 569.0), 'zoom': 0.0}]
[3, '3.4.3 Les fondations', 32, {'kind': 1, 'xref': 11844, 'page': 31, 'to': Point(85.0, 579.0), 'zoom': 0.0}]
[3, '3.4.4 Les structures porteuses', 35, {'kind': 1, 'xref': 11846, 'page': 34, 'to': Point(85.0, 522.0), 'zoom': 0.0}]
[3, '3.4.5 Locaux techniques', 37, {'kind': 1, 'xref': 11848, 'page': 36, 'to': Point(85.0, 297.0), 'zoom': 0.0, 'collapse': True}]
[4, '3.4.5.1 Postes de transformation', 37, {'kind': 1, 'xref': 11850, 'page': 36, 'to': Point(85.0, 378.0), 'zoom': 0.0}]
[4, '3.4.5.2 Le poste de livraison', 38, {'kind': 1, 'xref': 11851, 'page': 37, 'to': Point(85.0, 356.0), 'zoom': 0.0}]
[3, '3.4.6 Clôture et système de surveillance', 41, {'kind': 1, 'xref': 11852, 'page': 40, 'to': Point(85.0, 120.0), 'zoom': 0.0, 'collapse': True}]
[4, '3.4.6.1 Clôtu

In [5]:
for index, (title, block) in enumerate(zip(titles, blocks)):
    doc = {
        'title': title[1],
        'page': title[2],
        'text': block,
    }
    resp = es.index(index="test-index1", id=index, document=doc)
    print(resp['result'], end=' ')

print()

updated updated updated updated updated updated updated updated updated updated updated 





In [6]:
res = es.search(index="test-index1", 
                query={"multi_match": 
                        {"fields": ["title^3", "text"], # multiplies the title score by 3
                         "query": "paneaux", 
                         "fuzziness": "AUTO"
                        }
                       },
                highlight={"fields": {"text": {}}}
               )

print('Total hits:', res['hits']['total']['value'], '\n')
for hit in res['hits']['hits']:
    pprint.pprint(hit)

Total hits: 8 

{'_id': '1',
 '_index': 'test-index1',
 '_score': 5.360285,
 '_source': {'page': 30,
             'text': '3.4.2 \n'
                     'Les panneaux - modules \n'
                     'Cette centrale de type « centrale au sol connectée au '
                     'réseau » sera équipée de panneaux à \n'
                     'structure fixe. Elle devrait comporter de l’ordre de 27 '
                     '780 modules d’une puissance de 435Wc \n'
                     'unitaire. \n'
                     'Pour ce projet, une technologie de panneaux de type '
                     'Silicium monocristallin (couche épaisse) est \n'
                     'envisagée à ce stade des études. Comparés à des '
                     'technologies moins chères de type couche \n'
                     'mince, les panneaux cristallins présentent un rendement '
                     'surfacique de 20 % supérieur. Pour une \n'
                     'production énergétique équivalente, le déploie