# Colab Notebook - Detect ESRS in a pdf




# Manage Packages

## Import standard package

In [1]:
import os
import time as time
import pandas as pd
import requests



In [2]:
# Silence urllib3's InsecureRequestWarning: every request in this notebook is
# sent with verify=False, which would otherwise trigger that warning.
# NOTE(review): this only hides the warning; verify=False still disables TLS
# certificate verification for the calls below.
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning
warnings.simplefilter("ignore", InsecureRequestWarning)

# Use ESRS Prospector API

## Globals

In [3]:
# Show the current working directory: the upload cell opens the PDF through a
# path relative to it ('./' + filename + '.pdf').
os.getcwd()

'/Users/francoisbullier/Projets/esg-api/notebook'

In [4]:
# Base name of the PDF to analyse, WITHOUT the ".pdf" extension: the upload
# cell builds the path as './' + filename + '.pdf'.
filename = "YOUR_PDF_FILE_NAME"

In [6]:
# Base URL of the API, without a trailing slash: endpoint paths such as
# "/ping" and "/upload" are appended directly to this value.
api_url = "https://YOUR_SERVERNAME.COM"

## Get API status

In [7]:
# Health check: call GET <api_url>/ping, then print the JSON status payload
# and the elapsed time.
try:
  t0 = time.time()
  # verify=False skips TLS certificate verification.
  # The timeout keeps this cell from hanging indefinitely when the server
  # does not answer; requests applies no timeout by default.
  response = requests.get(api_url+"/ping", verify=False, timeout=30)
  # Turn 4xx/5xx answers into requests.exceptions.HTTPError. requests never
  # raises HTTPError by itself, so without this call an error response would
  # go straight to response.json().
  response.raise_for_status()
  print(response.json())
  print("       - Elapsed time :", round(time.time()-t0), "s")
except requests.exceptions.RequestException as err:
  # RequestException is the base class of HTTPError, ConnectionError and
  # Timeout, so a single handler reports all of them.
  print(err)

{'status': {'code': 0, 'msg': 'API is alive'}}
       - Elapsed time : 0 s


## Post a pdf file

### Send PDF file to API

In [8]:
# Upload the PDF to POST <api_url>/upload and keep the 'pdfkey' returned by the
# API in pdfkey_dg; the following cells use that key to refer to the document.
filepath = './'+ filename +'.pdf'
url = api_url+'/upload'

# Defined before the request so that pdfkey_dg always exists afterwards, even
# when the upload fails (it then stays None instead of being undefined).
pdfkey_dg = None

try:

  t0 = time.time()

  # The context manager closes the file handle as soon as the request is
  # finished, including when the request raises.
  with open(filepath, 'rb') as pdf_file:
    # Multipart tuple: (file name sent to the server, file object, content type).
    mp = {'file':(filepath, pdf_file, "multipart/form-data")}
    response = requests.post(url, files=mp, verify=False)

  # Turn 4xx/5xx answers into HTTPError so that handler "1" below is reachable.
  response.raise_for_status()

  resp_dict = response.json()
  pdfkey_dg = resp_dict.get('pdfkey')
  print()
  print("Status Code", response.status_code)
  print("JSON Response ", resp_dict)
  print('pdfkey : -->', pdfkey_dg ,"<--")
  print("       - Elapsed time :", round(time.time()-t0), "s")

# The numeric prefix tells which kind of failure occurred.
except requests.exceptions.HTTPError as errh:
  print("1",errh)
except requests.exceptions.ConnectionError as errc:
  print("2",errc)
except requests.exceptions.Timeout as errt:
  print("3",errt)
except requests.exceptions.RequestException as err:
  print("4",err)


Status Code 200
JSON Response  {'pdfkey': '9swm9fzb', 'status': {'code': 0, 'msg': 'PDF is uploaded'}}
pdfkey : --> 9swm9fzb <--
       - Elapsed time : 3 s


In [9]:
# Parallel lists: pdfkeys[i] is the key returned by the upload for the document
# named pdfnames[i]. Only one document is handled here; the cells below loop
# over pdfkeys.
pdfkeys = [pdfkey_dg]
pdfnames = [filename]

## Convert PDFs to texts

In [10]:
# Ask the API to convert each uploaded PDF to text (GET /pdf2txt?pdfkey=<key>)
# and print the JSON status it returns.
for k in pdfkeys:

  url = api_url+"/pdf2txt"

  try:
    t0 = time.time()
    # The key goes through `params` so that requests URL-encodes it instead of
    # it being concatenated raw into the query string.
    response = requests.get(url, params={"pdfkey": k}, verify=False)
    # Turn 4xx/5xx answers into HTTPError; requests never raises it otherwise.
    response.raise_for_status()
    print(response.json())
    print("       - Elapsed time :", round(time.time()-t0), "s")

  except requests.exceptions.RequestException as err:
    # Base class of HTTPError, ConnectionError and Timeout.
    print(err)



{'nbtexts': 3572, 'status': {'code': 0, 'msg': 'PDF is converted to TXT'}}
       - Elapsed time : 3 s


## Predict ESRS from Texts


In [11]:
# Run the ESRS prediction on the texts of each PDF
# (GET /esrspredict?pdfkey=<key>) and print the JSON status it returns.
# No timeout is set here: the recorded run of this cell took about 180 s.
for k in pdfkeys:

  print()
  print(" - pdf_key :", k)

  url = api_url+"/esrspredict"

  try:
    t0 = time.time()
    # The key goes through `params` so that requests URL-encodes it.
    response = requests.get(url, params={"pdfkey": k}, verify=False)
    # Turn 4xx/5xx answers into HTTPError; requests never raises it otherwise.
    response.raise_for_status()
    print(response.json())
    print("       - Elapsed time :", round(time.time()-t0), "s")

  except requests.exceptions.RequestException as err:
    # Base class of HTTPError, ConnectionError and Timeout.
    print(err)


 - pdf_key : 9swm9fzb
{'status': {'code': 0, 'msg': 'ESRS predicted with success'}}
       - Elapsed time : 180 s


## Get Texts

In [12]:
from io import StringIO

# Maps each pdf key to the DataFrame parsed from the CSV returned by
# GET /gettxtfile?pdfkey=<key>.
texts_pd_key = {}

for k in pdfkeys:

  print()
  print(" - pdf_key :", k)

  url = api_url+"/gettxtfile"

  try:
    t0 = time.time()
    # The key goes through `params` so that requests URL-encodes it.
    response = requests.get(url, params={"pdfkey": k}, verify=False)
    # Stop on 4xx/5xx instead of handing an error body to pd.read_csv.
    response.raise_for_status()
    # The body is parsed as CSV; decode the raw bytes explicitly as UTF-8.
    data = StringIO(str(response.content,'utf-8'))
    texts_pd_key[k] = pd.read_csv(data)
    print("   - nb texts :", len(texts_pd_key[k]))
    print("       - Elapsed time :", round(time.time()-t0), "s")

  except requests.exceptions.RequestException as err:
    # Base class of HTTPError, ConnectionError and Timeout.
    print(err)


 - pdf_key : 9swm9fzb
   - nb texts : 3572
       - Elapsed time : 1 s


In [13]:
# Show the texts extracted for the first PDF key
# (append .TEXTS to display only the text column).
texts_pd_key.get(pdfkeys[0])

Unnamed: 0,PAGES,TEXTS
0,3,Le document d’enregistrement universel a été d...
1,3,Le document d’enregistrement universel peut êt...
2,3,Le document d’enregistrement universel est une...
3,3,Publicis Groupe accompagne ses clients sur l’e...
4,3,Les clients sont au coeur du modèle du Groupe ...
...,...,...
3567,417,220344 ;les comptes annuels de la Société pour...
3568,418,La version numérique de ce document est confor...
3569,418,Accessible aux personnes déficientes visuelles...
3570,418,Ce document est imprimé en France par un impri...


## Get predictions



In [16]:
from io import StringIO

# Maps each pdf key to the DataFrame parsed from the CSV returned by
# GET /getpredsfile?pdfkey=<key>.
preds_pd_key = {}

for k in pdfkeys:
  print()
  print(" - pdf_key :", k)

  url = api_url+"/getpredsfile"

  try:
    t0 = time.time()
    # The key goes through `params` so that requests URL-encodes it.
    response = requests.get(url, params={"pdfkey": k}, verify=False)
    # Stop on 4xx/5xx instead of handing an error body to pd.read_csv.
    response.raise_for_status()
    # The body is parsed as CSV; decode the raw bytes explicitly as UTF-8.
    data = StringIO(str(response.content,'utf-8'))
    df = pd.read_csv(data)
    # Keep the predictions per key so they stay available after the loop.
    preds_pd_key[k] = df
    print("       - Elapsed time :", round(time.time()-t0), "s")
    # A bare `df.head()` inside a loop renders nothing: only the last
    # expression of a cell is displayed automatically. display() is provided
    # by the Jupyter/IPython kernel.
    display(df.head())

  except requests.exceptions.RequestException as err:
    # Base class of HTTPError, ConnectionError and Timeout.
    print(err)

Unnamed: 0,PAGES,TEXTS,ESRS
0,3,Le document d’enregistrement universel a été d...,Non ESRS
1,3,Le document d’enregistrement universel peut êt...,Non ESRS
2,3,Le document d’enregistrement universel est une...,Non ESRS
3,3,Publicis Groupe accompagne ses clients sur l’e...,ESRS S4 - Consommateurs et utilisateurs finaux
4,3,Les clients sont au coeur du modèle du Groupe ...,ESRS S4 - Consommateurs et utilisateurs finaux


## Clean pdf_key


In [17]:
# Call GET /clean?pdfkey=<key> for each pdf key and print the JSON status the
# API returns.
for k in pdfkeys:

  print()
  print(" - pdf_key :", k)

  url = api_url+"/clean"

  try:

    t0 = time.time()
    # The key goes through `params` so that requests URL-encodes it.
    response = requests.get(url, params={"pdfkey": k}, verify=False)
    # Turn 4xx/5xx answers into HTTPError; requests never raises it otherwise.
    response.raise_for_status()
    print(response.json())
    print("       - Elapsed time :", round(time.time()-t0), "s")

  except requests.exceptions.RequestException as err:
    # Base class of HTTPError, ConnectionError and Timeout.
    print(err)


 - pdf_key : 9swm9fzb
{'status': {'code': 0, 'msg': 'Cleaning done'}}
       - Elapsed time : 0 s


# End of game