# Colab Notebook - Detect ESRS in a pdf




# Manage Packages

## Import standard packages

In [27]:
import os
import time as time
import pandas as pd
import requests

In [28]:
# Ignore InsecureRequestWarning for the whole notebook: every request below
# is sent with verify=False (presumably to keep cell outputs free of
# per-request TLS warnings — re-enable verification when the server has a
# valid certificate).
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning
warnings.simplefilter("ignore", InsecureRequestWarning)

# Use ESRS Prospector API

## Globals

In [29]:
# Show the kernel's current working directory: the upload cell opens the PDF
# through a path relative to it ('./' + filename + '.pdf').
os.getcwd()

'/var/home/ikarius/Projects/portail-rse/portail-rse-externe/esg-api/notebook'

In [30]:
# Base name of the PDF to process, WITHOUT the ".pdf" extension: the upload
# cell builds the path as './' + filename + '.pdf'.
filename = "YOUR_PDF_FILE_NAME"

In [31]:
# Base URL of the API, without a trailing slash: endpoints are appended
# directly, e.g. api_url + "/ping" or api_url + "/upload".
api_url = "https://YOUR_SERVERNAME.COM"

## Get API status

In [32]:
# Health check: call the API's /ping endpoint and print the JSON it returns.
try:
    t0 = time.time()
    # NOTE: verify=False disables TLS certificate verification.
    # timeout (seconds) keeps the cell from hanging forever on an unresponsive
    # server; without it requests waits indefinitely.
    response = requests.get(api_url + "/ping", verify=False, timeout=30)
    # requests only raises HTTPError from raise_for_status(); without this
    # call a 4xx/5xx answer would be treated as a success.
    response.raise_for_status()
    print(response.json())
    print("       - Elapsed time :", round(time.time()-t0), "s")
except requests.exceptions.RequestException as err:
    # HTTPError, ConnectionError and Timeout are all subclasses of
    # RequestException, so a single handler prints any of them.
    print(err)

{'status': {'code': 0, 'msg': 'API is alive'}}
       - Elapsed time : 0 s


## Post a pdf file

### Send a PDF to API

In [33]:
# Upload the PDF to the API's /upload endpoint and keep the returned 'pdfkey',
# which identifies the document in every later call.
filepath = './'+ filename +'.pdf'

# Reset the key first so a failed upload cannot silently leave the key of a
# previous run in place.
pdfkey_dg = None

try:

    t0 = time.time()
    url = api_url+'/upload'

    # The context manager closes the file handle even when the POST fails
    # (a bare open() inside the dict literal was never closed).
    with open(filepath, 'rb') as pdf_file:
        # NOTE(review): the third tuple item is the content type of the file
        # part; "application/pdf" may be more accurate — confirm with the API
        # before changing it.
        mp = {'file':(filepath, pdf_file, "multipart/form-data")}
        # NOTE: verify=False disables TLS certificate verification.
        # timeout (seconds) is generous because the whole file is transferred.
        response = requests.post(url, files=mp, verify=False, timeout=120)

    # requests only raises HTTPError from raise_for_status(); without this
    # call the HTTPError handler below could never run.
    response.raise_for_status()

    resp_dict = response.json()
    pdfkey_dg = resp_dict.get('pdfkey')
    print()
    print("Status Code", response.status_code)
    print("JSON Response ", resp_dict)
    print('pdfkey : -->', pdfkey_dg ,"<--")
    print("       - Elapsed time :", round(time.time()-t0), "s")

# The numeric prefix tells which kind of failure occurred.
except requests.exceptions.HTTPError as errh:
    print("1",errh)
except requests.exceptions.ConnectionError as errc:
    print("2",errc)
except requests.exceptions.Timeout as errt:
    print("3",errt)
except requests.exceptions.RequestException as err:
    print("4",err)


Status Code 200
JSON Response  {'pdfkey': 'h_q0ivao', 'status': {'code': 0, 'msg': 'PDF is uploaded'}}
pdfkey : --> h_q0ivao <--
       - Elapsed time : 4 s


In [34]:
# Parallel lists of API pdf keys and document names (only one document here).
# pdfkey_dg is the key read from the upload response; the cells below loop
# over pdfkeys.
pdfkeys = [pdfkey_dg]
pdfnames = [filename]

## Convert PDFs to texts

In [35]:
# Ask the API to convert each uploaded PDF to text (/pdf2txt) and print the
# JSON answer for every key.
for k in pdfkeys:

    url = api_url+"/pdf2txt?pdfkey="+k

    try:
        t0 = time.time()
        # NOTE: verify=False disables TLS certificate verification.
        # timeout (seconds) prevents an endless wait; raise it if conversion
        # of large documents takes longer.
        response = requests.get(url, verify=False, timeout=300)
        # requests only raises HTTPError from raise_for_status(); without
        # this call a 4xx/5xx answer would be treated as a success.
        response.raise_for_status()
        print(response.json())
        print("       - Elapsed time :", round(time.time()-t0), "s")

    except requests.exceptions.RequestException as err:
        # HTTPError, ConnectionError and Timeout are all subclasses of
        # RequestException; report the failure and go on with the next key.
        print(err)



{'nbtexts': 394, 'status': {'code': 0, 'msg': 'PDF is converted to TXT'}}
       - Elapsed time : 1 s


## Predict ESRS from Texts (in background)


In [43]:
# Trigger the ESRS prediction (/esrspredict) for each pdf key and print the
# JSON answer.
for k in pdfkeys:

    print()
    print(" - pdf_key :", k)

    url = api_url+"/esrspredict?pdfkey="+k

    try:
        t0 = time.time()
        # NOTE: verify=False disables TLS certificate verification.
        # timeout (seconds) keeps the cell from hanging on an unresponsive server.
        response = requests.get(url, verify=False, timeout=60)
        # requests only raises HTTPError from raise_for_status(); without
        # this call a 4xx/5xx answer would be treated as a success.
        response.raise_for_status()
        print(response.json())
        print("       - Elapsed time :", round(time.time()-t0), "s")

    except requests.exceptions.RequestException as err:
        # HTTPError, ConnectionError and Timeout are all subclasses of
        # RequestException; report the failure and go on with the next key.
        print(err)


 - pdf_key : h_q0ivao
{'status': {'code': 0, 'msg': 'Task started'}}
       - Elapsed time : 1 s


## Check the pdfkey activity

In [44]:
# Query /checkactivetask for each pdf key and print the JSON answer.
for k in pdfkeys:

    url = api_url+"/checkactivetask?pdfkey="+k
    print(url+"\n")

    try:
        t0 = time.time()
        # NOTE: verify=False disables TLS certificate verification.
        # timeout (seconds) keeps the cell from hanging on an unresponsive server.
        response = requests.get(url, verify=False, timeout=30)
        # requests only raises HTTPError from raise_for_status(); without
        # this call a 4xx/5xx answer would be treated as a success.
        response.raise_for_status()
        print(response.json())
        print("       - Elapsed time :", round(time.time()-t0), "s")

    except requests.exceptions.RequestException as err:
        # HTTPError, ConnectionError and Timeout are all subclasses of
        # RequestException; report the failure and go on with the next key.
        print(err)


https://ns31445064.ip-141-94-97.eu/checkactivetask?pdfkey=h_q0ivao

{'status': {'code': 0, 'msg': 'Task is checked'}, 'task_status': '1 active'}
       - Elapsed time : 1 s


## Check the number of tasks in background

In [47]:
# Query /getnbactivetasks and print the JSON answer.
url = api_url+"/getnbactivetasks"
print(url+"\n")

try:
    t0 = time.time()
    # NOTE: verify=False disables TLS certificate verification.
    # timeout (seconds) keeps the cell from hanging on an unresponsive server.
    response = requests.get(url, verify=False, timeout=30)
    # requests only raises HTTPError from raise_for_status(); without this
    # call a 4xx/5xx answer would be treated as a success.
    response.raise_for_status()
    print(response.json())
    print("       - Elapsed time :", round(time.time()-t0), "s")

except requests.exceptions.RequestException as err:
    # HTTPError, ConnectionError and Timeout are all subclasses of
    # RequestException, so a single handler prints any of them.
    print(err)

https://ns31445064.ip-141-94-97.eu/getnbactivetasks

{'nb_tasks': '0', 'status': {'code': 0, 'msg': 'Number of tasks returned'}}
       - Elapsed time : 0 s


## Get Texts

In [48]:
from io import StringIO

# Extracted texts per pdf key: {pdfkey: DataFrame parsed from the CSV
# returned by /gettxtfile}.
texts_pd_key = {}

for k in pdfkeys:

    print()
    print(" - pdf_key :", k)

    url = api_url+"/gettxtfile?pdfkey="+k

    try:
        t0 = time.time()
        # NOTE: verify=False disables TLS certificate verification.
        # timeout (seconds) keeps the cell from hanging on an unresponsive server.
        response = requests.get(url, verify=False, timeout=60)
        # Stop here on a 4xx/5xx answer: otherwise the error body would be
        # handed to pd.read_csv as if it were the texts file.
        response.raise_for_status()
        # The body is decoded explicitly as UTF-8 before parsing.
        data = StringIO(response.content.decode('utf-8'))
        texts_pd_key[k] = pd.read_csv(data)
        print("   - nb texts :", len(texts_pd_key[k]))
        print("       - Elapsed time :", round(time.time()-t0), "s")

    except requests.exceptions.RequestException as err:
        # HTTPError, ConnectionError and Timeout are all subclasses of
        # RequestException; report the failure and go on with the next key.
        print(err)


 - pdf_key : h_q0ivao
   - nb texts : 394
       - Elapsed time : 0 s


In [49]:
# Display the texts DataFrame of the first pdf key (.get returns None if that
# key was never fetched); append .TEXTS to show only the text column.
texts_pd_key.get(pdfkeys[0])

Unnamed: 0,PAGES,TEXTS
0,2,Cette année s’est avérée être une périodede dé...
1,2,"Dans cet environnement complexe, notre Groupe ..."
2,2,"Depuis deux décennies, notre participation au ..."
3,2,La signature d’un accord de transition écologi...
4,2,Les récentes inondationsque nous avons subies ...
...,...,...
389,82,Ce rapport développement durable publié en jui...
390,82,Bien que la rédaction de ce rapport soit un ex...
391,82,"Arc Holdings S.A.S., dont le siège est situé à..."
392,82,Les principes fondamentaux de définition et de...


## Get predictions



In [51]:
from io import StringIO

# Predictions per pdf key: {pdfkey: DataFrame parsed from the CSV returned by
# /getpredsfile}.
preds_pd_key = {}

for k in pdfkeys:
    print()
    print(" - pdf_key :", k)

    url = api_url+"/getpredsfile?pdfkey="+k

    try:
        t0 = time.time()
        # NOTE: verify=False disables TLS certificate verification.
        # timeout (seconds) keeps the cell from hanging on an unresponsive server.
        response = requests.get(url, verify=False, timeout=60)
        # Stop here on a 4xx/5xx answer: otherwise the error body would be
        # handed to pd.read_csv as if it were the predictions file.
        response.raise_for_status()
        # The body is decoded explicitly as UTF-8 before parsing.
        data = StringIO(response.content.decode('utf-8'))
        df = pd.read_csv(data)
        # Keep the frame under its key: the dict was previously created but
        # never filled, so only the last loop's `df` survived.
        preds_pd_key[k] = df
        print("       - Elapsed time :", round(time.time()-t0), "s")
        print(df.head())

    except requests.exceptions.RequestException as err:
        # HTTPError, ConnectionError and Timeout are all subclasses of
        # RequestException; report the failure and go on with the next key.
        print(err)


 - pdf_key : h_q0ivao
       - Elapsed time : 0 s
   PAGES                                              TEXTS  \
0      2  Cette année s’est avérée être une périodede dé...   
1      2  Dans cet environnement complexe, notre Groupe ...   
2      2  Depuis deux décennies, notre participation au ...   
3      2  La signature d’un accord de transition écologi...   
4      2  Les récentes inondationsque nous avons subies ...   

                                    ESRS  
0                               Non ESRS  
1        ESRS G1 - Conduite des affaires  
2        ESRS G1 - Conduite des affaires  
3        ESRS E1 : Changement climatique  
4  ESRS E4 - Biodiversité et écosystèmes  


## Clean pdf_key


In [52]:
# Call /clean for each pdf key and print the JSON answer.
for k in pdfkeys:

    print()
    print(" - pdf_key :", k)

    url = api_url+"/clean?pdfkey="+k

    try:
        t0 = time.time()
        # NOTE: verify=False disables TLS certificate verification.
        # timeout (seconds) keeps the cell from hanging on an unresponsive server.
        response = requests.get(url, verify=False, timeout=60)
        # requests only raises HTTPError from raise_for_status(); without
        # this call a 4xx/5xx answer would be treated as a success.
        response.raise_for_status()
        print(response.json())
        print("       - Elapsed time :", round(time.time()-t0), "s")

    except requests.exceptions.RequestException as err:
        # HTTPError, ConnectionError and Timeout are all subclasses of
        # RequestException; report the failure and go on with the next key.
        print(err)


 - pdf_key : h_q0ivao
{'status': {'code': 0, 'msg': 'Cleaning done'}}
       - Elapsed time : 0 s


# End of game