In [4]:
objective = 'Process the EU sustainable finance taxonomy PDF file and extract and clean all the paragraphs in the document'
print(objective)

Process the EU sustainable finance taxonomy PDF file and extract and clean all the paragraphs in the document


### Extracting Paragraphs from the EU Taxonomy Document

In [7]:
pdf_path = r'https://lp-prod-resources.s3.us-west-2.amazonaws.com/214/200309-sustainable-finance-teg-final-report-taxonomy-annexes_en.pdf'

In [None]:
!pip install textract

In [5]:
import textract
import requests
import re
import pandas as pd

In [7]:
filename = "taxonamy.pdf"

In [None]:
response = requests.get(pdf_path)
#filepath = os.path.join("PDFs", filename)
with open(filename, "wb") as f:
    f.write(response.content)

In [9]:
text = textract.process(filename).decode('utf-8').strip()

In [10]:
len(text)

1320993

In [11]:
text[0:200]

'Updated methodology & Updated Technical Screening Criteria\n- 1-\n\nMarch 2020\n\n\x0cAbout this report\nThis document includes an updated Part B: Methodology from the June 2019 report and an updated Part\nF: F'

In [12]:
text[200:400]

'ull list of technical screening criteria. The other original sections from the June 2019 report can be\nfound as labelled in the June 2019 report.\nPART A\n\nExplanation of the Taxonomy approach. This sec'

In [13]:
paragraphs = re.split(r"\s*?\n\s*?\n\s*?", text)

In [14]:
len(paragraphs)

8983

In [15]:
paragraphs[0:5]

['Updated methodology & Updated Technical Screening Criteria\n- 1-',
 'March 2020',
 '\x0cAbout this report\nThis document includes an updated Part B: Methodology from the June 2019 report and an updated Part\nF: Full list of technical screening criteria. The other original sections from the June 2019 report can be\nfound as labelled in the June 2019 report.\nPART A',
 'Explanation of the Taxonomy approach. This section sets out the role and importance of\nsustainable finance in Europe from a policy and investment perspective, the rationale for\nthe development of an EU Taxonomy, the daft regulation and the mandate of the TEG.',
 'PART B']

In [16]:
paragraphs = [p for p in paragraphs if len(p) > 150]

In [17]:
len(paragraphs)

1952

In [18]:
paragraphs[0:5]

['\x0cAbout this report\nThis document includes an updated Part B: Methodology from the June 2019 report and an updated Part\nF: Full list of technical screening criteria. The other original sections from the June 2019 report can be\nfound as labelled in the June 2019 report.\nPART A',
 'Explanation of the Taxonomy approach. This section sets out the role and importance of\nsustainable finance in Europe from a policy and investment perspective, the rationale for\nthe development of an EU Taxonomy, the daft regulation and the mandate of the TEG.',
 'Methodology. This explains the methodologies for developing technical screening\ncriteria for climate change mitigation objectives, adaptation objectives and ‘do no\nsignificant harm’ to other environmental objectives in the legislative proposal.\nThis has been updated since 2019.',
 'Next steps for the Taxonomy. This section elaborates on unresolved issues and\npotential ways forward for the Taxonomy and the technical work of the Platform o

In [19]:
def clean_paragraph(text):
    text = text.replace("\n", " ").replace("  ", " ").strip(" ")
    return re.sub(r'[^\w\s]', '', text).strip(" ")

In [21]:
df_paragraphs = pd.DataFrame(paragraphs, columns=['paragraph'])

In [22]:
df_paragraphs.head()

Unnamed: 0,paragraph
0,About this report\nThis document includes an ...
1,Explanation of the Taxonomy approach. This sec...
2,Methodology. This explains the methodologies f...
3,Next steps for the Taxonomy. This section elab...
4,Full list of technical screening criteria. Thi...


In [23]:
df_paragraphs.shape

(1952, 1)

In [24]:
df_paragraphs = df_paragraphs['paragraph'].apply(clean_paragraph)

In [25]:
df_paragraphs.head()

0    
About this report This document includes an u...
1    Explanation of the Taxonomy approach This sect...
2    Methodology This explains the methodologies fo...
3    Next steps for the Taxonomy This section elabo...
4    Full list of technical screening criteria This...
Name: paragraph, dtype: object

In [26]:
df_paragraphs.shape

(1952,)

### Question Paragraph Matching

#### Build a text vectorizer that finds the best matching paragraph for the provided set of questions and qualitatively evaluates the results

In [29]:
questions = [
    ["What fuel is used for manufacturing of chlorine?"],
    ["What metric is used for evaluating emission?"],
    ["How can carbon emission of the processes of cement clinker be reduced?"],
    ["How is the Weighted Cogeneration Threshold calculated?"],
    ["What is carbon capture and sequestration?"],
    ["What stages does CCS consist of?"],
    ["What should be the average energy consumption of a water supply system?"],
    ["What are examples of sludge treatments?"],
    ["How is the process of anaerobic digestion?"],
    ["How is reforestation defined?"],
    ["What is the threshold of emssion for inland passenger water transport?"], 
    ["What are the requirements of reporting for electricity generation from natural gas where there might be fugative emissions?"]
]

In [30]:
questions[0]

['What fuel is used for manufacturing of chlorine?']

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
#Initiate a TF-IDF model trained on the paragraphs from the previous milestone by using the TfidfVectorizer class 
# from the scikit-learn library. 
#This model will provide a representation for each paragraph or each question.

In [42]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_paragraphs)
tfidf_tokens = vectorizer.get_feature_names_out()
#print(X)
#print(vectorizer.get_feature_names_out())
print(X.shape)
print(len(tfidf_tokens))

(1952, 6708)
6708


In [56]:
from sklearn.metrics.pairwise import linear_kernel

In [58]:
vecs = []
for q in questions:
    vec = vectorizer.transform(q)
    rank = linear_kernel(vec,X).flatten()
    vecs.append((q,df_paragraphs[rank.argsort()[-1]]))

In [60]:
vecs[2]

(['How can carbon emission of the processes of cement clinker be reduced?'],
 'Thresholds for cement Clinker A are applicable to plants that produce clinker only and do not produce finished cement All other plants need to meet the thresholds for cement or alternative binder A Cement clinker Specific emissions calculated according to the methodology used for EUETS benchmarks associated to the clinker production processes are lower than the value of the related EUETS benchmark As of February 2020 the EUETS benchmark value for cement clinker manufacturing is 0766 tCO2et of clinker198 B Cement Specific emissions associated to the clinker and cement production processes are lower than 0498 tCO2et of cement or alternative binder 199')

In [62]:
MODEL = "distilbert-base-uncased-distilled-squad"
TEST_SAMPLE_SIZE = 1000

In [64]:
from datasets import load_dataset
dataset = load_dataset('squad', split='train')

Found cached dataset squad (/home/b635040c-829a-4f40-8689-9aa2d4dd8b8f/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [65]:
import random
#import json

#with open("data/dev-v2.0.json") as f:
#    data = json.load(f)


In [68]:
def get_qustion_answers_context(data):
    # this function should provide tiples of quetion, answer and context from the data
    #return data['question'], data['answers'], data['context']
    ls = []
    for row in data:
        triple = row['question'], row['answers'], row['context']
        ls.append(triple)
    return ls

In [70]:
qac = random.sample(get_qustion_answers_context(dataset), TEST_SAMPLE_SIZE)

In [71]:
from transformers import pipeline
question_answerer = pipeline("question-answering", model=MODEL)

2023-10-30 18:26:12.665190: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
2023-10-30 18:26:12.920438: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
2023-10-30 18:26:12.952029: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
2023-10-30 18:26:16.127578: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
2023-10-30 18:26:16.250061: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.

All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestion

In [72]:
qac[0][1]

{'text': ['1934'], 'answer_start': [53]}

In [73]:
result = question_answerer(question=qac[0][0],     context=qac[0][2])

In [78]:
result['score']

0.9913541078567505

In [80]:
def get_em_scores(qac, qa_model):
    # This function should return a list of scores where it has gone through the triples of question, answer and contex and ran the model to see when it matches with the answer. 
    scores = []
    for q in qac:
        res = qa_model(question=q[0], context=q[2])
        scores.append(res['score'])
    return scores

In [None]:
get_em_scores(qac,question_answerer)

In [82]:

from transformers import pipeline
question_answerer = pipeline("question-answering", model=MODEL, tokenizer=MODEL, device=-1)


def get_answer_pipeline(question, context):
    result = question_answerer(auestion=question, context=context)
    return answer["answer"].rstrip(".").rstrip(",").lstrip("(").rstrip(")").rstrip(".").strip("'").strip(":")

All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.

All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.


In [85]:
scores = get_em_scores(qac, get_answer_pipeline)
print(sum(scores)/len(scores))

ValueError: Unknown arguments {'auestion': 'When did the first Archivist start at the National Archives?', 'context': 'The first Archivist, R.D.W. Connor, began serving in 1934, when the National Archives was established by Congress. As a result of a first Hoover Commission recommendation, in 1949 the National Archives was placed within the newly formed General Services Administration (GSA). The Archivist served as a subordinate official to the GSA Administrator until the National Archives and Records Administration became an independent agency on April 1, 1985.'}