In [2]:
# State the goal of this notebook once and echo it back for the reader.
objective = (
    'Process the EU sustainable finance taxonomy PDF file and extract and clean all the paragraphs in the document'
)
print(objective)

Process the EU sustainable finance taxonomy PDF file and extract and clean all the paragraphs in the document


### Extracting Paragraphs from the EU Taxonomy Document

In [4]:
# Remote location of the PDF; this URL is what `requests.get` downloads later in the notebook.
pdf_path = r'https://lp-prod-resources.s3.us-west-2.amazonaws.com/214/200309-sustainable-finance-teg-final-report-taxonomy-annexes_en.pdf'

In [None]:
!pip install textract

In [6]:
import textract
import requests
import re
import pandas as pd

In [8]:
# Local file name the downloaded PDF is written to, and that textract reads back from.
filename = "taxonamy.pdf"

In [None]:
# Download the PDF and store it locally so it can be parsed from disk.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() aborts on 4xx/5xx responses instead of silently writing an
# HTTP error page to disk under a .pdf name.
response = requests.get(pdf_path, timeout=60)
response.raise_for_status()
with open(filename, "wb") as f:
    f.write(response.content)

In [10]:
# textract hands back the extracted document as bytes; decode it as UTF-8 and
# trim surrounding whitespace to obtain a single plain-text string.
extracted_bytes = textract.process(filename)
text = extracted_bytes.decode('utf-8').strip()

In [12]:
len(text)

1320993

In [14]:
text[0:200]

'Updated methodology & Updated Technical Screening Criteria\n- 1-\n\nMarch 2020\n\n\x0cAbout this report\nThis document includes an updated Part B: Methodology from the June 2019 report and an updated Part\nF: F'

In [16]:
text[200:400]

'ull list of technical screening criteria. The other original sections from the June 2019 report can be\nfound as labelled in the June 2019 report.\nPART A\n\nExplanation of the Taxonomy approach. This sec'

In [18]:
# A paragraph boundary is two newlines, optionally separated by other whitespace.
paragraph_break = re.compile(r"\s*?\n\s*?\n\s*?")
paragraphs = paragraph_break.split(text)

In [20]:
len(paragraphs)

8983

In [22]:
paragraphs[0:5]

['Updated methodology & Updated Technical Screening Criteria\n- 1-',
 'March 2020',
 '\x0cAbout this report\nThis document includes an updated Part B: Methodology from the June 2019 report and an updated Part\nF: Full list of technical screening criteria. The other original sections from the June 2019 report can be\nfound as labelled in the June 2019 report.\nPART A',
 'Explanation of the Taxonomy approach. This section sets out the role and importance of\nsustainable finance in Europe from a policy and investment perspective, the rationale for\nthe development of an EU Taxonomy, the daft regulation and the mandate of the TEG.',
 'PART B']

In [24]:
# Keep only paragraphs strictly longer than this many characters; anything
# shorter is dropped from the list.
MIN_PARAGRAPH_LENGTH = 150
paragraphs = list(filter(lambda p: len(p) > MIN_PARAGRAPH_LENGTH, paragraphs))

In [26]:
len(paragraphs)

1952

In [28]:
paragraphs[0:5]

['\x0cAbout this report\nThis document includes an updated Part B: Methodology from the June 2019 report and an updated Part\nF: Full list of technical screening criteria. The other original sections from the June 2019 report can be\nfound as labelled in the June 2019 report.\nPART A',
 'Explanation of the Taxonomy approach. This section sets out the role and importance of\nsustainable finance in Europe from a policy and investment perspective, the rationale for\nthe development of an EU Taxonomy, the daft regulation and the mandate of the TEG.',
 'Methodology. This explains the methodologies for developing technical screening\ncriteria for climate change mitigation objectives, adaptation objectives and ‘do no\nsignificant harm’ to other environmental objectives in the legislative proposal.\nThis has been updated since 2019.',
 'Next steps for the Taxonomy. This section elaborates on unresolved issues and\npotential ways forward for the Taxonomy and the technical work of the Platform o

In [30]:
def clean_paragraph(text):
    """Normalize one paragraph of extracted PDF text.

    Removes every character that is neither a word character nor whitespace
    (i.e. punctuation and symbols), then collapses every run of whitespace --
    newlines, tabs, form feeds (page breaks emitted by the PDF extractor) and
    repeated spaces -- into a single space and trims both ends.

    Args:
        text: Raw paragraph string.

    Returns:
        The cleaned paragraph as a single line with single-space separation.
    """
    # Strip punctuation first: removing it can leave two spaces side by side
    # (e.g. "a - b" -> "a  b"), which the whitespace pass below then repairs.
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    # str.split() with no argument splits on any whitespace run and discards
    # leading/trailing whitespace, so this also removes '\x0c' page breaks that
    # a plain .strip(" ") / .replace("\n", " ") would leave behind.
    return " ".join(without_punctuation.split())

In [32]:
# Wrap the filtered paragraphs in a one-column DataFrame for tabular inspection.
df_paragraphs = pd.DataFrame({'paragraph': paragraphs})

In [34]:
df_paragraphs.head()

Unnamed: 0,paragraph
0,About this report\nThis document includes an ...
1,Explanation of the Taxonomy approach. This sec...
2,Methodology. This explains the methodologies f...
3,Next steps for the Taxonomy. This section elab...
4,Full list of technical screening criteria. Thi...


In [36]:
df_paragraphs.shape

(1952, 1)

In [38]:
# Build the cleaned paragraphs as a Series directly from the filtered
# `paragraphs` list rather than from `df_paragraphs` itself. Deriving it via
# `df_paragraphs['paragraph']` rebinds the name from a DataFrame to a Series,
# so running the cell a second time fails with a KeyError. The result is the
# same object as before: a Series named 'paragraph' with a default integer index.
df_paragraphs = pd.Series(paragraphs, name='paragraph').apply(clean_paragraph)

In [40]:
df_paragraphs.head()

0    
About this report This document includes an u...
1    Explanation of the Taxonomy approach This sect...
2    Methodology This explains the methodologies fo...
3    Next steps for the Taxonomy This section elabo...
4    Full list of technical screening criteria This...
Name: paragraph, dtype: object

In [42]:
df_paragraphs.shape

(1952,)

### Question Paragraph Matching

#### Build a text vectorizer that finds the best-matching paragraph for each of the provided questions, then qualitatively evaluate the results

In [44]:
# Evaluation questions. Each question is wrapped in its own one-element list
# because it is passed as-is to `vectorizer.transform`, which expects an
# iterable of documents rather than a bare string.
# Spelling matters here: a misspelled word is not in the TF-IDF vocabulary and
# is silently dropped from the query vector, so "emission" and "fugitive" are
# spelled out correctly.
questions = [
    ["What fuel is used for manufacturing of chlorine?"],
    ["What metric is used for evaluating emission?"],
    ["How can carbon emission of the processes of cement clinker be reduced?"],
    ["How is the Weighted Cogeneration Threshold calculated?"],
    ["What is carbon capture and sequestration?"],
    ["What stages does CCS consist of?"],
    ["What should be the average energy consumption of a water supply system?"],
    ["What are examples of sludge treatments?"],
    ["How is the process of anaerobic digestion?"],
    ["How is reforestation defined?"],
    ["What is the threshold of emission for inland passenger water transport?"],
    ["What are the requirements of reporting for electricity generation from natural gas where there might be fugitive emissions?"]
]

In [46]:
questions[0]

['What fuel is used for manufacturing of chlorine?']

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
# Initialize a TF-IDF model with scikit-learn's TfidfVectorizer class and fit it
# on the paragraphs from the previous milestone.
# The fitted model provides a vector representation for each paragraph and each question.

In [52]:
# Fit TF-IDF on the cleaned paragraphs: X holds one row per paragraph and one
# column per vocabulary term.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_paragraphs)
tfidf_tokens = vectorizer.get_feature_names_out()
# Report the matrix shape and the vocabulary size, one per line.
print(X.shape, len(tfidf_tokens), sep="\n")

(1952, 6708)
6708


In [54]:
from sklearn.metrics.pairwise import linear_kernel

In [56]:
# For every question, score it against all paragraph vectors and keep the
# (question, best-scoring paragraph) pair.
vecs = []
for question in questions:
    question_vector = vectorizer.transform(question)
    similarities = linear_kernel(question_vector, X).flatten()
    # argsort is ascending, so its last entry is the index of the top score.
    best_index = similarities.argsort()[-1]
    vecs.append((question, df_paragraphs[best_index]))

In [58]:
vecs[2]

(['How can carbon emission of the processes of cement clinker be reduced?'],
 'Thresholds for cement Clinker A are applicable to plants that produce clinker only and do not produce finished cement All other plants need to meet the thresholds for cement or alternative binder A Cement clinker Specific emissions calculated according to the methodology used for EUETS benchmarks associated to the clinker production processes are lower than the value of the related EUETS benchmark As of February 2020 the EUETS benchmark value for cement clinker manufacturing is 0766 tCO2et of clinker198 B Cement Specific emissions associated to the clinker and cement production processes are lower than 0498 tCO2et of cement or alternative binder 199')