In [186]:
objective = 'Process the EU sustainable finance taxonomy PDF file and extract and clean all the paragraphs in the document'
print(objective)

Process the EU sustainable finance taxonomy PDF file and extract and clean all the paragraphs in the document


### Extracting Paragraphs from the EU Taxonomy Document

In [2]:
pdf_path = r'https://lp-prod-resources.s3.us-west-2.amazonaws.com/214/200309-sustainable-finance-teg-final-report-taxonomy-annexes_en.pdf'

In [None]:
!pip install textract

In [4]:
import textract
import requests
import re
import pandas as pd

In [5]:
filename = "taxonamy.pdf"

In [None]:
response = requests.get(pdf_path)
#filepath = os.path.join("PDFs", filename)
with open(filename, "wb") as f:
    f.write(response.content)

In [8]:
text = textract.process(filename).decode('utf-8').strip()

In [9]:
len(text)

1320993

In [10]:
text[0:200]

'Updated methodology & Updated Technical Screening Criteria\n- 1-\n\nMarch 2020\n\n\x0cAbout this report\nThis document includes an updated Part B: Methodology from the June 2019 report and an updated Part\nF: F'

In [11]:
text[200:400]

'ull list of technical screening criteria. The other original sections from the June 2019 report can be\nfound as labelled in the June 2019 report.\nPART A\n\nExplanation of the Taxonomy approach. This sec'

In [12]:
paragraphs = re.split(r'\n', text)

In [13]:
len(paragraphs)

34643

In [14]:
paragraphs[0:5]

['Updated methodology & Updated Technical Screening Criteria',
 '- 1-',
 '',
 'March 2020',
 '']

In [15]:
stripped = [x for x in paragraphs if x != '']

In [16]:
len(stripped)

25661

In [17]:
df_paragraphs = pd.DataFrame(stripped, columns=['paragraph'])

In [18]:
df_paragraphs.head()

Unnamed: 0,paragraph
0,Updated methodology & Updated Technical Screen...
1,- 1-
2,March 2020
3,About this report
4,This document includes an updated Part B: Meth...


In [19]:
df_paragraphs.shape

(25661, 1)

In [20]:
df_paragraphs['len'] = df_paragraphs['paragraph'].apply(lambda x: len(x)) 

In [21]:
df_paragraphs['len'].describe()

count    25661.000000
mean        50.128639
std         38.260038
min          1.000000
25%         11.000000
50%         54.000000
75%         80.000000
max        307.000000
Name: len, dtype: float64

In [22]:
df_paragraphs[ df_paragraphs['len'] == 1].shape

(2526, 2)

In [23]:
df_paragraphs[ df_paragraphs['len'] == 1].head(10)

Unnamed: 0,paragraph,len
467,2,1
669,•,1
671,•,1
894,,1
896,,1
898,,1
930,,1
937,,1
943,,1
984,,1


In [24]:
df_paragraphs[ df_paragraphs['len'] == 6].head()

Unnamed: 0,paragraph,len
7,PART A,6
11,PART B,6
16,PART C,6
19,PART D,6
22,PART E,6


In [25]:
df_paragraphs[ df_paragraphs['len'] <= 15 ]

Unnamed: 0,paragraph,len
1,- 1-,4
2,March 2020,10
7,PART A,6
11,PART B,6
16,PART C,6
...,...,...
25650,NACE CODE,9
25654,Wind-related,12
25657,Heat Stress,11
25659,Wildfire,8


In [26]:
df_paragraphs = df_paragraphs[ df_paragraphs['len'] >= 15 ]

In [27]:
df_paragraphs

Unnamed: 0,paragraph,len
0,Updated methodology & Updated Technical Screen...,58
3,About this report,18
4,This document includes an updated Part B: Meth...,99
5,F: Full list of technical screening criteria. ...,106
6,found as labelled in the June 2019 report.,42
...,...,...
25652,"Sewerage, Waste Management",26
25653,and Remediation Activities,26
25655,Changing Wind Patterns,22
25656,Temperature-related,19


In [28]:
df_paragraphs['len'].describe()

count    17986.000000
mean        69.105805
std         29.594580
min         15.000000
25%         49.000000
50%         72.000000
75%         92.000000
max        307.000000
Name: len, dtype: float64

In [29]:
df_paragraphs.shape

(17986, 2)

### Question Paragraph Matching

#### Build a text vectorizer that finds the best matching paragraph for the provided set of questions and qualitatively evaluates the results

In [137]:
questions = [
'What fuel is used for the manufacturing of chlorine?',
'What metric is used for evaluating emission?',
'How can carbon emission of the processes of cement clinker be reduced?',
'How is the Weighted Cogeneration Threshold calculated?',
'What are carbon capture and sequestration?',
'What stages does CCS consist of?',
'What should be the average energy consumption of a water supply system?',
'What are sludge treatments? -What is the process of anaerobic digestion?',
'How is reforestation defined?',
'What is the threshold of emission for inland passenger water transport?',
'What are the requirements of reporting for electricity generation from natural gas where there might be fugitive emissions?',
]

In [139]:
questions[0]

'What fuel is used for the manufacturing of chlorine?'

In [33]:
# text preprocessing
# Tokenization
# Lower casing
# Stop words removal
# Stemming
# Lemmatization

In [34]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
nltk.download('stopwords', quiet=True)
stopwords = nltk.corpus.stopwords.words('english')

In [58]:
stopwords[:5]

['i', 'me', 'my', 'myself', 'we']

In [61]:
def tokenization(text):
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text.lower())
    return tokens

In [62]:
def remove_punctuation(tokens):
    words=[word.lower() for word in tokens if word.isalpha()]
    return words

In [63]:
def remove_stopwords(tokens):
    tokens = [x for x in tokens if x not in stopwords]
    return tokens

In [83]:
def tfidif_vectorize(corpus):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    return X

In [65]:
#sentence = "The faster Harry got to the store, the faster Harry, the faster, would get home."
sentence = df_paragraphs['paragraph'].iloc[2]
tokens = tokenization(sentence)
print(tokens)
tokens_ohne_stopwords = remove_stopwords(tokens)
print(tokens_ohne_stopwords)

['this', 'document', 'includes', 'an', 'updated', 'part', 'b', ':', 'methodology', 'from', 'the', 'june', '2019', 'report', 'and', 'an', 'updated', 'part']
['document', 'includes', 'updated', 'part', 'b', ':', 'methodology', 'june', '2019', 'report', 'updated', 'part']


In [66]:
#Initiate a TF-IDF model trained on the paragraphs from the previous milestone by using the TfidfVectorizer class 
# from the scikit-learn library. 
#This model will provide a representation for each paragraph or each question.

In [67]:
df_paragraphs['tokens'] = df_paragraphs['paragraph'].apply(lambda x: tokenization(x))
df_paragraphs['tokens_ohne_punct'] = df_paragraphs['tokens'].apply(lambda x: remove_punctuation(x))
df_paragraphs['tokens_ohne_stopwords'] = df_paragraphs['tokens_ohne_punct'].apply(lambda x: remove_stopwords(x))

In [68]:
df_paragraphs.head(5)

Unnamed: 0,paragraph,len,tokens,tokens_ohne_punct,tokens_ohne_stopwords
0,Updated methodology & Updated Technical Screen...,58,"[updated, methodology, &, updated, technical, ...","[updated, methodology, updated, technical, scr...","[updated, methodology, updated, technical, scr..."
3,About this report,18,"[about, this, report]","[about, this, report]",[report]
4,This document includes an updated Part B: Meth...,99,"[this, document, includes, an, updated, part, ...","[this, document, includes, an, updated, part, ...","[document, includes, updated, part, b, methodo..."
5,F: Full list of technical screening criteria. ...,106,"[f, :, full, list, of, technical, screening, c...","[f, full, list, of, technical, screening, the,...","[f, full, list, technical, screening, original..."
6,found as labelled in the June 2019 report.,42,"[found, as, labelled, in, the, june, 2019, rep...","[found, as, labelled, in, the, june, report]","[found, labelled, june, report]"


In [70]:
#tfidf_tokens = vectorizer.get_feature_names_out()

In [71]:
df_paragraphs['token_count'] = df_paragraphs['tokens_ohne_stopwords'].apply(lambda x: len(x))

In [72]:
df_paragraphs = df_paragraphs[ df_paragraphs['token_count'] > 0]

In [109]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_paragraphs['paragraph'])
tfidf_tokens = vectorizer.get_feature_names_out()
#print(X)
#print(vectorizer.get_feature_names_out())
print(X.shape)
print(len(tfidf_tokens))

(17910, 6650)
6650


In [111]:
result = pd.DataFrame(
    data=X.toarray(), 
    columns=tfidf_tokens
)

print(result)

        00  000  0001  0003  0005  001  0013  001_099  0032   01  ...   yr  \
0      0.0  0.0   0.0   0.0   0.0  0.0   0.0      0.0   0.0  0.0  ...  0.0   
1      0.0  0.0   0.0   0.0   0.0  0.0   0.0      0.0   0.0  0.0  ...  0.0   
2      0.0  0.0   0.0   0.0   0.0  0.0   0.0      0.0   0.0  0.0  ...  0.0   
3      0.0  0.0   0.0   0.0   0.0  0.0   0.0      0.0   0.0  0.0  ...  0.0   
4      0.0  0.0   0.0   0.0   0.0  0.0   0.0      0.0   0.0  0.0  ...  0.0   
...    ...  ...   ...   ...   ...  ...   ...      ...   ...  ...  ...  ...   
17905  0.0  0.0   0.0   0.0   0.0  0.0   0.0      0.0   0.0  0.0  ...  0.0   
17906  0.0  0.0   0.0   0.0   0.0  0.0   0.0      0.0   0.0  0.0  ...  0.0   
17907  0.0  0.0   0.0   0.0   0.0  0.0   0.0      0.0   0.0  0.0  ...  0.0   
17908  0.0  0.0   0.0   0.0   0.0  0.0   0.0      0.0   0.0  0.0  ...  0.0   
17909  0.0  0.0   0.0   0.0   0.0  0.0   0.0      0.0   0.0  0.0  ...  0.0   

       zero  zerocarbon  zinc  zone  zoned  zones  zoning  zuri

In [163]:
#vectoreize questions
vectorizer_q = TfidfVectorizer()
X_q = vectorizer_q.fit_transform(questions)
tfidf_tokens_q = vectorizer_q.get_feature_names_out()
#print(X)
#print(vectorizer.get_feature_names_out())
print(X_q.shape)
print(len(tfidf_tokens_q))

(11, 61)
61


In [175]:
print(X[0])

  (0, 2034)	0.201342254386469
  (0, 5490)	0.2682293826770722
  (0, 6008)	0.32033041053257916
  (0, 4105)	0.3504068840299295
  (0, 6345)	0.8137061745984947


In [165]:
print(X_q[0])

  (0, 11)	0.4382994892503739
  (0, 37)	0.22065462509682585
  (0, 33)	0.4382994892503739
  (0, 51)	0.22065462509682585
  (0, 24)	0.2944440589341146
  (0, 56)	0.374642447248186
  (0, 32)	0.24161873100613704
  (0, 26)	0.4382994892503739
  (0, 59)	0.20216297316922405


In [167]:
from sklearn.metrics.pairwise import linear_kernel

In [173]:
linear_kernel(X,X_q)

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 6650 while Y.shape[1] == 61