#### Extracting PDF files in 2 different ways:

In [13]:
import PyPDF2 as pdf

In [69]:
# Method 1:

pdf1 = open(r"D:\Jacobs\Semester 2\Advanced Project 1\03_BERT for ESG Ontology Development\03_ESG Reports_PDFDocuments\Manufacture\Henkel_2017-sustainability-report-data-unlocked.pdf", 'rb')

pdfReader = pdf.PdfFileReader(pdf1)

for i in range(pdfReader.getNumPages()): 
    page = pdfReader.getPage(i)
    print('Page No - ' + str(1+pdfReader.getPageNumber(page)))
    pagehandle = pdfReader.getPage(i)
    page_content = pagehandle.extractText()
    print(page_content)

Page No - 1
Sustainability Report 
2017
Page No - 2
About this report
Traditional navigation 
Read the Henkel Sustain
 ability  Report 
traditionally, like a book. Start at page ˜ 
and navigate  through until the end.Selective navigation 
Use the navigation bar to call up  specific chapters of interest. Within 
these, internal links have been pro-

vided to enable you to quickly switch 
to related content, while external 

links will take you to websites 
 offering complementary information.
Back and forward
Internal link(within this 
document)External link (outside of this 
document)Other features
Search
ContactMore information in  
the  annex  DownloadFor the best possible performance of the link function, download 

the PDF and open it in Adobe Acrobat.
Separate non-financial group report
The ˚˛˜˝ Sustainability Report is also the separate, combined 

non-financial group report for the Henkel Group and Henkel˙AG˙& 

Co. KGaA for fiscal year ˚˛˜˝ within the meaning of 
 Sections ˆ˜ˇb

#### Candidate Keywords/Keyphrases

Starting by creating a list of candidate keywords or keyphrases from a document. Although many focus on noun phrases, I'm going to keep it simple by using Scikit-Learns CountVectorizer. This allows us to specify the length of the keywords and make them into keyphrases. It also is a nice method for quickly removing stop words.

In [70]:
from sklearn.feature_extraction.text import CountVectorizer

In [75]:
n_gram_range = (1,3) # the size of the resulting candidates
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([page_content])
candidates = count.get_feature_names()

In [76]:
candidates

['02',
 '02 18',
 '02 18 0except',
 '0except',
 '0except noted',
 '0except noted marks',
 '18',
 '18 0except',
 '18 0except noted',
 '1992',
 '1992 available',
 '1992 available downloads',
 '2018',
 '2018 henkel',
 '2018 henkel ag',
 '2018 pr',
 '2018 pr n0',
 '211',
 '211 797',
 '211 797 3533',
 '211 797 3680',
 '211 798',
 '211 798 4040',
 '211 798 9393',
 '22',
 '22 2018',
 '22 2018 pr',
 '3533',
 '3533 fax',
 '3533 fax 49',
 '3680',
 '3680 fax',
 '3680 fax 49',
 '40191',
 '40191 düsseldorf',
 '40191 düsseldorf germany',
 '4040',
 '4040 mail',
 '4040 mail corporate',
 '49',
 '49 211',
 '49 211 797',
 '49 211 798',
 '797',
 '797 3533',
 '797 3533 fax',
 '797 3680',
 '797 3680 fax',
 '798',
 '798 4040',
 '798 4040 mail',
 '798 9393',
 '798 9393 mail',
 '9393',
 '9393 mail',
 '9393 mail sustainability',
 'access',
 'access environment',
 'access environment reports',
 'accurate',
 'accurate future',
 'accurate future performance',
 'accurately',
 'accurately estimated',
 'accurately es

#### Embeddings

Converting both the document as well as the candidate keywords/keyphrases to numerical data. BERT for this purpose has shown great results for both similarity- and paraphrasing tasks.

I used the sentence-transformers package as it allows us to quickly create high-quality embeddings that work quite well for sentence- and document-level embeddings.

In [77]:
from sentence_transformers import SentenceTransformer

In [78]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([pagehandle.extractText()])
candidate_embeddings = model.encode(candidates)

#### Cosine Similarity

In the final step, I want to find the candidates that are most similar to the document. I assume that the most similar candidates to the document are good keywords/keyphrases for representing the document.

To calculate the similarity between candidates and the document, I will be using the cosine similarity between vectors as it performs quite well in high-dimensionality:

In [79]:
from sklearn.metrics.pairwise import cosine_similarity

In [80]:
top_n = 10 # number of keywords
distances = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

In [81]:
keywords

['social media www',
 'facebook com henkel',
 'available ios android',
 'instagram com henkel',
 'ios android henkel',
 'sustainability management phone',
 'available downloads annual',
 'downloads annual reports',
 'henkel_2 www instagram',
 'sustainability publications internet']

#### Diversification

In [82]:
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    # Calculate distances and extract keywords
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    distances_candidates = cosine_similarity(candidate_embeddings, 
                                            candidate_embeddings)

    # Get top_n words as candidates based on cosine similarity
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [candidates[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Calculate the combination of words that are the least similar to each other
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum([distances_candidates[i][j] for i in combination for j in combination if i != j])
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [85]:
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n= 10, nr_candidates= 20)

['henkel www youtube',
 'publications website offers',
 'media www facebook',
 'available ios android',
 'ios android henkel',
 'sustainability management phone',
 'available downloads annual',
 'downloads annual reports',
 'henkel_2 www instagram',
 'sustainability publications internet']

In [53]:
# Method 2: 
# This method creates a txt file where the original PDF file is.

import pdfbox

In [56]:
p = pdfbox.PDFBox()
p.extract_text("C:/Users/a_erd/Desktop/03_BERT for ESG Ontology Development/03_ESG Reports_PDFDocuments/Manufacture/Henkel_2017-sustainability-report-data.pdf")

In [4]:
# Finding the number of words.

file = open("C:/Users/a_erd/Desktop/03_BERT for ESG Ontology Development/03_ESG Reports_PDFDocuments/Manufacture/Henkel_2017-sustainability-report-data.txt", "rt", encoding='utf-8')
data = file.read()
words = data.split()

print('Number of words in text file :', len(words))

Number of words in text file : 71550


In [5]:
# Installing PyTorch

In [6]:
#pip install torch torchvision

In [7]:
# Cleaning the data by making sentences into lower case and creating vocabulary

In [8]:
import re

In [9]:
sentences = re.sub("[.,!?\\-]", '', data.lower()).split('\n')  # filter '.', ',', '?', '!'
word_list = list(set(" ".join(sentences).split()))

In [10]:
word_list

['chapters',
 'barrier',
 'scientific',
 'sturdy',
 'rates',
 'financial',
 'absolutely',
 'hardwood',
 'collaborations',
 'treating',
 'nature',
 'opment',
 'st',
 'sizes',
 'happy',
 'port',
 'patients',
 'plantations',
 '5390economic',
 'offer:',
 'ironing',
 'prime',
 'ewc',
 '“acting',
 'aircraft',
 'zengerling',
 'download',
 'trustees',
 '48)',
 '“traffic',
 '10073environmental',
 'beginning',
 'demonstrates',
 'slovakia',
 '7642',
 'tics',
 'shaping',
 'market\xad',
 'solely',
 'third\xadparty',
 'spots)',
 'landfilled:',
 'shampoo',
 'easy',
 '“demography',
 'intermodal',
 'mentor',
 'smart',
 'eating',
 'pallets',
 'call',
 'status',
 'sharing',
 'panels',
 'associ\xad',
 'historically',
 'online',
 'company',
 'costs',
 'questionnaire',
 'considerable',
 'kids”',
 'dermatological',
 'opening',
 'elearning',
 'g4hr10',
 'onboarding',
 'paid',
 'incentives',
 'encountering',
 '(euro)',
 'scrub',
 'colleague',
 '193',
 'g4hr3',
 'thousand',
 'furthermore',
 'via',
 'wireless',
