### Getting text from multi-column pdf

In [None]:
!pip3 install sentence-transformers

In [1]:
!pip3 install nltk



In [None]:
!pip3 install pymupdf

In [12]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/akshat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/akshat/nltk_data...


True

In [4]:
import fitz
DIGITIZED_FILE = "Zanzibar.pdf"

In [5]:
from operator import itemgetter


def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles

In [6]:
def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag 
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [7]:
def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                if s['size'] == previous_s['size']:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[s['size']] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[s['size']] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = size_tag[s['size']] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    # block_string += "|"

                header_para.append(block_string)

    return header_para

In [8]:
doc = fitz.open(DIGITIZED_FILE)
font_style, styles = fonts(doc)
size_tag = font_tags(font_style, styles)
headers_para = headers_para(doc, size_tag)

In [9]:
headings = []
paragraphs = []
for text in headers_para:
    if text.startswith("<h"):
        headings.append(text[3:])
    elif text.startswith("<s"):
        pass
    else:
        if text.endswith("|"):
            paragraphs.append(text[3:-1])
        else:
            paragraphs.append(text[3:])

In [20]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

 
class LemmatizerHelper(object):
    """
    Class to aid the lemmatization process - from word to stemmed form,
    and vice versa.
    The 'original' form of a lemmatized word will be returned as the
    form in which its been used the most number of times in the text.
    """
 
    #This reverse lookup will remember the original forms of the lemmatized
    #words
    word_lookup = {}
 
    @classmethod
    def lemmatize(cls, sentence):
        """
        Lemmatize a sentence and updates the reverse lookup.
        """
 
        #Lemmatize the word
        lemmatized = [wnl.lemmatize(word) for word in sentence.split()]
        lemmatized = " ".join(lemmatized)
 
        return lemmatized
 
    @classmethod
    def original_form(cls, word):
        """
        Returns original form of a word given the lemmatized version,
        as stored in the word lookup.
        """
 
        if word in cls.word_lookup:
            return max(cls.word_lookup[word].keys(),
                       key=lambda x: cls.word_lookup[word][x])
        else:
            return word

In [22]:
lemmatized_paras = []
for i in range(len(paragraphs)):
    if paragraphs[i] != "":
        lemmatized_paras.append(LemmatizerHelper.lemmatize(paragraphs[i]))
print(lemmatized_paras)

['Determining whether online user are authorized to access digital object is central to preserving privacy. This pa- per present the design, implementation, and deployment of Zanzibar, a global system for storing and evaluating ac- ce control lists. Zanzibar provides a uniform data model and configuration language for expressing a wide range of access control policy from hundred of client service at Google, including Calendar, Cloud, Drive, Maps, Photos, and YouTube. Its authorization decision respect causal or- dering of user action and thus provide external consistency amid change to access control list and object contents. Zanzibar scale to trillion of access control list and million of authorization request per second to support service used by billion of people. It ha maintained 95th-percentile la- tency of le than 10 millisecond and availability of greater than 99.999% over 3 year of production use.', 'Many online interaction require authorization check to confirm that a user ha 

In [None]:
# from gensim.parsing.preprocessing import remove_stopwords
# word_list = []
# for paragraph in stemmed_paras:
#     if paragraph != '':
#         filtered_sentence = remove_stopwords(paragraph)
#         temp = filtered_sentence.split(" ")
#         word_list.append(temp)


In [None]:
# from gensim.models import Word2Vec
# min_count = 3
# size = 100
# window = 4
 
# model = Word2Vec(word_list, min_count=min_count, vector_size=size, window=window)

In [None]:
# key_terms = list(model.wv.index_to_key)
# key_terms

In [None]:
# new_key_terms = []
# for i in key_terms:
#     if len(i) >= 3:
#         new_key_terms.append(i)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd


class KWE:
    # text="""Determining whether online users are authorized to access digital objects is central to preserving privacy. This pa- per presents the design, implementation, and deployment of Zanzibar, a global system for storing and evaluating ac- cess control lists. Zanzibar provides a uniform data model and configuration language for expressing a wide range of access control policies from hundreds of client services at Google, including Calendar, Cloud, Drive, Maps, Photos, and YouTube. Its authorization decisions respect causal or- dering of user actions and thus provide external consistency amid changes to access control lists and object contents. Zanzibar scales to trillions of access control lists and millions of authorization requests per second to support services used by billions of people. It has maintained 95th-percentile la- tency of less than 10 milliseconds and availability of greater than 99.999% over 3 years of production use."""

    def __init__(self,t):
        self.text=t

    def keywordExtract(self):
        n_gram_range = (1, 1)
        stop_words = "english"
        count = CountVectorizer(ngram_range=n_gram_range,
                                stop_words=stop_words).fit(self.text)

        candidates = count.get_feature_names_out()

        model = SentenceTransformer('all-mpnet-base-v2')
        doc_embedding = model.encode(self.text)
        candidate_embeddings = model.encode(candidates)

        top_n = 50
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        keywords = [candidates[index] for index in distances.argsort()[0]][-top_n:]

        print(keywords)

        print(type(keywords))

        s = ",".join(keywords)

        print(s)
        return s
    # keywrdextract(data)
# if _name_ == "_main_":
c = KWE(lemmatized_paras)
c.keywordExtract()

Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 466kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 83.1kB/s]
Downloading: 100%|██████████| 10.6k/10.6k [00:00<00:00, 2.40MB/s]
Downloading: 100%|██████████| 571/571 [00:00<00:00, 762kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 52.2kB/s]
Downloading: 100%|██████████| 39.3k/39.3k [00:00<00:00, 8.16MB/s]
Downloading: 100%|██████████| 438M/438M [01:37<00:00, 4.51MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 20.3kB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 71.7kB/s]
Downloading: 100%|██████████| 466k/466k [00:01<00:00, 267kB/s] 
Downloading: 100%|██████████| 363/363 [00:00<00:00, 116kB/s]
Downloading: 100%|██████████| 13.1k/13.1k [00:00<00:00, 3.12MB/s]
Downloading: 100%|██████████| 232k/232k [00:01<00:00, 197kB/s]  
Downloading: 100%|██████████| 349/349 [00:00<00:00, 99.1kB/s]


['opengroup', 'terabytes', 'compute', 'policy', 'quota', 'usersets', 'caching', 'users', 'metadata', 'data', 'public', '401', 'hashing', 'cloud', 'authorized', 'hosting', 'permis', 'computing', 'google', 'internet', 'azure', 'safeguard', 'tor', 'protection', 'aws', 'acl', 'storage', 'cache', 'http', 'autho', 'decentral', 'aclservers', 'database', 'datastore', 'permission', 'https', 'permissions', 'access', 'api', 'databases', 'apis', 'acls', 'cacheability', 'authorize', 'acl_details', 'authentication', 'privacy', 'authorization', 'security', 'authorizations']
<class 'list'>
opengroup,terabytes,compute,policy,quota,usersets,caching,users,metadata,data,public,401,hashing,cloud,authorized,hosting,permis,computing,google,internet,azure,safeguard,tor,protection,aws,acl,storage,cache,http,autho,decentral,aclservers,database,datastore,permission,https,permissions,access,api,databases,apis,acls,cacheability,authorize,acl_details,authentication,privacy,authorization,security,authorizations


'opengroup,terabytes,compute,policy,quota,usersets,caching,users,metadata,data,public,401,hashing,cloud,authorized,hosting,permis,computing,google,internet,azure,safeguard,tor,protection,aws,acl,storage,cache,http,autho,decentral,aclservers,database,datastore,permission,https,permissions,access,api,databases,apis,acls,cacheability,authorize,acl_details,authentication,privacy,authorization,security,authorizations'