### Getting text from a multi-column PDF

In [1]:
!pip3 install pymupdf



In [2]:
import fitz
DIGITIZED_FILE = "Zanzibar.pdf"

In [3]:
from operator import itemgetter


def fonts(doc, granularity=False):
    """Tallies how often each font style occurs in the text spans of a PDF.
    :param doc: PDF document whose pages are iterated
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' (not only the
        size) to tell styles apart
    :type granularity: bool
    :rtype: [(identifier, count), (identifier, count)], dict
    :return: style identifiers sorted from most to least used, and a mapping
        from each identifier to the attributes of the last span seen with it
    :raises ValueError: if no text span is found in the document
    """
    styles = {}
    usage = {}

    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type-0 blocks contain text
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if granularity:
                        identifier = "{0}_{1}_{2}_{3}".format(
                            span['size'], span['flags'], span['font'], span['color'])
                        style = {
                            'size': span['size'],
                            'flags': span['flags'],
                            'font': span['font'],
                            'color': span['color'],
                        }
                    else:
                        identifier = "{0}".format(span['size'])
                        style = {'size': span['size'], 'font': span['font']}

                    # a later span with the same identifier overwrites the style entry
                    styles[identifier] = style
                    usage[identifier] = usage.get(identifier, 0) + 1

    ranked = sorted(usage.items(), key=itemgetter(1), reverse=True)

    if not ranked:
        raise ValueError("Zero discriminating fonts found!")

    return ranked, styles

In [4]:
def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    The size of the most used style is tagged '<p>'. Larger sizes are tagged
    '<h1>', '<h2>', ... starting from the largest; smaller sizes are tagged
    '<s1>', '<s2>', ... starting from the largest of the smaller ones.

    :param font_counts: (identifier, count) for all fonts occurring in the
        document, most used first
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    :raises IndexError: if font_counts is empty
    """
    p_style = styles[font_counts[0][0]]  # style of the most used font (paragraph)
    p_size = p_style['size']

    # Collect the distinct sizes. The size is taken from the recorded style when
    # available, because granular identifiers ("size_flags_font_color") cannot
    # be parsed with float(); a set keeps several styles that share one size
    # from consuming more than one heading/sub index.
    sizes = set()
    for identifier, _count in font_counts:
        style = styles.get(identifier)
        sizes.add(float(style['size'] if style is not None else identifier))

    # walk the sizes high to low so the right integer is appended to each tag
    idx = 0
    size_tag = {}
    for size in sorted(sizes, reverse=True):
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sizes below the paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [5]:
def headers_para(doc, size_tag):
    """Collects the text of a PDF as strings prefixed with their element tag.

    Consecutive spans of the same size inside one block are joined with a
    space; a change of size, or the end of a text block, flushes the text
    gathered so far (which may be an empty string) to the result.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with prepended element tags
    """
    collected = []  # headers and paragraphs, in reading order
    last_span = None  # most recent non-blank span; None until one is seen

    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block['type'] != 0:  # skip blocks that do not contain text
                continue

            # several fonts and sizes may occur inside a single block
            buffer = ""
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span['text']
                    if not text.strip():  # ignore whitespace-only spans
                        continue

                    if last_span is None:
                        # very first span of the document
                        buffer = size_tag[span['size']] + text
                    elif span['size'] != last_span['size']:
                        # size changed: flush what we have and start afresh
                        collected.append(buffer)
                        buffer = size_tag[span['size']] + text
                    else:
                        if buffer and set(buffer) == {"|"}:
                            # buffer consists of pipes only
                            buffer = size_tag[span['size']] + text
                        if buffer:
                            # still in the same block: keep concatenating
                            buffer = buffer + " " + text
                        else:
                            # a new block has started: prepend the size tag
                            buffer = size_tag[span['size']] + text

                    last_span = span

            collected.append(buffer)

    return collected

In [6]:
# Open the PDF, gather its font usage, map font sizes to tags and extract the
# tagged text.
doc = fitz.open(DIGITIZED_FILE)
font_style, styles = fonts(doc)
size_tag = font_tags(font_style, styles)
# NOTE(review): this rebinds the name `headers_para` from the function to the
# list it returns, so running this cell a second time fails (a list is not
# callable) until the cell defining the function is run again.
headers_para = headers_para(doc, size_tag)

In [7]:
# Split the tagged texts into headings and paragraphs; sub-paragraph text
# ("<s...>") is discarded.
headings = []
paragraphs = []
for text in headers_para:
    if text.startswith("<h"):
        # Heading tags are "<h1>", "<h2>", ... (four or more characters), so cut
        # after the closing ">" instead of assuming a three-character tag,
        # which left a stray ">" at the start of every heading.
        headings.append(text.partition(">")[2])
    elif text.startswith("<s"):
        continue
    else:
        # "<p>" is exactly three characters; empty entries are kept on purpose
        # so that paragraph indices stay stable for the cells below.
        if text.endswith("|"):
            paragraphs.append(text[3:-1])
        else:
            paragraphs.append(text[3:])

In [8]:
print(paragraphs)

['', '', '', '', '', '', 'Determining whether online users are authorized to access digital objects is central to preserving privacy. This pa- per presents the design, implementation, and deployment of Zanzibar, a global system for storing and evaluating ac- cess control lists. Zanzibar provides a uniform data model and configuration language for expressing a wide range of access control policies from hundreds of client services at Google, including Calendar, Cloud, Drive, Maps, Photos, and YouTube. Its authorization decisions respect causal or- dering of user actions and thus provide external consistency amid changes to access control lists and object contents. Zanzibar scales to trillions of access control lists and millions of authorization requests per second to support services used by billions of people. It has maintained 95th-percentile la- tency of less than 10 milliseconds and availability of greater than 99.999% over 3 years of production use.', '', '', 'Many online interacti

In [9]:
paragraphs[6]

'Determining whether online users are authorized to access digital objects is central to preserving privacy. This pa- per presents the design, implementation, and deployment of Zanzibar, a global system for storing and evaluating ac- cess control lists. Zanzibar provides a uniform data model and configuration language for expressing a wide range of access control policies from hundreds of client services at Google, including Calendar, Cloud, Drive, Maps, Photos, and YouTube. Its authorization decisions respect causal or- dering of user actions and thus provide external consistency amid changes to access control lists and object contents. Zanzibar scales to trillions of access control lists and millions of authorization requests per second to support services used by billions of people. It has maintained 95th-percentile la- tency of less than 10 milliseconds and availability of greater than 99.999% over 3 years of production use.'

In [10]:
from gensim.parsing import PorterStemmer
global_stemmer = PorterStemmer()
 
class StemmingHelper(object):
    """
    Helper for moving between words and their stems in both directions.
    Every call to stem() records which surface form produced the stem, so
    that original_form() can later answer with the surface form that has
    been seen most often for that stem.
    """

    # Reverse lookup: stem -> {surface form: number of times it was stemmed}
    word_lookup = {}

    @classmethod
    def stem(cls, word):
        """
        Returns the stem of a word and records the word in the reverse lookup.
        """
        stemmed = global_stemmer.stem(word)

        # Count this surface form under its stem
        forms = cls.word_lookup.setdefault(stemmed, {})
        forms[word] = forms.get(word, 0) + 1

        return stemmed

    @classmethod
    def original_form(cls, word):
        """
        Returns the most frequently recorded surface form for a stem, or the
        stem itself when nothing has been recorded for it.
        """
        if word not in cls.word_lookup:
            return word

        forms = cls.word_lookup[word]
        return max(forms, key=forms.get)

In [11]:

# English stop words, kept as a plain list of lower-case strings.
stopwords = (
    "i me my myself we our ours ourselves you your yours yourself yourselves "
    "he him his himself she her hers herself it its itself "
    "they them their theirs themselves what which who whom "
    "this that these those am is are was were be been being "
    "have has had having do does did doing a an the and but if or "
    "because as until while of at by for with about against between "
    "into through during before after above below to from up down "
    "in out on off over under again further then once here there "
    "when where why how all any both each few more most other some such "
    "no nor not only own same so than too very can will just don should now"
).split()


In [12]:
# NOTE(review): the whole paragraph is passed to stem() as a single "word"; the
# output recorded below is just the lower-cased paragraph, so no per-word
# stemming takes place, and the paragraph itself is stored in word_lookup.
# The hard-coded index 6 depends on how many empty entries precede the abstract.
StemmingHelper.stem(paragraphs[6])

'determining whether online users are authorized to access digital objects is central to preserving privacy. this pa- per presents the design, implementation, and deployment of zanzibar, a global system for storing and evaluating ac- cess control lists. zanzibar provides a uniform data model and configuration language for expressing a wide range of access control policies from hundreds of client services at google, including calendar, cloud, drive, maps, photos, and youtube. its authorization decisions respect causal or- dering of user actions and thus provide external consistency amid changes to access control lists and object contents. zanzibar scales to trillions of access control lists and millions of authorization requests per second to support services used by billions of people. it has maintained 95th-percentile la- tency of less than 10 milliseconds and availability of greater than 99.999% over 3 years of production use.'

In [13]:
from gensim.parsing.preprocessing import remove_stopwords

# One token list per non-empty paragraph: strip stop words, then split on
# single spaces.
word_list = [
    remove_stopwords(paragraph).split(" ")
    for paragraph in paragraphs
    if paragraph != ''
]

print(word_list)

[['Determining', 'online', 'users', 'authorized', 'access', 'digital', 'objects', 'central', 'preserving', 'privacy.', 'This', 'pa-', 'presents', 'design,', 'implementation,', 'deployment', 'Zanzibar,', 'global', 'storing', 'evaluating', 'ac-', 'cess', 'control', 'lists.', 'Zanzibar', 'provides', 'uniform', 'data', 'model', 'configuration', 'language', 'expressing', 'wide', 'range', 'access', 'control', 'policies', 'hundreds', 'client', 'services', 'Google,', 'including', 'Calendar,', 'Cloud,', 'Drive,', 'Maps,', 'Photos,', 'YouTube.', 'Its', 'authorization', 'decisions', 'respect', 'causal', 'or-', 'dering', 'user', 'actions', 'provide', 'external', 'consistency', 'amid', 'changes', 'access', 'control', 'lists', 'object', 'contents.', 'Zanzibar', 'scales', 'trillions', 'access', 'control', 'lists', 'millions', 'authorization', 'requests', 'second', 'support', 'services', 'billions', 'people.', 'It', 'maintained', '95th-percentile', 'la-', 'tency', '10', 'milliseconds', 'availability',

In [None]:
# word_dict= {}
# for paragraph in paragraphs:
#     if paragraph != '':
#         for i in paragraph:
#             print(i)
#             # word_dict[StemmingHelper.stem(i)] = StemmingHelper.original_form(StemmingHelper.stem(i))

# print(word_dict)

In [14]:
from gensim.models import Word2Vec
# Settings forwarded to Word2Vec below.
min_count = 3  # passed as min_count; presumably the minimum frequency a token needs to be kept - confirm in the gensim docs
size = 100  # passed as vector_size
window = 4  # passed as window
 
# Train on the per-paragraph token lists collected in word_list.
model = Word2Vec(word_list, min_count=min_count, vector_size=size, window=window)

In [15]:
# Copy the model's vocabulary into a plain list so it can be filtered below.
key_terms = list(model.wv.index_to_key)
key_terms

[',',
 'Zanzibar',
 '.',
 'ACL',
 'relation',
 'object',
 'checks',
 'The',
 'data',
 'check',
 'clients',
 '⟨',
 '⟩',
 '(',
 ')',
 'ACLs',
 'read',
 'requests',
 'tuples',
 'In',
 'tuple',
 'user',
 'access',
 'A',
 'timestamp',
 'We',
 'reads',
 'client',
 'control',
 'T',
 'snapshot',
 'userset',
 'zookie',
 '2',
 'distributed',
 'To',
 'hot',
 'It',
 'consistency',
 'server',
 'allows',
 'request',
 'content',
 'Zanzibar’s',
 'cache',
 'For',
 'latency',
 'This',
 're-',
 '0',
 'authorization',
 'set',
 'updates',
 'namespace',
 'group',
 'use',
 'index',
 'Spanner',
 'new',
 'write',
 'timestamps',
 'users',
 '#',
 'U',
 'number',
 'servers',
 '•',
 'Leopard',
 'database',
 'lock',
 'evaluated',
 '3',
 'staleness',
 'example,',
 'provide',
 'ID',
 'multiple',
 'update',
 'S',
 'Check',
 'M.,',
 'membership',
 'L',
 'global',
 'storage',
 'groups',
 'evaluate',
 'Each',
 'ordering',
 'ing',
 'Figure',
 'Expand',
 'given',
 'spots',
 'availability',
 'B',
 'external',
 'allow',
 'in

In [16]:
# Number of vocabulary entries before the short tokens are filtered out.
len(key_terms)

607

In [19]:
# Drop tokens shorter than three characters (punctuation, digits, single
# letters). A new list is built instead of calling remove() inside the loop:
# removing from a list while iterating over it makes the loop skip the element
# that follows each removal, which is why this cell used to need several runs.
key_terms = [term for term in key_terms if len(term) >= 3]

print(len(key_terms))

key_terms

546


['Zanzibar',
 'ACL',
 'relation',
 'object',
 'checks',
 'The',
 'data',
 'check',
 'clients',
 'ACLs',
 'read',
 'requests',
 'tuples',
 'tuple',
 'user',
 'access',
 'timestamp',
 'reads',
 'client',
 'control',
 'snapshot',
 'userset',
 'zookie',
 'distributed',
 'hot',
 'consistency',
 'server',
 'allows',
 'request',
 'content',
 'Zanzibar’s',
 'cache',
 'For',
 'latency',
 'This',
 're-',
 'authorization',
 'set',
 'updates',
 'namespace',
 'group',
 'use',
 'index',
 'Spanner',
 'new',
 'write',
 'timestamps',
 'users',
 'number',
 'servers',
 'Leopard',
 'database',
 'lock',
 'evaluated',
 'staleness',
 'example,',
 'provide',
 'multiple',
 'update',
 'Check',
 'M.,',
 'membership',
 'global',
 'storage',
 'groups',
 'evaluate',
 'Each',
 'ordering',
 'ing',
 'Figure',
 'Expand',
 'given',
 'spots',
 'availability',
 'external',
 'allow',
 'including',
 'single',
 'different',
 'evaluation',
 'Accessed:',
 '2019-04-16.',
 'key',
 'A.,',
 'support',
 'RPCs',
 'system.',
 'requir

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd


class KWE:
    """Keyword extractor that ranks the words of a text by how similar their
    sentence embedding is to the embedding of the whole text.
    """

    # Default text, used when no text is passed to the constructor.
    text="""Determining whether online users are authorized to access digital objects is central to preserving privacy. This pa- per presents the design, implementation, and deployment of Zanzibar, a global system for storing and evaluating ac- cess control lists. Zanzibar provides a uniform data model and configuration language for expressing a wide range of access control policies from hundreds of client services at Google, including Calendar, Cloud, Drive, Maps, Photos, and YouTube. Its authorization decisions respect causal or- dering of user actions and thus provide external consistency amid changes to access control lists and object contents. Zanzibar scales to trillions of access control lists and millions of authorization requests per second to support services used by billions of people. It has maintained 95th-percentile la- tency of less than 10 milliseconds and availability of greater than 99.999% over 3 years of production use."""

    def __init__(self, t=None):
        """
        :param t: text to extract keywords from; the class-level default text
            is used when omitted
        :type t: str or None
        """
        # The constructor used to be spelled `_init_` and assigned a local
        # variable, so a caller could never supply their own text.
        if t is not None:
            self.text = t

    def keywordExtract(self, top_n=10, n_gram_range=(1, 1)):
        """Prints and returns the keywords most similar to the text.

        :param top_n: number of keywords to keep
        :type top_n: int
        :param n_gram_range: (min_n, max_n) passed to CountVectorizer as ngram_range
        :type n_gram_range: tuple
        :rtype: str
        :return: comma-separated keywords, the most similar one last
        :raises ValueError: if top_n is smaller than 1
        """
        if top_n < 1:
            # a slice of [-0:] would silently return every candidate
            raise ValueError("top_n must be at least 1")

        stop_words = "english"
        count = CountVectorizer(ngram_range=n_gram_range,
                                stop_words=stop_words).fit([self.text])

        candidates = count.get_feature_names_out()

        model = SentenceTransformer('distilbert-base-nli-mean-tokens')
        doc_embedding = model.encode([self.text])
        candidate_embeddings = model.encode(candidates)

        # argsort is ascending, so the last top_n indices are the best matches
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        ranked = distances.argsort()[0]
        keywords = [candidates[index] for index in ranked[-top_n:]]

        print(keywords)

        print(type(keywords))

        s = ",".join(keywords)

        print(s)
        return s
# if __name__ == "__main__":
# Run the extractor on the class's default text.
c = KWE()
c.keywordExtract()

['photos', 'client', 'users', 'trillions', 'privacy', 'global', 'zanzibar', 'online', 'youtube', 'google']
<class 'list'>
photos,client,users,trillions,privacy,global,zanzibar,online,youtube,google


'photos,client,users,trillions,privacy,global,zanzibar,online,youtube,google'