### Getting text from multi-column pdf

In [1]:
!pip3 install sentence-transformers
!pip3 install networkx



In [2]:
!pip3 install nltk



In [3]:
!pip3 install pymupdf



In [4]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /home/akshat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/akshat/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/akshat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/akshat/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
import fitz
DIGITIZED_FILE = "Psychology.pdf"

In [6]:
from operator import itemgetter


def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles

In [7]:
def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag 
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [8]:
def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                if s['size'] == previous_s['size']:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[s['size']] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[s['size']] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = size_tag[s['size']] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    # block_string += "|"

                header_para.append(block_string)

    return header_para

In [9]:
doc = fitz.open(DIGITIZED_FILE)
font_style, styles = fonts(doc)
size_tag = font_tags(font_style, styles)
headers_para = headers_para(doc, size_tag)

In [10]:
headings = []
paragraphs = []
for text in headers_para:
    if text.startswith("<h"):
        headings.append(text[3:])
    elif text.startswith("<s"):
        pass
    else:
        if text.endswith("|"):
            paragraphs.append(text[3:-1])
        else:
            paragraphs.append(text[3:])

In [11]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
wnl = WordNetLemmatizer()

 
class LemmatizerHelper(object):
    """
    Class to aid the lemmatization process - from word to stemmed form,
    and vice versa.
    The 'original' form of a lemmatized word will be returned as the
    form in which its been used the most number of times in the text.
    """
 
    #This reverse lookup will remember the original forms of the lemmatized
    #words
    word_lookup = {}
 
    @classmethod
    def lemmatize(cls, sentence):
        """
        Lemmatize a sentence and updates the reverse lookup.
        """
 
        #Lemmatize the word
        lemmatized = cls.lemmatize_sentence(sentence)
 
        return lemmatized

    @classmethod
    def nltk_pos_tagger(cls, nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:          
            return None

    @classmethod
    def lemmatize_sentence(cls, sentence):

        nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
        wordnet_tagged = map(lambda x: (x[0], cls.nltk_pos_tagger(x[1])), nltk_tagged)
        lemmatized_sentence = []
        
        for word, tag in wordnet_tagged:
            if tag is None:
                lemmatized_sentence.append(word)
            else:        
                lemmatized_sentence.append(wnl.lemmatize(word, tag))
        return " ".join(lemmatized_sentence)


In [12]:
lemmatized_paras = []
for i in range(len(paragraphs)):
    if paragraphs[i] != "":
        lemmatized_paras.append(LemmatizerHelper.lemmatize(paragraphs[i]))
print(lemmatized_paras)

['Rotman Research Institute at Baycrest , Toronto , Ontario , Canada', 'In real-life decision situation we be often face with alterna- tives that seem so equivalent that choice be extremely difficult . Under such circumstance our final selection may feel like an arbitrary choice , although in fact there may be implicit influence act outside conscious control that bias us toward select one alternative over another . The observation that people can make correct choice while believe that they be select randomly have a long history in experimental psychology . Studies date from the 19th century have consistently find that participant can make subtle perceptual discrimination judgment with above- chance accuracy despite claim that they be simply guess ( Adams , 1957 ; Voss & Paller , 2010 ) . Voss and colleague have recently provide evidence for a similar effect in recognition memory ( Voss , Baym & Paller , 2008 ) . Participants study a series of kaleidoscope image and then attempt to reco

In [13]:
# from gensim.parsing.preprocessing import remove_stopwords
# word_list = []
# for paragraph in stemmed_paras:
#     if paragraph != '':
#         filtered_sentence = remove_stopwords(paragraph)
#         temp = filtered_sentence.split(" ")
#         word_list.append(temp)


In [14]:
# from gensim.models import Word2Vec
# min_count = 3
# size = 100
# window = 4
 
# model = Word2Vec(word_list, min_count=min_count, vector_size=size, window=window)

In [15]:
# key_terms = list(model.wv.index_to_key)
# key_terms

In [16]:
# new_key_terms = []
# for i in key_terms:
#     if len(i) >= 3:
#         new_key_terms.append(i)

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd


class KWE:
    # text="""Determining whether online users are authorized to access digital objects is central to preserving privacy. This pa- per presents the design, implementation, and deployment of Zanzibar, a global system for storing and evaluating ac- cess control lists. Zanzibar provides a uniform data model and configuration language for expressing a wide range of access control policies from hundreds of client services at Google, including Calendar, Cloud, Drive, Maps, Photos, and YouTube. Its authorization decisions respect causal or- dering of user actions and thus provide external consistency amid changes to access control lists and object contents. Zanzibar scales to trillions of access control lists and millions of authorization requests per second to support services used by billions of people. It has maintained 95th-percentile la- tency of less than 10 milliseconds and availability of greater than 99.999% over 3 years of production use."""

    def __init__(self,t):
        self.text=t

    def keywordExtract(self):
        n_gram_range = (1, 1)
        stop_words = "english"
        count = CountVectorizer(ngram_range=n_gram_range,
                                stop_words=stop_words).fit(self.text)

        candidates = count.get_feature_names_out()
        model = SentenceTransformer('all-mpnet-base-v2')
        doc_embedding = model.encode(self.text)
        candidate_embeddings = model.encode(candidates)

        top_n = 25
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        keywords = [candidates[index] for index in distances.argsort()[0]][-top_n:]

        return keywords, candidates, model
    # keywrdextract(data)
# if _name_ == "_main_":
c = KWE(lemmatized_paras)
keywords, candidates, model = c.keywordExtract()

In [29]:
print(keywords)

['address', 'phenom', 'experimental', 'studies', 'target', 'authors', 'mechanism', 'publication', 'ac', 'stem', 'central', 'paller', 'chamberland', 'institute', 'generated', 'rho', 'publications', 'experiments', 'research', 'researcher', 'canada', 'baycrest', 'ontario', 'toronto', 'rotman']


In [30]:
from sklearn.metrics.pairwise import cosine_similarity
from networkx import Graph

def build_mind_map(nodes, model, candidates, root="experimental", alpha=0.2):
    g = Graph()

    for node in nodes:
        if node not in candidates:
            raise ValueError(f"Node {node} not in candidates")
    if root not in nodes:
        raise ValueError(f"Root {root} not in nodes")

    unvisited_nodes = set(nodes)

    visited_nodes = set([])

    visited_node_vectors = {}

    node_distances = {}


    current_node = root
    visited_node_vectors[root]=model.encode([root])
    unvisited_nodes.remove(root)
    visited_nodes.add(root)

    for i in range(1, len(nodes)):
        for x in unvisited_nodes.copy():
            dist_from_current = cosine_similarity(visited_node_vectors[root], model.encode([x]))
            distance = node_distances.get(x, (100, ''))
            if distance[0] > dist_from_current:
                node_distances[x] = (dist_from_current, current_node)
            
            next_node = min(unvisited_nodes, key=lambda u: node_distances[x][0])

            if next_node in node_distances.keys():
                parent = node_distances[next_node][1]
            else: 
                node_distances[next_node] = (cosine_similarity(visited_node_vectors[root], model.encode([next_node])), current_node)
                parent = node_distances[next_node][1]
            del node_distances[next_node]
            next_node_vec = ((1 - alpha) * model.encode([next_node]) + alpha * visited_node_vectors[parent])

            visited_node_vectors[next_node] = next_node_vec
            unvisited_nodes.remove(next_node)

            visited_nodes.add(next_node)

            g.add_edge(parent, next_node)

            current_node = next_node
    
    return g


mind_map = build_mind_map(nodes=keywords, model=model, candidates=candidates)

In [31]:
print(mind_map)

Graph with 25 nodes and 24 edges


In [33]:
import networkx as nx
Agraph = nx.nx_agraph.to_agraph(mind_map)
Agraph.layout(prog="dot")
Agraph.draw("file.png")