In [None]:
# TextFind demo on importing pdf documents and doing semantic searches and question answering

import warnings
# Silence every warning for the rest of the kernel session so the demo output
# stays short. NOTE(review): this is a blanket filter; it presumably also hides
# the insecure-request warnings triggered by the client created further down
# with verify_ssl_cert=False -- confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Base class used by all the extractors to accommodate the extracted text for saving into TextFind

import abc


class BaseExtractor:
    """Base class for extractors that pack extracted text into chunks.

    Subclasses provide the text sources (``page_text_provider``) and the way
    text is read from each of them (``extract_text``); ``extract_text_chunks``
    combines the two and groups the resulting text into chunks.

    Note: the class does not use ``abc.ABCMeta``, so the ``abstractmethod``
    decorators below are not enforced at instantiation time; they only mark
    the methods a subclass is expected to override.
    """

    def __init__(self, text_chunk_max_size):
        """
        :param text_chunk_max_size: number of characters above which the chunk
            being built is closed and a new one is started.
        """
        self.text_chunk_max_size = text_chunk_max_size

    @abc.abstractmethod
    def page_text_provider(self):
        """Return an iterable of objects that ``extract_text`` can consume."""
        pass

    @abc.abstractmethod
    def extract_text(self, text_provider):
        """Return, as a ``str``, the text held by one ``text_provider`` item."""
        pass

    def extract_text_chunks(self):
        """Extract the text of every provider and group it into chunks.

        Text is accumulated provider by provider; as soon as the accumulated
        text is longer than ``text_chunk_max_size`` it is emitted as a chunk.
        A provider's text is never split, so a chunk can exceed the limit by
        up to the length of the last provider's text.

        :return: list of non-empty text chunks, in extraction order.
        """
        text_chunks_arr = []
        current_text_chunk = ''
        for text_provider in self.page_text_provider():
            # Append each provider's text exactly once. The previous version
            # appended it a second time whenever the chunk was still below the
            # limit, which duplicated the text inside the chunk.
            current_text_chunk += self.extract_text(text_provider)
            if len(current_text_chunk) > self.text_chunk_max_size:
                text_chunks_arr.append(current_text_chunk)
                current_text_chunk = ''
        # Flush whatever is left after the last provider.
        if current_text_chunk:
            text_chunks_arr.append(current_text_chunk)
        return text_chunks_arr



In [None]:
# Simple Pdf text extractor. Not that fancy but it does its job

from pdfminer.layout import LTFigure
from pdfminer.layout import LTTextBox
from pdfminer.layout import LTChar
from pdfminer.layout import LTTextLine
from pdfminer.high_level import extract_pages


class PdfExtractor(BaseExtractor):
    """Text extractor for PDF files built on pdfminer's ``extract_pages``.

    Each item returned by ``page_text_provider`` is one layout produced by
    ``extract_pages``; ``extract_text`` concatenates the text found in the
    objects of that layout.
    """

    def __init__(self, pdf_file, text_chunk_max_size):
        """
        :param pdf_file: value handed as-is to pdfminer's ``extract_pages``.
        :param text_chunk_max_size: forwarded to ``BaseExtractor``.
        """
        BaseExtractor.__init__(self, text_chunk_max_size=text_chunk_max_size)
        self.pdf_file = pdf_file

    # This method used to be defined twice with identical bodies; the second
    # definition silently replaced the first, so only one is kept.
    def page_text_provider(self):
        """Return the layouts produced by ``extract_pages`` for the PDF file."""
        return extract_pages(self.pdf_file)

    def extract_text(self, layout):
        """Return the concatenated text of every object in ``layout``."""
        return ''.join(PdfExtractor.__text(lobj) for lobj in layout)

    @staticmethod
    def __text(lobj):
        """Return the text of a single layout object.

        Only two kinds of objects contribute text: text boxes (through their
        ``LTTextLine`` children) and figures (through their direct ``LTChar``
        children). Anything else yields an empty string.
        """
        if isinstance(lobj, LTTextBox):
            wanted_child_type = LTTextLine
        elif isinstance(lobj, LTFigure):
            wanted_child_type = LTChar
        else:
            return ''
        return ''.join(element.get_text()
                       for element in lobj
                       if isinstance(element, wanted_child_type))


In [None]:
# Utility function that rejects lines shorter than the min_line_length value
# and removes some newlines (prone to change, but for now it does the job).

def text_chunk_arr_to_sentences(text_chunk_arr, min_line_length=10):
    """Merge the lines of each text chunk into sentence-like strings.

    Within a chunk, lines are glued together until a line ending with ``'.'``
    is met; that line closes the current sentence. Lines that do not end with
    a period are kept only when they are at least ``min_line_length``
    characters long. Sentences never span two chunks.

    NOTE: lines are joined without any separator, exactly as before; words at
    a line break end up adjacent unless the line itself carries a trailing
    space.

    :param text_chunk_arr: iterable of text chunks (``str``).
    :param min_line_length: minimum length for a line without a trailing
        period to be kept.
    :return: list of sentences, in reading order.
    """
    sentences = []
    for text_chunk in text_chunk_arr:
        sentence = ''
        for line in text_chunk.splitlines():
            if line.endswith('.'):
                sentences.append(sentence + line)
                sentence = ''
            elif len(line) >= min_line_length:
                sentence += line
        # Keep the text that follows the last period of the chunk; it used to
        # be silently discarded when the next chunk started.
        if sentence:
            sentences.append(sentence)
    return sentences

In [None]:
# Create a minimal TextFind client to exercise a few TextFind endpoints like:
# creating documents, uploading files, querying some basic metadata (channel and channel properties)

import logging
import requests
import json
import copy
import os
import mimetypes
import uuid
from datetime import datetime


class TextFindClient:
    """Minimal client for a handful of TextFind REST endpoints.

    It covers creating text documents, uploading files, looking up the two
    file-related channels, semantic search and question answering. Every
    request carries the application key in the ``api-key`` header and honours
    the ``verify_ssl_cert`` setting given at construction time.
    """

    # Name of the HTTP header that carries the application key.
    API_KEY = "api-key"

    # Channel that receives one summary document per imported file.
    FILE_DOCUMENT_CHANNEL_NAME = "__file_documents__"

    # Channel that receives one document per extracted text chunk.
    FILE_PAGE_DOCUMENTS_CHANNEL_NAME = "__file_page_documents__"

    def __init__(self, url, app_key=None, verify_ssl_cert=True):
        """
        :param url: base URL of the TextFind server (no trailing slash).
        :param app_key: value sent in the ``api-key`` header.
        :param verify_ssl_cert: passed as ``verify`` to every ``requests`` call.
        """
        self.app_key = app_key
        self.url = url
        self.verify_ssl_cert = verify_ssl_cert

    def _headers(self):
        """Return the headers sent with every request."""
        return {self.API_KEY: self.app_key}

    @staticmethod
    def _raise_if_failed(response):
        """Log and raise ``Exception(response.text)`` for a non-OK response."""
        if not response.ok:
            logging.error(response.text)
            raise Exception(response.text)

    def create_document(self, topic_uuid, channel_uuid, document, shared_with_group_uuids=None, file_object_ids=None):
        """Create one text document on the server.

        :param topic_uuid: sent as ``topicUuid``.
        :param channel_uuid: sent as ``channelUuid``.
        :param document: dict used as the document ``context``; it is deep
            copied, so the caller's dict is left untouched.
        :param shared_with_group_uuids: sent as ``sharedWithGroupUuids``;
            defaults to an empty list.
        :param file_object_ids: ids of uploaded files related to the document;
            stored comma-joined under ``relatedFileObjectIDs`` (``None`` when
            there are none).
        :raises Exception: when the server answers with a non-OK status.
        """
        document_context = copy.deepcopy(document)
        document_context["relatedFileObjectIDs"] = ','.join(file_object_ids) if file_object_ids else None
        payload = {
            'topicUuid': topic_uuid,
            'channelUuid': channel_uuid,
            'sharedWithGroupUuids': [] if shared_with_group_uuids is None else shared_with_group_uuids,
            'context': document_context,
        }
        response = requests.post(url=f'{self.url}/sas-api/documents/text-documents',
                                 json=payload, headers=self._headers(), verify=self.verify_ssl_cert)
        self._raise_if_failed(response)

    @staticmethod
    def __mimetype(file_path):
        """Return the ``(type, encoding)`` tuple of ``mimetypes.guess_type``."""
        return mimetypes.guess_type(file_path)

    @staticmethod
    def __basename(file_path):
        """Return the final component of ``file_path``."""
        return os.path.basename(file_path)

    def upload_file(self, file_path):
        """Upload a file and return the ``fileObjectId`` assigned to it.

        :raises Exception: on a non-OK HTTP status, or when the response body
            reports ``success`` as false (the server's ``errorMessage`` is
            used as the exception message).
        """
        basename = TextFindClient.__basename(file_path)
        # guess_type returns (type, encoding); only the type string belongs in
        # the multipart tuple. Unknown extensions fall back to a generic type.
        mime_type, _ = TextFindClient.__mimetype(file_path)
        if mime_type is None:
            mime_type = 'application/octet-stream'

        # The context manager guarantees the handle is closed even when the
        # request raises.
        with open(file_path, 'rb') as file_handle:
            response = requests.post(f'{self.url}/sas-api/file-manager/file',
                                     files={'file': (basename, file_handle, mime_type)},
                                     headers=self._headers(), verify=self.verify_ssl_cert)
        self._raise_if_failed(response)

        body = json.loads(response.text)
        if body["success"]:
            return body["fileObjectId"]
        logging.error(body["errorMessage"])
        raise Exception(body["errorMessage"])

    def create_document_from_file(self,
                                  file_path,
                                  topic_uuid,
                                  channel_user_id,
                                  channel_user_name,
                                  title=None, max_page_size=10000, max_title_size=100,
                                  text_extractors_map=None,
                                  upload_file=False):
        """Import a file: one summary document plus one document per text chunk.

        :param file_path: path of the file to import; only files whose guessed
            mime type is ``application/pdf`` are supported.
        :param topic_uuid: topic the created documents belong to.
        :param channel_user_id: stored as ``channel_user_id`` in each document.
        :param channel_user_name: stored as ``channel_user_name``.
        :param title: document title; when ``None`` the first
            ``max_title_size`` characters of the first text chunk are used.
        :param max_page_size: chunk size limit given to the text extractor.
        :param max_title_size: maximum length of an automatically derived title.
        :param text_extractors_map: not supported yet, must be ``None``.
        :param upload_file: when true the file itself is uploaded and linked
            to the summary document.
        :raises Exception: for unsupported files, when ``text_extractors_map``
            is given, or when a server call fails.
        """
        # Pick the extractor before touching the network so an unsupported
        # file is rejected without being uploaded first.
        mt, _ = TextFindClient.__mimetype(file_path)
        if text_extractors_map is None:
            if mt == "application/pdf":
                text_extractor = PdfExtractor(file_path, text_chunk_max_size=max_page_size)
            else:
                raise Exception(f"There is no default text extractor with the [{mt}] extension"
                                f" for the provided file {file_path}")
        else:
            # NOTE(review): a supplied map is never consulted, every non-None
            # value ends up here. The intended shape of the map is not visible
            # in this file, so the behaviour is left unchanged.
            raise Exception(f"Please provide a text extractor for files with the {mt} extension.")

        file_doc_channel_uuid = self.get_file_documents_channel()['uuid']
        file_object_ids = [self.upload_file(file_path)] if upload_file else []

        text_arr = text_extractor.extract_text_chunks()
        # text_arr is empty when no text could be extracted; guard the index.
        if title is None and text_arr and len(text_arr[0]) > 0:
            title = text_arr[0][:max_title_size]

        file_document_uuid = str(uuid.uuid1())
        now = datetime.now()
        current_time = now.strftime("%H:%M")
        # NOTE(review): this renders year-DAY-month. It looks like a swapped
        # "%Y-%m-%d", but the format the server expects is not visible here,
        # so it is left as is -- confirm against the TextFind API.
        post_date = now.strftime("%Y-%d-%m")

        self.create_document(topic_uuid=topic_uuid,
                             channel_uuid=file_doc_channel_uuid,
                             document={
                                 "channel_user_id": channel_user_id,
                                 "channel_user_name": channel_user_name,
                                 "file_document_uuid": file_document_uuid,
                                 "title": f"{title}",
                                 "text": f"title: {title}\n"
                                         f"This is a file document with file_document_uuid equals to "
                                         f"{file_document_uuid}\n",
                                 "url": "n/a",
                                 "post_time": current_time,
                                 "post_date": post_date,
                             },
                             shared_with_group_uuids=[],
                             file_object_ids=file_object_ids)
        self.create_file_page_document(channel_user_id, channel_user_name, current_time, file_document_uuid,
                                       post_date, text_arr, title, topic_uuid)

    def create_file_page_document(self, channel_user_id, channel_user_name, current_time, file_document_uuid,
                                  post_date, text_arr, title, topic_uuid):
        """Create one document per entry of ``text_arr`` in the page channel.

        Pages are numbered from 1; the number is appended to the title and
        stored as a string under ``page_number``. Every page document carries
        ``file_document_uuid`` so it can be tied back to the summary document.
        """
        file_page_channel_uuid = self.get_file_page_documents_channel()['uuid']
        for page_number, text in enumerate(text_arr, start=1):
            self.create_document(topic_uuid=topic_uuid,
                                 channel_uuid=file_page_channel_uuid,
                                 document={
                                     "channel_user_id": channel_user_id,
                                     "channel_user_name": channel_user_name,
                                     "file_document_uuid": file_document_uuid,
                                     "title": f"{title} - page {page_number}",
                                     "text": text,
                                     "url": "n/a",
                                     "post_time": current_time,
                                     "post_date": post_date,
                                     "page_number": str(page_number)
                                 },
                                 shared_with_group_uuids=[],
                                 file_object_ids=[])

    def _get_channel_by_name(self, channel_name):
        """Return the first channel whose ``name`` equals ``channel_name``.

        :raises Exception: when the server answers with a non-OK status.
        :raises IndexError: when no channel has that name (same exception type
            the former ``channels[0]`` lookup produced).
        """
        response = requests.get(f'{self.url}/sas-api/metadata/channels',
                                headers=self._headers(), verify=self.verify_ssl_cert)
        self._raise_if_failed(response)
        for channel in json.loads(response.text):
            if channel['name'] == channel_name:
                return channel
        raise IndexError(f"No channel named [{channel_name}] was returned by {self.url}")

    def get_file_documents_channel(self):
        """Return the metadata of the ``__file_documents__`` channel."""
        return self._get_channel_by_name(self.FILE_DOCUMENT_CHANNEL_NAME)

    def get_file_page_documents_channel(self):
        """Return the metadata of the ``__file_page_documents__`` channel."""
        return self._get_channel_by_name(self.FILE_PAGE_DOCUMENTS_CHANNEL_NAME)

    def search_semantic(self, q):
        """Run a semantic search for ``q`` and return the ``response`` field.

        :raises Exception: when the server answers with a non-OK status.
        """
        response = requests.get(f'{self.url}/sas-api/semantic-search', params={'q': q},
                                headers=self._headers(), verify=self.verify_ssl_cert)
        self._raise_if_failed(response)
        search_results = json.loads(response.text)
        return search_results["response"]

    def qa(self, q):
        """Ask question ``q`` and return the decoded JSON answer.

        :raises Exception: when the server answers with a non-OK status.
        """
        response = requests.get(f'{self.url}/sas-api/qa', params={'q': q},
                                headers=self._headers(), verify=self.verify_ssl_cert)
        self._raise_if_failed(response)
        return json.loads(response.text)


In [None]:
import os

# Connection settings for the demo. Each value can be overridden through an
# environment variable so the notebook runs unchanged on another machine.
url = os.environ.get('TEXTFIND_URL', 'https://192.168.1.48:31854')
# The API key is a secret and must not live in the notebook: export
# TEXTFIND_API_KEY before starting the kernel. The key that used to be
# hard-coded here should be considered leaked and be rotated.
api_key = os.environ.get('TEXTFIND_API_KEY')
pdf_files_dir = os.environ.get('TEXTFIND_PDF_DIR',
                               '/home/nicolae/caralislabs/import_files_to_textfind/pdf_files/')

In [None]:
from os import listdir

# Show the entries of the import directory; the last cell of the notebook
# iterates over the same listing.
listdir(pdf_files_dir)


In [None]:
# Create an instance of TextFind client
# NOTE(review): verify_ssl_cert=False is forwarded as `verify=False` to every
# request, i.e. the server certificate is not checked. Presumably needed for a
# self-signed certificate on the demo host -- do not reuse against a real deployment.

text_find_client = TextFindClient(url=url, app_key=api_key, verify_ssl_cert=False)

In [None]:
# Look up the `__file_documents__` channel; the cell output is whatever
# metadata the server returned for it.
text_find_client.get_file_documents_channel()

In [None]:
# Execute semantic search
# search_semantic returns the "response" field of the server answer; the
# cell displays its "docs" entry.

resulst = text_find_client.search_semantic(q='Biden administration has been pushing EVs')
resulst['docs']


In [None]:
# Execute question answering
# `resulst` is overwritten with the QA response here and is read again by the
# next cell, so the two cells must be run in order.

resulst = text_find_client.qa(q='What did Toyota Chairman say with regards to Electric Cars')
'Answer:' + resulst['answer']


In [None]:
# Show the `sentence_context` field of the QA response stored in `resulst`
# by the previous cell.
resulst['sentence_context']

In [None]:
# Import all the files in TextFind
# Every entry of the directory is imported, using the file name as title.
# NOTE: create_document_from_file raises for anything whose guessed mime type
# is not application/pdf, so a single non-PDF entry aborts the whole loop.
# NOTE: the cell is not idempotent -- each run creates new documents (and
# uploads the files again) on the server.

for file_name in listdir(f"{pdf_files_dir}"):
    print(f"> start importing [{file_name}]")
    text_find_client.create_document_from_file(
        channel_user_id="admin01",
        channel_user_name="admin01",
        title=f"{file_name}",
        file_path=f"{pdf_files_dir}/{file_name}",
        # Topic the documents are attached to; the UUID is specific to the demo server.
        topic_uuid='62f0d33f-b2b5-4e65-a918-33cf152dfae3',
        max_page_size=10000,
        upload_file=True)
    print(f"> importing finished [{file_name}]")
