In [3]:
# import block
import os
from pathlib import Path
from semantica.db.db_tools import SemanticaDB
from semantica.utils.summarize import SemanticaSummarizer
from semantica.utils.parser import parse_files
from semantica.utils.preprocessing import preprocess
import gradio as gr

In [4]:
# Create the two long-lived helpers that the later cells share:
# the text summarizer and the document collection.
# NOTE(review): the recorded output of this cell shows the underlying pipeline
# falling back to a default model; pin model and revision if
# SemanticaSummarizer exposes such an option — TODO confirm.
DB_NAME = "semanticadb"

summarizer = SemanticaSummarizer()
db = SemanticaDB(DB_NAME)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [6]:
# parse pdf files into txt
# Both folders are resolved relative to the current working directory:
# raw PDFs live in data/pdf_raw, extracted text goes to data/extracted.
pdf_path = Path("data") / "pdf_raw"
txt_path = pdf_path.parent / "extracted"

# An existing output folder is treated as a completed extraction and reused,
# so the (slow) parsing step only runs once.
if not txt_path.exists():
    # Building the paths above never touches the filesystem, so the missing
    # input folder has to be detected explicitly before parsing starts.
    if not pdf_path.is_dir():
        raise FileNotFoundError(f"Base folder not found: {pdf_path}")
    parse_files(base_folder=str(pdf_path),
                dst_folder=str(txt_path), num_workers=0)
    print("parsing done.")

extracted F6.pdf
extracted F0.pdf
extracted F9.pdf
extracted F3.pdf
extracted F2.pdf
extracted F8.pdf
extracted F7.pdf
extracted F1.pdf
extracted F5.pdf
parsing done.


In [7]:
# add data to the collection
# Only regular *.txt files are ingested (stray files such as .DS_Store or
# sub-folders are skipped); sorting makes the ingestion order reproducible.
files = sorted(path for path in txt_path.glob("*.txt") if path.is_file())
names = [file.name for file in files]
metadata = [{"file_path": str(file)} for file in files]
sequences = [preprocess(file_path=file) for file in files]
print(f"number of files: {len(sequences)}")

# The document id is the file name without its extension (Path.stem), which
# does not depend on the extension being exactly four characters long.
for file, seq, met in zip(files, sequences, metadata):
    db.add_data(file.stem, seq, met)
print("number of files in db:", db.collection.count())

number of files: 9
File F6 added to database.
File F7 added to database.
File F5 added to database.
File F0 added to database.
File F1 added to database.
File F3 added to database.
File F2 added to database.
File F9 added to database.
File F8 added to database.
number of files in db: 9


In [10]:
# Sample question for the search demo.
# NOTE(review): not referenced by any code visible in this notebook excerpt —
# presumably kept for manual testing; confirm before removing.
QUERY = "what is the wage for broadcast technicians?"


def search_interface(query):
    """Look up the two best-matching files for ``query`` and summarise them.

    The module-level ``db`` is asked for two matches (ids and paths) and the
    module-level ``summarizer`` produces one summary per returned path.

    Args:
        query: Free-text search string typed by the user.

    Returns:
        A single string holding one block per match, each followed by a blank
        line: the file name (``<id>.txt``), its path and its summary. An empty
        string is returned when there are no matches.
    """
    doc_ids, doc_paths = db.get_files(query, 2)
    doc_summaries = summarizer.summarize_files(doc_paths)
    return "".join(
        f"File: {doc_id}.txt\nPath: {doc_path}\n{doc_summary}\n\n"
        for doc_id, doc_path, doc_summary in zip(doc_ids, doc_paths, doc_summaries)
    )


# Wrap the search function in a minimal text-in / text-out web UI.
# Flagging is disabled with the documented string value "never"; the boolean
# form (allow_flagging=False) is deprecated by Gradio.
interface = gr.Interface(fn=search_interface, inputs="text",
                         outputs="text", allow_flagging="never")
# NOTE(review): share=True publishes a temporary public URL (see the recorded
# output), which makes the indexed documents reachable by anyone holding the
# link — confirm this is intended before running on non-public data.
interface.launch(share=True, inline=True)



Running on local URL:  http://127.0.0.1:7862


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running on public URL: https://91ff0f48e33c0b329c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


