In [None]:
!pip install -q weaviate-client
!pip install -q PyPDF2
!pip install -q openai
!pip install -q keybert
!pip install -q keybert[flair]
!pip install -q keybert[gensim]
!pip install -q keybert[spacy]
!pip install -q keybert[use]
!pip install -q git+http://github.com/LIAAD/yake
!pip install -q langchain

In [None]:
from pprint import pprint

## Data (examples)

### Functions

In [None]:
# File names of the PDFs to index. doc2 is an empty string and is not
# referenced anywhere else in this section.
doc1, doc2 = "1706.03762.pdf", ""

In [None]:
from PyPDF2 import PdfReader
def read_pdf(file_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to open (read in binary mode).

    Returns:
        The extracted page texts joined with no separator between pages.
    """
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        # Iterate the pages directly and join once instead of growing a string
        # with repeated `+=`. `or ""` tolerates a falsy result (e.g. None) from
        # extract_text() so one text-less page cannot break the concatenation.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def chunking(some_text: str, chunk_size: int = 150, chunk_overlap: int = 0) -> list[str]:
    """Split text into chunks with LangChain's RecursiveCharacterTextSplitter.

    Args:
        some_text: The text to split.
        chunk_size: Forwarded as the splitter's ``chunk_size``; defaults to the
            150 that was previously hard-coded.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``; defaults
            to the previous hard-coded 0.

    Returns:
        Whatever ``split_text`` returns for ``some_text``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Separator list handed to the splitter: paragraph break, line break,
        # space, then the empty string.
        separators=["\n\n", "\n", " ", ""],
    )
    return splitter.split_text(some_text)


def chunking_md(some_text: str) -> list:
    """Split Markdown text on its level 1-3 headers.

    Args:
        some_text: Markdown source to split.

    Returns:
        The list produced by ``MarkdownHeaderTextSplitter.split_text``. LangChain
        documents these elements as ``Document`` objects (chunk text plus header
        metadata), not plain strings, which is why the return annotation is a
        bare ``list`` rather than ``list[str]``.
    """
    from langchain.text_splitter import MarkdownHeaderTextSplitter

    # (markdown prefix, label) pairs handed to the splitter.
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    return markdown_splitter.split_text(some_text)

def extract_kws_bert(doc: str) -> list[str]:
  """Extract keywords from ``doc`` with a default-configured KeyBERT model.

  A new ``KeyBERT()`` instance is built on every call. Only the first element
  of each entry returned by ``extract_keywords`` is kept.
  """
  from keybert import KeyBERT

  scored_keywords = KeyBERT().extract_keywords(doc)
  terms = []
  for entry in scored_keywords:
    terms.append(entry[0])
  return terms

def extract_kws(doc: str) -> list[str]:
  """Extract keywords from ``doc`` with a YAKE extractor created with ``lan="en"``.

  Only the first element of each entry returned by ``extract_keywords`` is kept.
  """
  import yake

  scored = yake.KeywordExtractor(lan="en").extract_keywords(doc)
  return list(map(lambda pair: pair[0], scored))

### Queries

In [None]:
# Example questions for the search step. Only query_2 is passed to
# perform_search in the final cell of this section.
query_1 = "Which are some popular libraries of bioinformatics ?"
query_2 = "Which applications can I develop using Reinforcement Learning?"
query_3 = "Where is CapitaLand Investment Limited incorporated ?"
query_4 = "Can you make a 30 words summary of the Message to Shareholders?"

### Documents

In [None]:
doc_1 = read_pdf(doc1)


In [None]:
doc_1 = doc_1[:int(len(doc_1)*0.1)]

In [None]:
# The corpus handed to prepare_doc_objs: a one-element list holding the
# truncated text of doc_1.
documents = [doc_1]

## Vector DB

### Client

In [None]:
import os
from typing import Any

import weaviate
from google.colab import userdata
from weaviate.client import WeaviateClient


class WeaviateClientSession:
    """Context manager around an embedded Weaviate client.

    The client is created in ``__init__`` through ``weaviate.connect_to_embedded``.
    Entering the ``with`` block calls ``connect()`` on it and yields it; leaving
    the block calls ``close()``.
    """

    def __init__(self, version: str = '1.23.7') -> None:
        """Create the embedded client.

        Args:
            version: Forwarded as ``version`` to ``weaviate.connect_to_embedded``.
        """
        self.version = version
        # The OpenAI key comes from Colab's userdata store; reading it with
        # os.getenv('OPENAI_API_KEY') is the alternative noted by the author.
        request_headers = {
            "X-OpenAI-Api-Key": userdata.get('OPENAI_API_KEY'),
        }
        self.client = weaviate.connect_to_embedded(
            version=version,
            headers=request_headers,
            persistence_data_path='./db',
        )

    def __enter__(self) -> WeaviateClient:
        """Connect the client and hand it to the ``with`` block."""
        session_client = self.client
        session_client.connect()
        return session_client

    def __exit__(self, exc_type: type, exc_value: Exception, traceback: Any) -> None:
        """Close the client; returns None, so exceptions are not suppressed."""
        self.client.close()

### Create collection

In [None]:
import weaviate.classes as wvc


# The client persists its data under ./db, so the collection may already exist
# from an earlier run. Create it only when missing so this cell can be re-run.
with WeaviateClientSession() as client:
    if not client.collections.exists('Documents'):
        client.collections.create(
            name='Documents',
            properties=[
                # Index of the source document within the `documents` list.
                wvc.config.Property(
                    name='document_id',
                    data_type=wvc.config.DataType.INT,
                    skip_vectorization=True,
                ),
                # The chunk text; the only property that is not skipped for vectorization.
                wvc.config.Property(
                    name='chunk_text',
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=False,
                ),
                # Position of the chunk within its document.
                wvc.config.Property(
                    name='chunk_index',
                    data_type=wvc.config.DataType.INT,
                    skip_vectorization=True,
                ),
                # Space-joined keywords, tokenized on whitespace.
                wvc.config.Property(
                    name='keywords',
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=True,
                    tokenization=wvc.config.Tokenization.WHITESPACE,
                ),
            ],
            vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai('text-embedding-3-small'),
            generative_config=wvc.config.Configure.Generative.openai(model='gpt-3.5-turbo'),
        )

In [None]:
# Pretty-print what client.collections.list_all() returns for the embedded instance.
with WeaviateClientSession() as client:
  pprint(client.collections.list_all())

### Store document chunks and keywords

In [None]:
def prepare_doc_objs(docs: list[str]) -> list[dict]:
  """Build one insertable record per chunk of every document.

  Args:
    docs: Document texts; a document's position in the list becomes its
      ``document_id``.

  Returns:
    A flat list of dicts with the keys ``document_id``, ``chunk_text``,
    ``chunk_index`` and ``keywords``. ``keywords`` is the space-joined
    concatenation of the document-level keywords followed by the chunk-level
    keywords, both obtained from ``extract_kws``.
  """
  records: list[dict] = []

  for doc_id, doc in enumerate(docs):
    pieces = chunking(doc)
    # Document-level keywords are extracted once and shared by all its chunks.
    shared_kws = extract_kws(doc)

    records.extend(
      {
        'document_id': doc_id,
        'chunk_text': piece,
        'chunk_index': position,
        'keywords': ' '.join((*shared_kws, *extract_kws(piece))),
      }
      for position, piece in enumerate(pieces)
    )

  return records

In [None]:
# Chunk the documents and extract keywords before opening the database session.
doc_objs = prepare_doc_objs(documents)

# NOTE(review): re-running this cell presumably inserts the same chunks again
# (no UUIDs are supplied) — confirm, and clear the collection first if needed.
with WeaviateClientSession() as client:
    # No explicit client.connect() here: WeaviateClientSession.__enter__ already calls it.
    documents_coll = client.collections.get("Documents")
    insert_result = documents_coll.data.insert_many(doc_objs)

# Per-object failures are reported on the returned object; surface them instead
# of silently continuing with a partially filled collection.
if insert_result.errors:
    raise RuntimeError(
        f"{len(insert_result.errors)} of {len(doc_objs)} objects failed to insert: "
        f"{insert_result.errors}"
    )

## Perform hybrid search

In [None]:
def perform_search(query: str, n: int = 5, alpha: float = 0.25) -> list[dict]:
    """Run a hybrid search over the "Documents" collection.

    Args:
        query: The search text.
        n: Forwarded as ``limit`` to the hybrid query.
        alpha: Forwarded as ``alpha`` to the hybrid query.

    Returns:
        The ``properties`` of every object in the response, in response order.
        The results are returned rather than printed, as the ``list[dict]``
        annotation promises; a notebook cell ending in this call displays them.
    """
    with WeaviateClientSession() as client:
        documents_coll = client.collections.get("Documents")
        response = documents_coll.query.hybrid(
            query=query,
            # The keyword part of the search looks only at the `keywords` property.
            query_properties=["keywords"],
            # fusion_type=wvc.query.HybridFusion.RELATIVE_SCORE, # we need to use the score fusion (not rank) for autocut
            # auto_limit=1, # we only want one jump
            alpha=alpha,
            limit=n,
        )
        return [obj.properties for obj in response.objects]

In [None]:
# Run the hybrid search for query_2 with the default n and alpha.
perform_search(query_2)