# pipelines.extractor

> Pipeline for information extraction

In [None]:
# | default_exp pipelines.extractor

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
# | export

import os
from typing import Any, Dict, Generator, List, Optional, Tuple, Union, Callable
import pandas as pd
from onprem.utils import segment

from onprem.ingest import load_single_document


class Extractor:
    def __init__(
        self,
        llm,
        prompt_template: Optional[str] = None,
        **kwargs,
    ):
        """
        `Extractor` applies a given prompt to each sentence or paragraph in a document and returns the results.

        **Args:**

        - *llm*: An `onprem.LLM` object
        - *prompt_template*: A model specific prompt_template with a single placeholder named "{prompt}".
                             All extraction prompts are wrapped within this prompt.
                             If supplied, overrides the `prompt_template` supplied to the `LLM` constructor.
        - *kwargs*: Accepted for call compatibility; currently unused.

        """
        self.llm = llm
        # An explicit template wins; otherwise fall back to whatever the LLM was configured with.
        self.prompt_template = prompt_template if prompt_template is not None else llm.prompt_template

    def apply(self,
              ex_prompt_template: str,
              fpath: Optional[str] = None,
              content: Optional[str] = None,
              unit: str = 'paragraph',
              filter_fn: Optional[Callable] = None,
              clean_fn: Optional[Callable] = None,
              pdf_pages: Optional[List[int]] = None,
              maxchars=2048,
              stop: Optional[list] = None,
              ):
        """
        Apply the prompt to each `unit` (where a "unit" is either a paragraph or sentence) optionally filtered by `filter_fn`.
        Results are stored in a `pandas.Dataframe`.


        **Args:**

        - *ex_prompt_template*: A prompt to apply to each `unit` in document. Should have a single variable, `{text}`.
                               Example: `"Extract universities from the following text delimited by ###:\\n\\n###{text}###"`
        - *fpath*: A path to a single file of interest (e.g., a PDF or MS Word document). Mutually-exclusive with `content`.
        - *content*: Text content of a document of interest.  Mutually-exclusive with `fpath`.
        - *unit*: One of {'sentence', 'paragraph'}.
        - *filter_fn*: A function that accepts a sentence or paragraph and returns `True` if prompt should be applied to it.
                       If `filter_fn` returns False, the text is ignored and excluded from results.
        - *clean_fn*: A function that accepts a sentence or paragraph and returns "cleaned" version of the text.
                      If `filter_fn` exists, only applied to texts for which `filter_fn` returns True.
        - *pdf_pages*: If `fpath` is a PDF document, only apply prompt to text on page numbers listed in `pdf_pages`.
                       Page numbers start with 1, not 0 (e.g., `pdf_pages=[1,2,3]` for first three pages).
                       If omitted or empty, prompt is applied to every page.
        - *maxchars*: units (i.e., paragraphs or sentences) larger than `maxchars` are split.
        - *stop*: list of characters to trigger the LLM to stop generating. Defaults to an empty list.

        **Returns:**

        - pd.Dataframe: a Dataframe with an `Extractions` column (LLM output) and a `Texts` column (the unit prompted on)

        **Raises:**

        - ValueError: if both or neither of `fpath` and `content` are supplied, or if `fpath` is not an existing file.
        """
        # Exactly one of fpath/content must be supplied.
        if bool(fpath) == bool(content):
            raise ValueError('Either fpath argument or content argument must be supplied but not both.')

        # None defaults avoid sharing one mutable list object across calls.
        stop = [] if stop is None else stop
        selected_pages = set(pdf_pages) if pdf_pages else None

        # Wrap the extraction prompt in the model-specific template (if any);
        # the `{text}` placeholder survives this step and is filled per unit below.
        if self.prompt_template is None:
            extraction_prompt = ex_prompt_template
        else:
            extraction_prompt = self.prompt_template.format(prompt=ex_prompt_template)

        # Load text from file when raw content was not given.
        if not content:
            if not os.path.isfile(fpath):
                raise ValueError(f'{fpath} is not a file')
            docs = load_single_document(fpath)
            # splitext only reports a real extension, so a file literally named
            # "pdf" (no dot) is not mistaken for a PDF.
            is_pdf = os.path.splitext(fpath)[1].lower() == '.pdf'
            if is_pdf and selected_pages:
                # Page numbers are 1-based; the i-th loaded doc is treated as page i.
                docs = [doc for page_no, doc in enumerate(docs, start=1) if page_no in selected_pages]
            content = '\n\n'.join(doc.page_content for doc in docs)

        # Segment into units and prompt the LLM on each retained unit.
        extractions = []
        texts = []
        for chunk in segment(content, maxchars=maxchars, unit=unit):
            if filter_fn and not filter_fn(chunk):
                continue
            if clean_fn:
                chunk = clean_fn(chunk)
            prompt = extraction_prompt.format(text=chunk)
            extractions.append(self.llm.prompt(prompt, stop=stop))
            texts.append(chunk)
        return pd.DataFrame({'Extractions': extractions, 'Texts': texts})

In [None]:
show_doc(Extractor.apply)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/extractor.py#L39){target="_blank" style="float:right; font-size:smaller"}

### Extractor.apply

>      Extractor.apply (ex_prompt_template:str, fpath:Optional[str]=None,
>                       content:Optional[str]=None, unit:str='paragraph',
>                       filter_fn:Optional[Callable]=None,
>                       pdf_pages:List[int]=[], maxchars=2048, stop:list=[])

*Apply the prompt to each `unit` (where a "unit" is either a paragraph or sentence) optionally filtered by `filter_fn`.
Results are stored in a `pandas.Dataframe`.

        **Args:**

        - *ex_prompt_template*: A prompt to apply to each `unit` in document. Should have a single variable, `{text}`.
                               Example: `"Extract universities from the following text delimited by ###:

###{text}###"`
        - *fpath*: A path to a single file of interest (e.g., a PDF or MS Word document). Mutually-exclusive with `content`.
        - *content*: Text content of a document of interest.  Mutually-exclusive with `fpath`.
        - *unit*: One of {'sentence', 'paragraph'}. 
        - *filter_fn*: A function that accepts a sentence or paragraph and returns `True` if prompt should be applied to it.
                       If `filter_fn` returns False, the text is ignored and excluded from results.
        - *pdf_pages*: If `fpath` is a PDF document, only apply prompt to text on page numbers listed in `pdf_pages`.
                       Page numbers start with 1, not 0 (e.g., `pdf_pages=[1,2,3]` for first three pages).
                       If list is empty, prompt is applied to every page.
        - *maxchars*: units (i.e., paragraphs or sentences) larger than `maxchars` are split.
        - *stop*: list of characters to trigger the LLM to stop generating.

        **Returns:**

        - pd.Dataframe: a Dataframe with results*

In [None]:
# | notest

from onprem import LLM
from onprem.pipelines import Extractor

In [None]:
# | notest

# Build the LLM with a Mistral-style instruction template, then wrap it in an Extractor.
prompt_template = "<s>[INST] {prompt} [/INST]" # prompt template for Mistral
llm = LLM(model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf', 
          n_gpu_layers=33,  # change based on your system
          verbose=False, mute_stream=True, 
          prompt_template=prompt_template)
# No prompt_template is passed here, so Extractor falls back to llm.prompt_template.
extractor = Extractor(llm)



In [None]:
# | notest

# Few-shot extraction prompt: three worked [SENTENCE]/[CITATIONS] examples followed by
# the `{text}` placeholder that Extractor.apply fills in for each unit of the document.
prompt = """Extract citations from the following sentences. Return #NA# if there are no citations in the text. Here are some examples:

[SENTENCE]:pretrained BERT text classifier (Devlin et al., 2018), models for sequence tagging (Lample et al., 2016)
[CITATIONS]:(Devlin et al., 2018), (Lample et al., 2016)
[SENTENCE]:Machine learning (ML) is a powerful tool.
[CITATIONS]:#NA#
[SENTENCE]:Following inspiration from a blog post by Rachel Thomas of fast.ai (Howard and Gugger, 2020), we refer to this as Augmented Machine Learning or AugML
[CITATIONS]:(Howard and Gugger, 2020)
[SENTENCE]:{text}
[CITATIONS]:"""

In [None]:
# | notest

# Positive example: a sentence that contains one citation.
content = """
For instance, the fit_onecycle method employs a 1cycle policy (Smith, 2018). 
"""
# stop=['\n'] is forwarded to the LLM as its stop list.
df = extractor.apply(prompt, content=content, stop=['\n'])
# The first row of the 'Extractions' column should begin with the citation.
assert df['Extractions'][0].strip().startswith('(Smith, 2018)')

In [None]:
# | notest

# Negative example: a sentence with no citation, for which the prompt asks the model to answer #NA#.
content ="""In the case of text, this may involve language-specific preprocessing (e.g., tokenization)."""
df = extractor.apply(prompt, content=content, stop=['\n'])
# The first row of the 'Extractions' column should begin with the #NA# marker.
assert df['Extractions'][0].strip().startswith('#NA#')

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()