<a href="https://colab.research.google.com/github/aflores/colab-notebooks/blob/master/kor_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Working with the Kor LLM library

In [None]:
%%writefile requirements.txt
pdfminer.six
openai
langchain
kor


Overwriting requirements.txt


In [None]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import json
# Folder holding this notebook's inputs (the keys file read here and the PDF
# read in a later cell). The trailing slash matters: file names are appended
# to this string directly.
# NOTE(review): assumes Google Drive is already mounted under ./drive - confirm.
data_path = './drive/MyDrive/Colab Data/kor/'

# The API key lives in a JSON file next to the data instead of being
# hardcoded in a cell.
keys_file = data_path + "keys.json"
with open(keys_file) as keys_fh:
    keys = json.load(keys_fh)

# Publish the key as the OPENAI_API_KEY environment variable.
os.environ["OPENAI_API_KEY"] = keys['OPENAI_API_KEY']

In [None]:
# Read the raw text of the agreement PDF with pdfminer's high-level helper.
from pdfminer.high_level import extract_text

text = extract_text(data_path + 'Airline Commission Agreement.pdf')

# Flatten the text onto one line: each newline is replaced by a single space
# (newlines are substituted, not dropped, so adjacent fields stay separated).
processed_text = text.replace("\n", " ")
print(processed_text)

Airline Commission Agreement   Agreement Id  Airline  Agency   CCC123  AA - American Airline  AG001 - Fantastic Voyages    Booking Dates  Flight Dates   01JUL2023  01JUL2023   15DEC2023  31DEC2023   Origin  MIA, FLL  MIA, FLL   Des:na:on  SFO, LAX, SAN  SFO, LAX, SAN   Class  B, E, G  A, C, F   Commission  5%  8%         


In [None]:
# Third-party pieces used from here on: the LangChain chat-model wrapper and
# the Kor helpers for declaring schemas and building an extraction chain.
from langchain.chat_models import ChatOpenAI
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

# GPT-3.5 chat model configured with temperature 0 and max_tokens set to 2000.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, max_tokens=2000)

In [None]:
# kor schemas

# Airline node: a single object (many=False) with two text attributes,
# "code" and "name". The example pairs a sample input string with the
# values expected from it.
airline_schema = Object(
    id="airline",
    description="airline code and name",
    many=False,
    attributes=[Text(id="code"), Text(id="name")],
    examples=[
        (
            "Airline CC - airline name",
            {"code": "CC", "name": "airline name"},
        ),
    ],
)

# Agency node: same shape as the airline node - a single object (many=False)
# with text attributes "code" and "name", plus one worked example.
agency_schema = Object(
    id="agency",
    description="agency code and name",
    many=False,
    attributes=[Text(id="code"), Text(id="name")],
    examples=[
        (
            "Agency.  code - agency name",
            {"code": "code", "name": "agency name"},
        ),
    ],
)

# Commission node: declared with many=True, so it may occur repeatedly.
# Each occurrence has four text attributes: origin, destination, class and
# percent. The example maps a sample input string to those four values; note
# that the "%" sign in the input is not part of the expected "percent" value.
commission_schema = Object(
    id="commission",
    description="commission rules",
    many=True,
    attributes=[
        Text(id="origin"),
        Text(id="destination"),
        Text(id="class"),
        Text(id='percent'),
    ],
    examples=[
        (
            "ooo ddd ccc ppp%",
            {
                "origin": "ooo",
                "destination": "ddd",
                "class": "ccc",
                "percent": "ppp",
            },
        ),
    ],
)


# Agreement-id node: a single object (many=False) wrapping one text
# attribute, "agreement_id", which carries its own description and two
# input/output examples.
id_schema = Object(
    id="id",
    # A real description instead of the leftover placeholder "descr", so the
    # node's description says what it actually holds.
    description="agreement identifier",
    attributes=[
        Text(
            id="agreement_id",
            description="unique identifier for this agreement",
            examples=[
                ("agreement id xxxx", "xxxx"),
                ("agreement abc123", "abc123"),
            ],
        ),
    ],
    many=False,
)

# Top-level agreement schema. It is a single object (many=False) with id
# "header" that nests the four nodes defined in this cell, in this order:
# agreement id, airline, agency and the commission rules.
agreement_schema = Object(
    id="header",
    description="agreement header information",
    many=False,
    attributes=[id_schema, airline_schema, agency_schema, commission_schema],
)


In [None]:
# Build the extraction chain for the full agreement schema, using the "json"
# encoder and no input formatter.
header_chain = create_extraction_chain(
    llm,
    agreement_schema,
    encoder_or_encoder_class="json",
    input_formatter=None,
)

# Run the chain on the flattened PDF text and display the "data" entry of
# the result as the cell output.
header_chain.predict_and_parse(text=processed_text)["data"]


{'header': {'id': {'agreement_id': 'CCC123'},
  'airline': {'code': 'AA', 'name': 'American Airline'},
  'agency': {'code': 'AG001', 'name': 'Fantastic Voyages'},
  'commission': [{'origin': 'MIA',
    'destination': 'SFO',
    'class': 'B',
    'percent': '5'},
   {'origin': 'MIA', 'destination': 'LAX', 'class': 'E', 'percent': '5'},
   {'origin': 'MIA', 'destination': 'SAN', 'class': 'G', 'percent': '5'},
   {'origin': 'FLL', 'destination': 'SFO', 'class': 'A', 'percent': '8'},
   {'origin': 'FLL', 'destination': 'LAX', 'class': 'C', 'percent': '8'},
   {'origin': 'FLL', 'destination': 'SAN', 'class': 'F', 'percent': '8'}]}}