In [92]:
# Kor!
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

# LangChain Models
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

# Standard Helpers
import pandas as pd
import requests
import time
import json
from datetime import datetime

# Text Helpers
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# For token counting
from langchain.callbacks import get_openai_callback

def printOutput(output):
    print(json.dumps(output, indent=3))

In [9]:
# It's better to do this an environment variable but putting it in plain text for clarity
openai_api_key = '------'

In [10]:
llm = ChatOpenAI(
#     model_name="gpt-3.5-turbo", # Cheaper but less reliable
    model_name="gpt-3.5-turbo",
    temperature=0,
    max_tokens=2000,
    openai_api_key=openai_api_key
)

### Working With Lists for training Cvent Test Input

You can also extract lists as well.

Note: Check out how I have a nested object. The 'parts' object is in the 'cars_schema'

In [16]:
hardware_parts = Object(
    id="hardware_parts",
    description="Multiple hardwares for onsite event requirements",
    attributes=[
        Text(id="hardwares", description="The name of the part")
    ],
    examples=[
        (
            "Hardwares with Quantities as agreed upon are: iPad 9.7inch - 7, Stylish Aluminum Table Stand - 7",
            [
                [{"item": "iPad 9.7inch", "quantity": 7}, 
                 {"item": "Stylish Aluminum Table Stand", "quantity": 7}]
            ],
        )
    ]
)

event_schema = Object(
    id="event",
    description="Onsite requirements for an event",
    examples=[
        (
            "Event Name is Cvent 2023 Test event,Date/time Hardline will be dropped - 06/10/2023 9:00 AM EST, Date/time Event ended - 06/20/2023 4:00 PM EST, Cvent Staff is Abhishek Rai, Cvent Staff Number is +123456344, Client Staff is Pranavesh, Client Staff Number is +12344464. Hardwares with Quantities as agreed upon are:iPad 9.7inch - 7, Stylish Aluminum Table Stand - 7",
            [
                {"name": "Cvent 2023 Test event", 
                 "date/time": {"hardline dropped": "06/10/2023 9:00 AM EST", "event ended": "06/20/2023 4:00 PM EST"}, 
                 "staff": {"Cvent": 
                           {"name": "Abhishek Rai", "number": "+123456344"}, 
                           "Client": {"name": "Pranavesh", "number": "+12344464"}}, 
                 "hardware_parts": [{"item": "iPad 9.7inch", "quantity": 7}, 
                               {"item": "Stylish Aluminum Table Stand", "quantity": 7}]}
            ],
        )
    ],
    attributes=[
        Text(
            id="name",
            description="name of the client's event"
        ),
        Text(
            id="drop_off_date",
            description="Hardware dropp off date"
        ),
        Text(
            id="drop_off_time",
            description="Hardware dropp off time"
        ),
        Text(
            id="pick_up_date",
            description="Hardware pickup date"
        ),
        Text(
            id="pick_up_time",
            description="Hardware pickup time"
        ),
        Text(
            id="cvent_onsite_contact_name",
            description="Cvents technitian representative"
        ),
        Text(
            id="cvent_onsite_contact_number",
            description="Cvents representative contact number"
        ),
        Text(
            id="cvent_client_contact_name",
            description="Clients Name"
        ),
        Text(
            id="cvent_client_contact_number",
            description="Cvents client contact number"
        ),
        
        hardware_parts
    ]
)

In [23]:
# To do nested objects you need to specify encoder_or_encoder_class="json"
text = "Event Name is Cvent 2023 Test event,Date/time Hardline will be dropped - 06/10/2023 9:00 AM EST, Date/time Event ended - 06/20/2023 4:00 PM EST, Cvent Staff is Abhishek Rai, Cvent Staff Number is +123456344, Client Staff is Pranavesh, Client Staff Number is +12344464. Hardwares with Quantities as agreed upon are:iPad 9.7inch - 7, Stylish Aluminum Table Stand - 7"
# Changed the encoder to json
chain = create_extraction_chain(llm, cars_schema, encoder_or_encoder_class="json")
output = chain.predict_and_parse(text=text)['data']

printOutput(output)

{}


In [21]:
output


{'data': {},
 'raw': '<json>{"event": {"name": "Cvent 2023 Test event", "date/time": {"hardline dropped": "06/10/2023 9:00 AM EST", "event ended": "06/20/2023 4:00 PM EST"}, "staff": {"Cvent": {"name": "Abhishek Rai", "number": "+123456344"}, "Client": {"name": "Pranavesh", "number": "+12344464"}}, "hardwares": [{"item": "iPad 9.7inch", "quantity": 7}, {"item": "Stylish Aluminum Table Stand", "quantity": 7}]}}</json>',
 'errors': [kor.exceptions.ParseError('The LLM has returned structured data which does not match the expected schema. Providing additional examples may help improve the parse.')],
 'validated_data': {}}

In [24]:

{"name": "Cvent 2023 Test event", 
 "date/time": {"hardline dropped": "06/10/2023 9:00 AM EST", "event ended": "06/20/2023 4:00 PM EST"}, 
 "staff": {"Cvent": {"name": "Abhishek Rai", "number": "+123456344"}, "Client": {"name": "Pranavesh", "number": "+12344464"}}, 
 "hardwares": [{"item": "iPad 9.7inch", "quantity": 7}, {"item": "Stylish Aluminum Table Stand", "quantity": 7}]}

{'name': 'Cvent 2023 Test event',
 'date/time': {'hardline dropped': '06/10/2023 9:00 AM EST',
  'event ended': '06/20/2023 4:00 PM EST'},
 'staff': {'Cvent': {'name': 'Abhishek Rai', 'number': '+123456344'},
  'Client': {'name': 'Pranavesh', 'number': '+12344464'}},
 'hardwares': [{'item': 'iPad 9.7inch', 'quantity': 7},
  {'item': 'Stylish Aluminum Table Stand', 'quantity': 7}]}

In [25]:
hardware_items = Object(
    id="hardware_items",
    description="Information about hardwares",
    
    # Notice I put multiple fields to pull out different attributes
    attributes=[
        Text(
            id="item",
            description="Name of hardware"
        ),
        Number(
            id="quantity",
            description="quantity of hardware"
        )
    ],
    examples=[
        (
            "Hardwares with Quantities as agreed upon are: iPad 9.7inch - 7, Stylish Aluminum Table Stand - 7",
            [
                {"item": "iPad 9.7inch", "quantity" : 7},
                {"item": "Stylish Aluminum Table Stand",  "quantity" : 7},
            ],
        )
    ]
)


In [29]:
# To do nested objects you need to specify encoder_or_encoder_class="json"
text = "Hardwares with Quantities as agreed upon are: Macbook 14inch - 7, Stylish BrassN Table Stand - 7"
# Changed the encoder to json
chain = create_extraction_chain(llm, hardware_items, encoder_or_encoder_class="json")
output = chain.predict_and_parse(text=text)['data']

printOutput(output)

{
   "hardware_items": [
      {
         "item": "Macbook 14inch",
         "quantity": 7
      },
      {
         "item": "Stylish BrassN Table Stand",
         "quantity": 7
      }
   ]
}


In [30]:
staff_details = Object(
    id="staff_details",
    description="Information about staff client & cvent",
    
    # Notice I put multiple fields to pull out different attributes
    attributes=[
        Text(
            id="staff_type",
            description="Staff type"
        ),
        Text(
            id="name",
            description="Name of staff"
        ),
        Text(
            id="number",
            description="contact number of staff"
        )
    ],
    examples=[
        (
            "Cvent Staff is Abhishek Rai, Cvent Staff Number is +12344344, Client Staff is Pranavesh, Client Staff Number is +12344464",
            [
                {'staff_type':'cvent','name': 'Abhishek Rai', 'number': '+12344344'},
                {'staff_type':'client','name': 'Pranavesh', 'number': '+12344464'},
            ],
        )
    ]
)


In [31]:
# To do nested objects you need to specify encoder_or_encoder_class="json"
text = "Cvent Staff is Abhishek Rai, Cvent Staff Number is +12344344, Client Staff is Pranavesh, Client Staff Number is +12344464"
# Changed the encoder to json
chain = create_extraction_chain(llm, staff_details, encoder_or_encoder_class="json")
output = chain.predict_and_parse(text=text)['data']

printOutput(output)

{
   "staff_details": [
      {
         "name": "Abhishek Rai",
         "number": "+12344344",
         "staff_type": "cvent"
      },
      {
         "name": "Pranavesh",
         "number": "+12344464",
         "staff_type": "client"
      }
   ]
}


In [32]:
date_time_objects = Object(
    id="date_time_objects",
    description="Information about pickup/drop date & times",
    
    # Notice I put multiple fields to pull out different attributes
    attributes=[
        Text(
            id="pick_up_type",
            description="Drop/pickup type"
        ),
        Text(
            id="date",
            description="Drop/pickup date"
        ),
        Text(
            id="time",
            description="Drop/pickup time"
        )
    ],
    examples=[
        (
            "Date/time Hardline will be dropped - 06/10/2023 9:00 AM EST, Date/time Event ended - 06/20/2023 4:00 PM EST",
            [
                {'pick_up_type':'Dropoff','date': '06/10/2023', 'time': '9:00 AM EST'},
                {'pick_up_type':'Pickup','date': '06/20/2023', 'time': '4:00 PM EST'},
            ],
        )
    ]
)


In [33]:
# To do nested objects you need to specify encoder_or_encoder_class="json"
text = "Date/time Hardline will be dropped - 06/10/2023 9:00 AM EST, Date/time Event ended - 06/20/2023 4:00 PM EST"
# Changed the encoder to json
chain = create_extraction_chain(llm, date_time_objects, encoder_or_encoder_class="json")
output = chain.predict_and_parse(text=text)['data']

printOutput(output)

{
   "date_time_objects": [
      {
         "date": "06/10/2023",
         "pick_up_type": "Dropoff",
         "time": "9:00 AM EST"
      },
      {
         "date": "06/20/2023",
         "pick_up_type": "Pickup",
         "time": "4:00 PM EST"
      }
   ]
}


In [93]:
## hardware_items
hardware_items = Object(
    id="hardware_items",
    description="Information about hardwares",
    
    # Notice I put multiple fields to pull out different attributes
    attributes=[
        Text(
            id="item",
            description="Name of hardware"
        ),
        Number(
            id="quantity",
            description="quantity of hardware"
        )
    ],
    examples=[
        (
            "Hardwares with Quantities as agreed upon are: iPad 9.7inch - 7, Stylish Aluminum Table Stand - 7",
            [
                {"item": "iPad 9.7inch", "quantity" : 7},
                {"item": "Stylish Aluminum Table Stand",  "quantity" : 7},
            ],
        )
    ]
)


## staff_details
staff_details = Object(
    id="staff_details",
    description="Information about staff client & cvent",
    
    # Notice I put multiple fields to pull out different attributes
    attributes=[
        Text(
            id="staff_type",
            description="Staff type"
        ),
        Text(
            id="name",
            description="Name of staff"
        ),
        Text(
            id="number",
            description="contact number of staff"
        )
    ],
    examples=[
        (
            "Cvent Staff is Abhishek Rai, Cvent Staff Number is +12344344, Client Staff is Pranavesh, Client Staff Number is +12344464",
            [
                {'staff_type':'cvent','name': 'Abhishek Rai', 'number': '+12344344'},
                {'staff_type':'client','name': 'Pranavesh', 'number': '+12344464'},
            ],
        )
    ]
)


## Date time objects
date_time_objects = Object(
    id="date_time_objects",
    description="Information about pickup/drop date & times",
    
    # Notice I put multiple fields to pull out different attributes
    attributes=[
        Text(
            id="pick_up_type",
            description="Drop/pickup type"
        ),
        Text(
            id="date",
            description="Drop/pickup date"
        ),
        Text(
            id="time",
            description="Drop/pickup time"
        )
    ],
    examples=[
        (
            "Date/time Hardline will be dropped - 06/10/2023 9:00 AM EST, Date/time Event ended - 06/20/2023 4:00 PM EST",
            [
                {'pick_up_type':'Dropoff','date': '06/10/2023', 'time': '9:00 AM EST'},
                {'pick_up_type':'Pickup','date': '06/20/2023', 'time': '4:00 PM EST'},
            ],
        )
    ]
)



##final events schema
final_events_schema = Object(
    id="final_events_schema",
    description="information about event, date/time, staff type and hardware details",
    examples=[
        (
            "Event Name is Cvent 2023 Test event,Date/time Hardline will be dropped - 06/10/2023 9:00 AM EST, Date/time Event ended - 06/20/2023 4:00 PM EST, Cvent Staff is Abhishek Rai, Cvent Staff Number is +123456344, Client Staff is Pranavesh, Client Staff Number is +12344464. Hardwares with Quantities as agreed upon are:iPad 9.7inch - 7, Stylish Aluminum Table Stand - 7",
            [
                {"event_name": "Cvent 2023 Test event", 
                 "date_time_objects": [
                     {"date": "06/10/2023","pick_up_type": "Dropoff","time": "9:00 AM EST"},
                     {"date": "06/20/2023","pick_up_type": "Pickup","time": "4:00 PM EST"}],
                 "staff_details": [{"name": "Abhishek Rai","number": "+12344344","staff_type": "cvent"},
                                   {"name": "Pranavesh","number": "+12344464","staff_type": "client"}],
                 "hardware_items": [{"item": "Macbook 14inch","quantity": 7},
                                    {"item": "Stylish BrassN Table Stand","quantity": 7}]
                }
            ],
        )
    ],
    attributes=[
        Text(
            id="event_name",
            description="Name of the onsite event"
        ),
        date_time_objects,
        staff_details,
        hardware_items
    ]
)

In [94]:
# To do nested objects you need to specify encoder_or_encoder_class="json"
text = "Event Name is Cvent CCW 2023v2 event,Date/time Hardline will be dropped - 01/01/2023 12:02 PM EST, Date/time Event ended - 01/07/2023 8:00 PM EST, Cvent Staff is Dave Chatterjee, Cvent Staff Number is +123151515, Client Staff is Pranavesh Ramachandran, Client Staff Number is +121111164464. Hardwares with Quantities as agreed upon are: iPad 9.7inch - 10, Stylish Aluminum Table Stand - 17, Network Cable (15 - 20 Feet) - 20, Registration Wireless Router - 22, Network Cable (15 - 20 Feet) - 32, Network Cable (6 Feet) - 38, Network Cable (15 - 20 Feet) - 20, Network Cable (50 Feet) - 28"

# Changed the encoder to json
chain = create_extraction_chain(llm, final_events_schema, encoder_or_encoder_class="json")
output = chain.predict_and_parse(text=text)['data']

printOutput(output)

{
   "final_events_schema": {
      "event_name": "Cvent CCW 2023v2 event",
      "date_time_objects": [
         {
            "pick_up_type": "Dropoff",
            "date": "01/01/2023",
            "time": "12:02 PM EST"
         },
         {
            "pick_up_type": "Pickup",
            "date": "01/07/2023",
            "time": "8:00 PM EST"
         }
      ],
      "staff_details": [
         {
            "staff_type": "cvent",
            "name": "Dave Chatterjee",
            "number": "+123151515"
         },
         {
            "staff_type": "client",
            "name": "Pranavesh Ramachandran",
            "number": "+121111164464"
         }
      ],
      "hardware_items": [
         {
            "item": "iPad 9.7inch",
            "quantity": 10
         },
         {
            "item": "Stylish Aluminum Table Stand",
            "quantity": 17
         },
         {
            "item": "Network Cable (15 - 20 Feet)",
            "quantity": 20
         },
   

In [38]:
json_output = chain.predict_and_parse(text=text)


In [89]:
output['data']['final_events_schema'].keys()

dict_keys(['event_name', 'date_time_objects', 'staff_details', 'hardware_items'])

In [90]:
output['data']['final_events_schema']['event_name']

'Cvent CCW 2023v2 event'

In [91]:
output['data']['final_events_schema']['date_time_objects']

[{'pick_up_type': 'Dropoff', 'date': '01/01/2023', 'time': '12:02 PM EST'},
 {'pick_up_type': 'Pickup', 'date': '01/07/2023', 'time': '8:00 PM EST'}]