# 1. Extract Data using Kor
-  Kor is a thin wrapper to extract structured data.

### 1.1 Load Library

In [4]:
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

from openai_keys import *

# It's better to do this an environment variable but putting it in plain text for clarity
openai_api_key_bikal_personal = OPENAI_KEY_BIKAL

In [5]:
import json

def printOutput(output):
    print(json.dumps(output,sort_keys=True, indent=3))

def get_cost_estimate(model, no_of_tokens):
    # Per 1k cost
    model_per_1k_cost = {
        'text-babbage-001' : 0.0005,
        'text-ada-001'  : 0.0004,
        'text-curie-001': 0.002,
        'gpt-3.5-turbo' : 0.002, 
        'gpt-3.5-turbo-0301':0.002,
        'text-davinci-003': 0.02,
        'text-davinci-002':0.02,
        'gpt-4'         : 0.06,
        'gpt-4-0314'    : 0.06,
        'gpt-4-32k'     : 0.12,
        'gpt-4-32k-0314': 1.12
    } 
    
    return f'{no_of_tokens  * model_per_1k_cost[model] / (1000)} cents'
        

get_cost_estimate('gpt-3.5-turbo', 500)

'0.001 cents'

### 1.2  Define Extraction Schema  Simple
- Single object : Car
- Singleton  attributes: e.g Car Color, Car engine.
- Inheritance / Composition : A vehicle is composed of multiple parts

##### 1.2.1 Complex Schema : Composition:
- Composition : e.g A car is composed of multiple parts
e.g The blue jeep has rear view mirror, roof, windshield into
```{
   "car": {
      "color": "blue",
      "parts": [
         {
            "part": "rear view mirror"
         },
         {
            "part": "roof"
         },
         {
            "part": "windshield"
         }
      ],
      "type": "jeep"
   }
}
```

Ref : https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/Expert%20Structured%20Output%20(Using%20Kor).ipynb

In [6]:
# The chemical or molecular formulae of the chemical compound.
chemicals = Object(
    id="chemicals",
    description="List of  chemical name or  a chemical's molecular formulae",
    attributes = [Text(id="chemical", description="Chemical's molecular formulae or chemical name") ],
    examples = [("The BaCO<sub>3 </sub>and Sodium hypochlorite",
                 [{"chemical":"BaCO<sub>3 </sub>"}, {"chemical":"Sodium hypochlorite"}],
    )]
)
chemical_measurement_schema = Object(
    id="measurement_and_values",
    description="Measurement  information of Chemicals ",    
    attributes=[
        chemicals,
        Text(
            id="mechanical_properties",
            description="The mechanical property of the chemical being measured.",
        ),
        Text(
            id="measurement_unit",
            description="Unit of measurement for the chemical.",
        ),
        Text(
            id="measured_value_low",
            description="Lower range value for the chemical compound.",
        ),
        Text(
            id="measured_value_high",
            description="Higher range value for the chemical compound.",
        )
    ],
    # Including both examples and description will likely improve performance.
    # The resulting BaCO3 had a crystallite size of between about 20 and 40 nm.
    # The BaCO<sub>3 </sub>and CeO<sub>2 </sub>crystallites formed particles with a size of between about 5 and 50 microns.
    examples=[
        ("The BaCO<sub>3 </sub>and CeO<sub>2 </sub>crystallites formed particles with a size of between about 5 and 50 microns.",
         [{"chemical"               : ["BaCO<sub>3", "CeO<sub>3"],
           "mechanical_properties"  : "crystallite size",
           "measurement_unit"       : "nm",
           "measured_value_low"     : "20",
           "measured_value_high"    : "40"
           }            
          ])
    ],
    # many=False, # If the  text contains multiple chemical names, then multiple objects will be extrtacted.
)
print(chemical_measurement_schema)

id='measurement_and_values' description='Measurement  information of Chemicals ' many=False attributes=[Object(id='chemicals', description="List of  chemical name or  a chemical's molecular formulae", many=False, attributes=[Text(id='chemical', description="Chemical's molecular formulae or chemical name", many=False, examples=())], examples=[('The BaCO<sub>3 </sub>and Sodium hypochlorite', [{'chemical': 'BaCO<sub>3 </sub>'}, {'chemical': 'Sodium hypochlorite'}])]), Text(id='mechanical_properties', description='The mechanical property of the chemical being measured.', many=False, examples=()), Text(id='measurement_unit', description='Unit of measurement for the chemical.', many=False, examples=()), Text(id='measured_value_low', description='Lower range value for the chemical compound.', many=False, examples=()), Text(id='measured_value_high', description='Higher range value for the chemical compound.', many=False, examples=())] examples=[('The BaCO<sub>3 </sub>and CeO<sub>2 </sub>crysta

## Create Langchain model
- 14:13. wait 5 minutes until 14:20

In [7]:
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo", #  lowest cost among gpt - 3.5 models
    # model_name="gpt-4", # State of art but expensive
    temperature=0, # setting 0, ensures 0 creativity i.e  select only the data from the text and no hallucination or 0 fill 
    # max_tokens=4096,
    openai_api_key=openai_api_key_bikal_personal
)
chain = create_extraction_chain(llm, chemical_measurement_schema, encoder_or_encoder_class="json")

In [5]:
prompt_string = chain.prompt.format_prompt(text="..").to_string()
print(f" Your prompt length is  {len(prompt_string)}")
# https://platform.openai.com/docs/models/gpt-4
max_token_supported_by_gpt_3_dot_5 = 4096
print(f" Your available user input length is  {4096 - len(prompt_string)}")
print(f"Youer prompt string is \n {prompt_string}")


 Your prompt length is  1767
 Your available user input length is  2329
Youer prompt string is 
 Your goal is to extract structured information from the user's input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.

```TypeScript

measurement_and_values: { // Measurement  information of Chemicals 
 chemicals: { // A Single chemical name or  a chemical's molecular formulae
  chemical: string // Chemical's molecular formulae or chemical name
 }
 mechanical_properties: string // The mechanical property of the chemical being measured.
 measurement_unit: string // Unit of measurement for the chemical.
 measured_value_low: string // Lower range value for the chemical compound.
 measured_value_high: string // Higher range value for the chemical compound.
}
```


Please output the extracted information in JSON format. Do not output anything except 

In [12]:
# For token counting
from langchain.callbacks import get_openai_callback

def _extract(document_text, chain, get_stats=False):
    with get_openai_callback() as cb:
        output = chain.predict_and_parse(text=(test_text2))
        print(f"Total Tokens: {cb.total_tokens}")
        print(f"Prompt Tokens: {cb.prompt_tokens}")
        print(f"Completion Tokens: {cb.completion_tokens}")
        print(f"Successful Requests: {cb.successful_requests}")
        print(f"Total Cost (USD): {cb.total_cost} cents")
        # printOutput(output["data"])
        print(output)
    return output["data"]

### 1.4 Extraction : Simple Chemical attributes

In [13]:
test_text1 = "The BaCO3 that was produced had a crystallite dimension ranging from approximately 20 to 40 nm."
output = chain.predict_and_parse(text=(test_text1))
printOutput(output["data"])
print(output)
# printOutput(output['data'])
# Notice how there isn't "spot" in the results list because it's the name of a dog, not a person.

--------- USER  None
prompt---------------  [ExtractionPromptValue(string='Your goal is to extract structured information from the user\'s input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.\n\n```TypeScript\n\nmeasurement_and_values: { // Measurement  information of Chemicals \n chemicals: { // A Single chemical name or  a chemical\'s molecular formulae\n  chemical: string // Chemical\'s molecular formulae or chemical name\n }\n mechanical_properties: string // The mechanical property of the chemical being measured.\n measurement_unit: string // Unit of measurement for the chemical.\n measured_value_low: string // Lower range value for the chemical compound.\n measured_value_high: string // Higher range value for the chemical compound.\n}\n```\n\n\nPlease output the extracted information in JSON format. Do not output anything except for

In [14]:
test_text2 = "The O2 and Carbon Dioxide that was produced had a crystallite dimension ranging from approximately 20 to 40 nm."
_extract(test_text2, chain, get_stats=True)


--------- USER  None
prompt---------------  [ExtractionPromptValue(string='Your goal is to extract structured information from the user\'s input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.\n\n```TypeScript\n\nmeasurement_and_values: { // Measurement  information of Chemicals \n chemicals: { // A Single chemical name or  a chemical\'s molecular formulae\n  chemical: string // Chemical\'s molecular formulae or chemical name\n }\n mechanical_properties: string // The mechanical property of the chemical being measured.\n measurement_unit: string // Unit of measurement for the chemical.\n measured_value_low: string // Lower range value for the chemical compound.\n measured_value_high: string // Higher range value for the chemical compound.\n}\n```\n\n\nPlease output the extracted information in JSON format. Do not output anything except for

{'measurement_and_values': [{'chemical': ['O2', 'Carbon Dioxide'],
   'mechanical_properties': 'crystallite dimension',
   'measurement_unit': 'nm',
   'measured_value_low': '20',
   'measured_value_high': '40'}]}

In [17]:
get_cost_estimate('gpt-3.5-turbo', 513)

'0.001026 cents'