<a href="https://colab.research.google.com/github/aknip/Langchain-etc./blob/main/LLM_JSON_Extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai tiktoken litellm langchain kor

In [3]:
import json
import os
from getpass import getpass
import psutil
import requests
import textwrap
# Detect an interactive notebook by looking for "jupyter-notebook" in the parent
# process command line. psutil returns None when there is no parent process;
# treat that case as "not a notebook" instead of failing on `.cmdline()`.
_parent_process = psutil.Process().parent()
IN_NOTEBOOK = _parent_process is not None and any(
  "jupyter-notebook" in arg for arg in _parent_process.cmdline()
)
if IN_NOTEBOOK:
  # Prompt without echoing the secret, validate that it is JSON, and cache the
  # normalised JSON string in the CREDS environment variable.
  os.environ['CREDS'] = json.dumps(json.loads(getpass("Secrets (JSON string): ")))

# Always load the credentials from the environment so CREDS is defined both in a
# notebook (value cached just above) and in non-interactive runs (value provided
# by the caller). Previously CREDS stayed undefined outside a notebook and the
# next cell failed with a NameError.
_creds_json = os.getenv('CREDS')
if _creds_json is None:
  raise RuntimeError(
    "No credentials available: set the CREDS environment variable to a JSON string "
    "or run this cell inside a Jupyter notebook to be prompted for it."
  )
CREDS = json.loads(_creds_json)

Secrets (JSON string): ··········


In [4]:
from litellm import completion
import openai
# Export the provider API keys from the loaded secrets as environment variables.
openai_credential = CREDS['OpenAI']['v1']['credential']  # my key
os.environ["OPENAI_API_KEY"] = openai_credential
together_credential = CREDS['together-ai']['key']['credential']
os.environ["TOGETHERAI_API_KEY"] = together_credential

# 1. Extraction with KOR and Langchain

- KOR takes a schema (object) as target JSON
- Kor comes with built-in support for creating a schema “object” with fields of different types. Currently, Kor’s native support is limited to Object, Text, Number, Bool, and Selection input types.
- What sets Kor apart when creating schemas for LLMs is its ability to define a field’s purpose and context with textual descriptions and examples

Code:

1. Take input text and schema
2. Convert to prompt including few-shot examples. Output is Excel-CSV!
3. Run prompt
4. Take output of prompt and convert CSV to JSON

In [18]:
# Source: https://levelup.gitconnected.com/overcoming-challenges-of-llm-based-data-extraction-with-kor-1c0c6d4acd4a

from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Selection
from langchain.chat_models import ChatOpenAI
from langchain.globals import set_debug
from langchain.globals import set_verbose

# Sample passage used as the extraction input. Adjacent string literals inside
# parentheses are concatenated by the parser into one single-line string.
text = (
    "Around the world, there are numerous captivating tourist destinations that offer unique attractions for "
    "visitors. One such place is Paris, France, known as the 'City of Love.' The iconic Eiffel Tower stands tall, "
    "providing panoramic views of the city, while the Louvre Museum houses world-renowned art masterpieces like "
    "the Mona Lisa. Moving to the United States, New York City beckons with its dazzling Times Square, the Statue "
    "of Liberty, and the vibrant Broadway shows. Meanwhile, in Asia, Kyoto, Japan, enchants with its ancient "
    "temples, tranquil gardens, and traditional geisha culture. The Great Wall of China, a monumental feat of "
    "engineering, winds its way across the vast Chinese landscape, offering breathtaking views and a glimpse into "
    "the country's rich history. In South America, Rio de Janeiro, Brazil, captivates with its vibrant Carnival "
    "celebrations, Copacabana Beach, and the iconic Christ the Redeemer statue atop Corcovado Mountain. Lastly, "
    "Australia's Great Barrier Reef lures adventurers with its stunning coral reefs and diverse marine life, "
    "while the Sydney Opera House showcases architectural brilliance. These destinations, among many others, "
    "embody the beauty, culture, and history that make our world a fascinating place to explore."
)

# Field-level nodes: each one carries an id, a description and a single
# (sentence, expected value) example.
destination_field = Text(
    id="destination",
    description="The name of the tourist destination",
    examples=[("Thailand's Phuket island is a favorite among tourists", "Phuket")],
)
country_field = Text(
    id="country",
    description="The country the tourist destination is located in",
    examples=[("Thailand's Phuket island is a favorite among tourists", "Thailand")],
)
attractions_field = Text(
    id="attractions",
    description="A comma separated list of attractions in the destination",
    examples=[
        ("Phuket is popular for beautiful beaches and vibrant night life", "beautiful beaches, vibrant night life")
    ],
)

# Object-level examples: a sentence paired with the dict of fields expected from it.
destination_examples = [
    (
        "Ubud is famous for its unique Balinese temples and beautiful rice terraces.",
        {"destination": "Ubud", "attractions": "rice terraces, Balinese temples"},
    ),
    (
        "Tourists flock to Galle, Sri Lanka to enjoy a relaxing beach vacation.",
        {"destination": "Galle", "country": "Sri Lanka", "attractions": "beaches"},
    ),
]

# Top-level extraction target; many=True is passed so several destinations can be
# returned for one input text.
schema = Object(
    id="destinations",
    description="Tourist destination information",
    examples=destination_examples,
    attributes=[destination_field, country_field, attractions_field],
    many=True,
)

# Chat model settings: temperature 0 and a 2000-token cap on the completion.
llm_options = {
    "model_name": "gpt-3.5-turbo",
    "temperature": 0,
    "max_tokens": 2000,
}
llm = ChatOpenAI(**llm_options)

# Turn on LangChain's global debug output.
set_debug(True)

# Build the extraction chain for the schema and run it on the sample passage.
chain = create_extraction_chain(llm, schema)
response = chain.run(text)

[32;1m[1;3m[chain/start][0m [1m[1:chain:LLMChain] Entering Chain run with input:
[0m{
  "text": "Around the world, there are numerous captivating tourist destinations that offer unique attractions for visitors. One such place is Paris, France, known as the 'City of Love.' The iconic Eiffel Tower stands tall, providing panoramic views of the city, while the Louvre Museum houses world-renowned art masterpieces like the Mona Lisa. Moving to the United States, New York City beckons with its dazzling Times Square, the Statue of Liberty, and the vibrant Broadway shows. Meanwhile, in Asia, Kyoto, Japan, enchants with its ancient temples, tranquil gardens, and traditional geisha culture. The Great Wall of China, a monumental feat of engineering, winds its way across the vast Chinese landscape, offering breathtaking views and a glimpse into the country's rich history. In South America, Rio de Janeiro, Brazil, captivates with its vibrant Carnival celebrations, Copacabana Beach, and the icon

In [19]:
# Pretty-print the extraction result with a 4-space indent.
formatted_response = json.dumps(response, indent=4)
print(formatted_response)

{
    "data": {
        "destinations": [
            {
                "destination": "Paris",
                "country": "France",
                "attractions": "Eiffel Tower, Louvre Museum"
            },
            {
                "destination": "New York City",
                "country": "United States",
                "attractions": "Times Square, Statue of Liberty, Broadway shows"
            },
            {
                "destination": "Kyoto",
                "country": "Japan",
                "attractions": "ancient temples, tranquil gardens, traditional geisha culture"
            },
            {
                "destination": "Great Wall of China",
                "country": "China",
                "attractions": "monumental feat of engineering, breathtaking views, rich history"
            },
            {
                "destination": "Rio de Janeiro",
                "country": "Brazil",
                "attractions": "Carnival celebrations, Copacabana Beach,

The response object includes
- "raw": The LLM-response as CSV, delimited by |
- "data": The response as JSON (converted from the CSV)


## 1.1. Simulate KOR

The generated prompt (see above) looks like this:

````
System: Your goal is to extract structured information from the user's input
that matches the form described below. When extracting information please make
sure it matches the type information exactly. Do not add any attributes that do
not appear in the schema shown below.

```TypeScript

destinations: Array<{ // Tourist destination information
 destination: string // The name of the tourist destination
 country: string // The country the tourist destination is located in
 attractions: string // A comma separated list of attractions in the destination
}>
```

Please output the extracted information in CSV format in Excel dialect. Please
use a | as the delimiter.
Do NOT add any clarifying information. Output MUST follow the schema above. Do
NOT add any additional columns that do not appear in the schema.


Human: Ubud is famous for its unique Balinese temples and beautiful rice
terraces.
AI: destination|country|attractions
Ubud||rice terraces, Balinese temples

Human: Tourists flock to Galle, Sri Lanka to enjoy a relaxing beach vacation.
AI: destination|country|attractions
Galle|Sri Lanka|beaches

Human: Thailand's Phuket island is a favorite among tourists
AI: destination|country|attractions
Phuket||

Human: Thailand's Phuket island is a favorite among tourists
AI: destination|country|attractions
|Thailand|

Human: Phuket is popular for beautiful beaches and vibrant night life
AI: destination|country|attractions
||beautiful beaches, vibrant night life

Human: Around the world, there are numerous captivating tourist destinations
that offer unique attractions for visitors. One such place is Paris, France,
known as the 'City of Love.' The iconic Eiffel Tower stands tall, providing
panoramic views of the city, while the Louvre Museum houses world-renowned art
masterpieces like the Mona Lisa. Moving to the United States, New York City
beckons with its dazzling Times Square, the Statue of Liberty, and the vibrant
Broadway shows. Meanwhile, in Asia, Kyoto, Japan, enchants with its ancient
temples, tranquil gardens, and traditional geisha culture. The Great Wall of
China, a monumental feat of engineering, winds its way across the vast Chinese
landscape, offering breathtaking views and a glimpse into the country's rich
history. In South America, Rio de Janeiro, Brazil, captivates with its vibrant
Carnival celebrations, Copacabana Beach, and the iconic Christ the Redeemer
statue atop Corcovado Mountain. Lastly, Australia's Great Barrier Reef lures
adventurers with its stunning coral reefs and diverse marine life, while the
Sydney Opera House showcases architectural brilliance. These destinations, among
many others, embody the beauty, culture, and history that make our world a
fascinating place to explore.
````

Let's go through it step by step:

**First prompt part (static)**

Standard beginning of prompt:
````
System: Your goal is to extract structured information from the user's input
that matches the form described below. When extracting information please make
sure it matches the type information exactly. Do not add any attributes that do
not appear in the schema shown below.
````

**Second prompt part (dynamic)**

Based on the schema definition a TypeScript object is added to the prompt:
````
```TypeScript

destinations: Array<{ // Tourist destination information
 destination: string // The name of the tourist destination
 country: string // The country the tourist destination is located in
 attractions: string // A comma separated list of attractions in the destination
}>
```
````

This is the corresponding schema definition in the code. The examples are added later (see part four):
````
schema = Object(
    id="destinations",
    description="Tourist destination information",
    attributes=[
        Text(
            id="destination",
            description="The name of the tourist destination",
            examples=[
                ("Thailand's Phuket island is a favorite among tourists", "Phuket")
            ]
        ),
        Text(
            id="country",
            description="The country the tourist destination is located in",
            examples=[
                ("Thailand's Phuket island is a favorite among tourists", "Thailand")
            ]
        ),
        Text(
            id="attractions",
            description="A comma separated list of attractions in the destination",
            examples=[
                ("Phuket is popular for beautiful beaches and vibrant night life", "beautiful beaches, vibrant night life")
            ]
        )
    ]
)
````

**Third prompt part (static)**

Instructions for formatting the output are added to the prompt:
````
Please output the extracted information in CSV format in Excel dialect. Please
use a | as the delimiter.
Do NOT add any clarifying information. Output MUST follow the schema above. Do
NOT add any additional columns that do not appear in the schema.
````

**Fourth prompt part (dynamic)**

Few-shot examples in "Human:" - "AI:" chat pattern:
````
Human: Ubud is famous for its unique Balinese temples and beautiful rice
terraces.
AI: destination|country|attractions
Ubud||rice terraces, Balinese temples

Human: Tourists flock to Galle, Sri Lanka to enjoy a relaxing beach vacation.
AI: destination|country|attractions
Galle|Sri Lanka|beaches

Human: Thailand's Phuket island is a favorite among tourists
AI: destination|country|attractions
Phuket||

Human: Thailand's Phuket island is a favorite among tourists
AI: destination|country|attractions
|Thailand|

Human: Phuket is popular for beautiful beaches and vibrant night life
AI: destination|country|attractions
||beautiful beaches, vibrant night life
````
The examples are compiled from the object and attribute examples of the code:
````
examples=[
  (
      "Ubud is famous for its unique Balinese temples and beautiful rice terraces.",
      {"destination": "Ubud", "attractions": "rice terraces, Balinese temples"}
  ),
  (
      "Tourists flock to Galle, Sri Lanka to enjoy a relaxing beach vacation.",
      {"destination": "Galle", "country": "Sri Lanka", "attractions": "beaches"}
  )
],
...
attributes=[
  Text(
      id="destination",
      description="The name of the tourist destination",
      examples=[
          ("Thailand's Phuket island is a favorite among tourists", "Phuket")
      ]
  )
...
````
**Fifth prompt part (dynamic)**

The input text
````
Human: Around the world, there are numerous captivating tourist destinations
that offer unique attractions for visitors. One such place is Paris, France,
known as the 'City of Love.' The iconic Eiffel Tower stands tall, providing
panoramic views of the city, while the Louvre Museum houses world-renowned art
masterpieces like the Mona Lisa. Moving to the United States, New York City
beckons with its dazzling Times Square, the Statue of Liberty, and the vibrant
Broadway shows. Meanwhile, in Asia, Kyoto, Japan, enchants with its ancient
temples, tranquil gardens, and traditional geisha culture. The Great Wall of
China, a monumental feat of engineering, winds its way across the vast Chinese
landscape, offering breathtaking views and a glimpse into the country's rich
history. In South America, Rio de Janeiro, Brazil, captivates with its vibrant
Carnival celebrations, Copacabana Beach, and the iconic Christ the Redeemer
statue atop Corcovado Mountain. Lastly, Australia's Great Barrier Reef lures
adventurers with its stunning coral reefs and diverse marine life, while the
Sydney Opera House showcases architectural brilliance. These destinations, among
many others, embody the beauty, culture, and history that make our world a
fascinating place to explore.

````





Let's take the prompt generated by KOR and feed it directly to an LLM:

In [21]:
# The prompt generated by KOR, reproduced verbatim. It is written as adjacent
# string literals (joined by the parser) so that no literal spans a physical line
# break, which a double-quoted Python string does not allow.
kor_prompt_text = (
    # Part 1 (static): task instructions
    "System: Your goal is to extract structured information from the user's input that matches the form described below. "
    "When extracting information please make sure it matches the type information exactly. "
    "Do not add any attributes that do not appear in the schema shown below.\n\n"
    # Part 2 (dynamic): schema rendered as a TypeScript type
    "```TypeScript\n\n"
    "destinations: Array<{ // Tourist destination information\n"
    " destination: string // The name of the tourist destination\n"
    " country: string // The country the tourist destination is located in\n"
    " attractions: string // A comma separated list of attractions in the destination\n"
    "}>\n"
    "```\n\n\n"
    # Part 3 (static): output-format instructions
    "Please output the extracted information in CSV format in Excel dialect. Please use a | as the delimiter. \n"
    " Do NOT add any clarifying information. Output MUST follow the schema above. "
    "Do NOT add any additional columns that do not appear in the schema.\n\n\n"
    # Part 4 (dynamic): few-shot examples in Human/AI form
    "Human: Ubud is famous for its unique Balinese temples and beautiful rice terraces.\n"
    "AI: destination|country|attractions\n"
    "Ubud||rice terraces, Balinese temples\n\n"
    "Human: Tourists flock to Galle, Sri Lanka to enjoy a relaxing beach vacation.\n"
    "AI: destination|country|attractions\n"
    "Galle|Sri Lanka|beaches\n\n"
    "Human: Thailand's Phuket island is a favorite among tourists\n"
    "AI: destination|country|attractions\n"
    "Phuket||\n\n"
    "Human: Thailand's Phuket island is a favorite among tourists\n"
    "AI: destination|country|attractions\n"
    "|Thailand|\n\n"
    "Human: Phuket is popular for beautiful beaches and vibrant night life\n"
    "AI: destination|country|attractions\n"
    "||beautiful beaches, vibrant night life\n\n"
    # Part 5 (dynamic): the input text
    "Human: Around the world, there are numerous captivating tourist destinations that offer unique attractions for "
    "visitors. One such place is Paris, France, known as the 'City of Love.' The iconic Eiffel Tower stands tall, "
    "providing panoramic views of the city, while the Louvre Museum houses world-renowned art masterpieces like "
    "the Mona Lisa. Moving to the United States, New York City beckons with its dazzling Times Square, the Statue "
    "of Liberty, and the vibrant Broadway shows. Meanwhile, in Asia, Kyoto, Japan, enchants with its ancient "
    "temples, tranquil gardens, and traditional geisha culture. The Great Wall of China, a monumental feat of "
    "engineering, winds its way across the vast Chinese landscape, offering breathtaking views and a glimpse into "
    "the country's rich history. In South America, Rio de Janeiro, Brazil, captivates with its vibrant Carnival "
    "celebrations, Copacabana Beach, and the iconic Christ the Redeemer statue atop Corcovado Mountain. Lastly, "
    "Australia's Great Barrier Reef lures adventurers with its stunning coral reefs and diverse marine life, "
    "while the Sydney Opera House showcases architectural brilliance. These destinations, among many others, "
    "embody the beauty, culture, and history that make our world a fascinating place to explore."
)
# Send the complete KOR prompt as one user message and print only the text of
# the first returned choice.
# Debug helpers, kept disabled:
#   for x in kor_prompt_text.split('\n'):
#     print(textwrap.fill(x, 80))
#   print(response)
user_message = {"content": kor_prompt_text, "role": "user"}
response = completion(model="gpt-3.5-turbo", messages=[user_message])
first_choice = response.choices[0]
print(first_choice.message.content)

AI: destination|country|attractions
Paris|France|Eiffel Tower, Louvre Museum
New York City|United States|Times Square, Statue of Liberty, Broadway shows
Kyoto|Japan|ancient temples, tranquil gardens, traditional geisha culture
Great Wall of China|China|
Rio de Janeiro|Brazil|vibrant Carnival celebrations, Copacabana Beach, Christ the Redeemer statue
Great Barrier Reef|Australia|stunning coral reefs, diverse marine life
Sydney Opera House|Australia|


# Function Calling

In [None]:
# see https://litellm.vercel.app/docs/completion/function_call

# via Huggingface?
# https://litellm.vercel.app/docs/providers/huggingface
# https://huggingface.co/Trelis/Mixtral-8x7B-Instruct-v0.1-function-calling-v3
# https://huggingface.co/Trelis/Mistral-7B-Instruct-v0.1-function-calling-v2

# via Anyscale?
# https://docs.litellm.ai/docs/providers/anyscale
# https://www.anyscale.com/blog/anyscale-endpoints-json-mode-and-function-calling-features

In [5]:
import os, litellm
from litellm import completion

# IMPORTANT - Set this to TRUE to add the function to the prompt for Non OpenAI LLMs
# NOTE(review): the completion call in this cell targets "gpt-3.5-turbo-1106", so
# this flag presumably only matters once the model is swapped for a non-OpenAI
# one — confirm against the litellm documentation.
litellm.add_function_to_prompt = True

# Local stand-in for the tool that is described to the LLM. The LLM does not need
# the real function; it could be called after the LLM call (not done in this code).
def get_current_weather(location):
  """Return a canned weather string for "Boston, MA" and None for anything else."""
  if location != "Boston, MA":
    return None
  return "The weather is 12F"

# JSON-Schema descriptions of the two arguments of get_current_weather.
location_param = {
  "type": "string",
  "description": "The city and state, e.g. San Francisco, CA",
}
unit_param = {
  "type": "string",
  "enum": ["celsius", "fahrenheit"],
}

# Function specification handed to the model; only "location" is required.
weather_function_spec = {
  "name": "get_current_weather",
  "description": "Get the current weather in a given location",
  "parameters": {
    "type": "object",
    "properties": {"location": location_param, "unit": unit_param},
    "required": ["location"],
  },
}
functions = [weather_function_spec]

# Single-turn conversation that should lead the model to request the weather function.
messages = [dict(role="user", content="What is the weather like in Boston?")]

# Ask the model, passing the function specifications along with the messages.
response = completion(model="gpt-3.5-turbo-1106", messages=messages, functions=functions)

print(response)
print()

# The message may lack a `function_call` attribute, or carry one whose value is
# None. Test the value rather than attribute existence so that a None value is
# reported as "No function found" instead of failing on `.name`.
function_call = getattr(response.choices[0]['message'], 'function_call', None)
function_found = function_call is not None
if function_found:
  function_call_name = function_call.name
  # Arguments as returned by the model, e.g. '{"location":"Boston, MA"}'.
  function_call_arguments = function_call.arguments
  print(function_call_name)
else:
  print('No function found')

ModelResponse(id='chatcmpl-8boD672S6ONQrFt6DEiCw6gCAv3Hs', choices=[Choices(finish_reason='function_call', index=0, message=Message(content=None, role='assistant', function_call=FunctionCall(arguments='{"location":"Boston, MA"}', name='get_current_weather')))], created=1704021836, model='gpt-3.5-turbo-1106', object='chat.completion', system_fingerprint='fp_772e8125bb', usage=Usage(completion_tokens=17, prompt_tokens=82, total_tokens=99), _response_ms=820.1850000000001)

get_current_weather
