In [2]:
# install llama-index to the local environment
! pip3 install llama-index

Collecting llama-index
  Obtaining dependency information for llama-index from https://files.pythonhosted.org/packages/0d/40/f1c7ac9d989c7d798ed45b54960df277091fb0255f832c4e563da4eaf381/llama_index-0.9.40-py3-none-any.whl.metadata
  Downloading llama_index-0.9.40-py3-none-any.whl.metadata (8.4 kB)
Collecting SQLAlchemy[asyncio]>=1.4.49 (from llama-index)
  Obtaining dependency information for SQLAlchemy[asyncio]>=1.4.49 from https://files.pythonhosted.org/packages/3f/ec/d285f944d47d885a84fdd2c2c7b1de10a106a7206a7d56325b772d09ffeb/SQLAlchemy-2.0.25-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading SQLAlchemy-2.0.25-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.6 (from llama-index)
  Obtaining dependency information for aiohttp<4.0.0,>=3.8.6 from https://files.pythonhosted.org/packages/d3/b0/efb74d5f92a460c774e0254b3109c2d00fd3a1553f98363abb2b25cac9a3/aiohttp-3.9.3-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading aiohttp-3.9.3-cp311-cp3

Collecting greenlet!=0.4.17 (from SQLAlchemy[asyncio]>=1.4.49->llama-index)
  Obtaining dependency information for greenlet!=0.4.17 from https://files.pythonhosted.org/packages/6e/20/68a278a6f93fa36e21cfc3d7599399a8a831225644eb3b6b18755cd3d6fc/greenlet-3.0.3-cp311-cp311-macosx_11_0_universal2.whl.metadata
  Downloading greenlet-3.0.3-cp311-cp311-macosx_11_0_universal2.whl.metadata (3.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect>=0.8.0->llama-index)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json->llama-index)
  Obtaining dependency information for marshmallow<4.0.0,>=3.18.0 from https://files.pythonhosted.org/packages/57/e9/4368d49d3b462da16a3bac976487764a84dd85cef97232c7bd61f5bdedf3/marshmallow-3.20.2-py3-none-any.whl.metadata
  Downloading marshmallow-3.20.2-py3-none-any.whl.metadata (7.5 kB)
Collecting pytz>=2020.1 (from pandas->llama-index)
  Obtaining dependency information for pytz>=2020.1 

In [31]:
# get api_key from .env
import os
import openai
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file into the process environment.
# The return value is deliberately not bound to `_`: assigning to `_` in a
# notebook shadows IPython's "last output" variable.
load_dotenv(find_dotenv())

# Fail early with an actionable message when the key is missing or empty,
# instead of a bare KeyError that only names the variable.
if not os.environ.get("OPENAI_API_KEY"):
    raise KeyError(
        "OPENAI_API_KEY is not set; add it to a .env file or export it in the "
        "environment before running this notebook."
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

In [32]:
from llama_index.program import (
    OpenAIPydanticProgram,
    DataFrame,
    DataFrameRowsOnly,
)
from llama_index.llms import OpenAI

In [33]:
# Program that asks the model to answer with a `DataFrame` function call, i.e.
# both the column definitions and the row values, for the text passed in as
# `input_str`.
to_dataframe = OpenAIPydanticProgram.from_defaults(
    output_cls=DataFrame,
    llm=OpenAI(temperature=0, model="gpt-3.5-turbo"),
    prompt_template_str=(
        "Please extract the following query into a structured data according"
        # A space follows the period so the instruction does not run straight
        # into the end of the substituted input text.
        " to: {input_str}. Please extract both the set of column names and a"
        " set of rows."
    ),
    # Echo the function call chosen by the model when the program is run.
    verbose=True,
)

In [34]:
# Free-text description of four people. The line breaks, indentation and
# trailing spaces inside the text are spelled out explicitly so the exact
# string sent to the model is visible.
people_text = (
    "My name is John and I am 25 years old. I live in \n"
    "        New York and I like to play basketball. His name is \n"
    "        Mike and he is 30 years old. He lives in San Francisco \n"
    "        and he likes to play baseball. Sarah is 20 years old \n"
    "        and she lives in Los Angeles. She likes to play tennis.\n"
    "        Her name is Mary and she is 35 years old. \n"
    "        She lives in Chicago."
)

# Run the columns-and-rows extraction program and show the structured result.
response = to_dataframe(input_str=people_text)
response

Function call: DataFrame with args: {
  "columns": [
    {
      "column_name": "Name",
      "column_desc": "Name of the person"
    },
    {
      "column_name": "Age",
      "column_desc": "Age of the person"
    },
    {
      "column_name": "Location",
      "column_desc": "Location of the person"
    },
    {
      "column_name": "Hobby",
      "column_desc": "Hobby of the person"
    }
  ],
  "rows": [
    {
      "row_values": ["John", 25, "New York", "Basketball"]
    },
    {
      "row_values": ["Mike", 30, "San Francisco", "Baseball"]
    },
    {
      "row_values": ["Sarah", 20, "Los Angeles", "Tennis"]
    },
    {
      "row_values": ["Mary", 35, "Chicago", ""]
    }
  ]
}


DataFrame(description=None, columns=[DataFrameColumn(column_name='Name', column_desc='Name of the person'), DataFrameColumn(column_name='Age', column_desc='Age of the person'), DataFrameColumn(column_name='Location', column_desc='Location of the person'), DataFrameColumn(column_name='Hobby', column_desc='Hobby of the person')], rows=[DataFrameRow(row_values=['John', 25, 'New York', 'Basketball']), DataFrameRow(row_values=['Mike', 30, 'San Francisco', 'Baseball']), DataFrameRow(row_values=['Sarah', 20, 'Los Angeles', 'Tennis']), DataFrameRow(row_values=['Mary', 35, 'Chicago', ''])])

In [35]:
# Turn the extracted model (column definitions plus rows) into a table via
# `to_df()`; as the last expression of the cell it is rendered as the output.
# NOTE(review): presumably a pandas DataFrame — confirm against the library.
response.to_df()


Unnamed: 0,Name,Age,Location,Hobby
0,John,25,New York,Basketball
1,Mike,30,San Francisco,Baseball
2,Sarah,20,Los Angeles,Tennis
3,Mary,35,Chicago,


In [36]:
# Rows-only variant: the column names are stated in the prompt, and the model
# is asked to answer with a `DataFrameRowsOnly` function call carrying just
# the row values.
rows_only_prompt = (
    "Please extract the following text into a structured data: {input_str}."
    " The column names are the following:"
    " ['Name', 'Age', 'City', 'Favorite Sport']."
    " Do not specify additional parameters that are not in the function"
    " schema. "
)
rows_only_llm = OpenAI(model="gpt-3.5-turbo", temperature=0)

to_dataframe_rowsonly = OpenAIPydanticProgram.from_defaults(
    output_cls=DataFrameRowsOnly,
    llm=rows_only_llm,
    prompt_template_str=rows_only_prompt,
    verbose=True,  # echo the function call chosen by the model
)

In [37]:
# Free-text description of four people. The line breaks, indentation and
# trailing spaces inside the text are spelled out explicitly so the exact
# string sent to the model is visible.
people_text = (
    "My name is John and I am 25 years old. I live in \n"
    "        New York and I like to play basketball. His name is \n"
    "        Mike and he is 30 years old. He lives in San Francisco \n"
    "        and he likes to play baseball. Sarah is 20 years old \n"
    "        and she lives in Los Angeles. She likes to play tennis.\n"
    "        Her name is Mary and she is 35 years old. \n"
    "        She lives in Chicago."
)

# Run the rows-only extraction program and show the structured result.
response_2 = to_dataframe_rowsonly(input_str=people_text)
response_2

Function call: DataFrameRowsOnly with args: {
  "rows": [
    {
      "row_values": ["John", 25, "New York", "Basketball"]
    },
    {
      "row_values": ["Mike", 30, "San Francisco", "Baseball"]
    },
    {
      "row_values": ["Sarah", 20, "Los Angeles", "Tennis"]
    },
    {
      "row_values": ["Mary", 35, "Chicago", ""]
    }
  ]
}


DataFrameRowsOnly(rows=[DataFrameRow(row_values=['John', 25, 'New York', 'Basketball']), DataFrameRow(row_values=['Mike', 30, 'San Francisco', 'Baseball']), DataFrameRow(row_values=['Sarah', 20, 'Los Angeles', 'Tennis']), DataFrameRow(row_values=['Mary', 35, 'Chicago', ''])])

In [38]:
# Turn the rows-only result into a table via `to_df()`. The result object holds
# only `rows`, so the rendered table has positional column labels (0, 1, 2, 3)
# rather than the names listed in the prompt.
response_2.to_df()


Unnamed: 0,0,1,2,3
0,John,25,New York,Basketball
1,Mike,30,San Francisco,Baseball
2,Sarah,20,Los Angeles,Tennis
3,Mary,35,Chicago,
