In [1]:
from llama_index.query_pipeline import (QueryPipeline as QP, Link, InputComponent)
from llama_index.query_engine.pandas import PandasInstructionParser
from llama_index.llms import OpenAI
from llama_index.prompts import PromptTemplate

In [2]:
!wget 'https://raw.githubusercontent.com/jerryjliu/llama_index/main/docs/examples/data/csv/titanic_train.csv' -O 'titanic_train.csv'


/bin/bash: wget: command not found


In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('./titanic_train.csv')

### Define Modules

In [7]:
instruction_str = (
    "1. Convert the query to executable Python code using Pandas.\n"
    "2. The final line of code should be a Python expression that can be called with the `eval()` function.\n"
    "3. The code should represent a solution to the query.\n"
    "4. PRINT ONLY THE EXPRESSION.\n"
    "5. Do not quote the expression.\n")

pandas_prompt_str = (
    "You are working with a pandas dataframe in Python.\n"
    "The name of the dataframe is `df`.\n"
    "This is the result of `print(df.head())`:\n"
    "{df_str}\n\n"
    "Follow these instructions:\n"
    "{instruction_str}\n"
    "Query: {query_str}\n\n"
    "Expression:")

response_synthesis_prompt_str = (
    "Given an input question, synthesize a response from the query results.\n"
    "Query: {query_str}\n\n"
    "Pandas Instructions (optional):\n{pandas_instructions}\n\n"
    "Pandas Output: {pandas_output}\n\n"
    "Response: ")

pandas_prompt = PromptTemplate(pandas_prompt_str).partial_format(instruction_str=instruction_str,
                                                                df_str = df.head())

pandas_output_parser = PandasInstructionParser(df)
response_synthesis_prompt = PromptTemplate(response_synthesis_prompt_str)
llm = OpenAI(model='gpt-3.5-turbo')

### Build Query Pipeline

In [8]:
qp = QP(
    modules={
        "input": InputComponent(),
        "pandas_prompt": pandas_prompt,
        "llm1": llm,
        "pandas_output_parser": pandas_output_parser,
        "response_synthesis_prompt": response_synthesis_prompt,
        "llm2": llm,
    }, verbose=True)

qp.add_chain(["input", "pandas_prompt", "llm1", "pandas_output_parser"])
qp.add_links(
    [
         Link("input", "response_synthesis_prompt", dest_key="query_str"),
        Link(
            "llm1", "response_synthesis_prompt", dest_key="pandas_instructions"
        ),
        Link(
            "pandas_output_parser",
            "response_synthesis_prompt",
            dest_key="pandas_output",
        ),
    ])
qp.add_link('response_synthesis_prompt','llm2')

## Run query

In [11]:
response = qp.run(
    query_str ="What is the oldest surviver?")

[1;3;38;2;155;135;227m> Running module input with input: 
query_str: What is the oldest surviver?

[0m[1;3;38;2;155;135;227m> Running module pandas_prompt with input: 
query_str: What is the oldest surviver?

[0m[1;3;38;2;155;135;227m> Running module llm1 with input: 
messages: You are working with a pandas dataframe in Python.
The name of the dataframe is `df`.
This is the result of `print(df.head())`:
   survived  pclass  ... cabin embarked
0         0       3  ...   NaN  ...

[0m[1;3;38;2;155;135;227m> Running module pandas_output_parser with input: 
input: assistant: df[df['survived'] == 1]['age'].max()

[0m[1;3;38;2;155;135;227m> Running module response_synthesis_prompt with input: 
query_str: What is the oldest surviver?
pandas_instructions: assistant: df[df['survived'] == 1]['age'].max()
pandas_output: 80.0

[0m[1;3;38;2;155;135;227m> Running module llm2 with input: 
messages: Given an input question, synthesize a response from the query results.
Query: What is the ol

In [12]:
print(response.message.content)

The oldest survivor is 80 years old.
