In [None]:
!pip install llama-index-llms-openai llama-index-experimental

Collecting llama-index-llms-openai
  Downloading llama_index_llms_openai-0.2.16-py3-none-any.whl.metadata (3.3 kB)
Collecting llama-index-experimental
  Downloading llama_index_experimental-0.4.0-py3-none-any.whl.metadata (884 bytes)
Collecting llama-index-core<0.12.0,>=0.11.7 (from llama-index-llms-openai)
  Downloading llama_index_core-0.11.22-py3-none-any.whl.metadata (2.4 kB)
Collecting llama-index-finetuning<0.3.0,>=0.2.0 (from llama-index-experimental)
  Downloading llama_index_finetuning-0.2.1-py3-none-any.whl.metadata (992 bytes)
Collecting dataclasses-json (from llama-index-core<0.12.0,>=0.11.7->llama-index-llms-openai)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting dirtyjson<2.0.0,>=1.0.8 (from llama-index-core<0.12.0,>=0.11.7->llama-index-llms-openai)
  Downloading dirtyjson-1.0.8-py3-none-any.whl.metadata (11 kB)
Collecting nltk>3.8.1 (from llama-index-core<0.12.0,>=0.11.7->llama-index-llms-openai)
  Downloading nltk-3.9.1-py3-none-any.whl

In [None]:
import os
from getpass import getpass

import openai
import pandas as pd
from llama_index.core import PromptTemplate
from llama_index.core.query_pipeline import (
    QueryPipeline as QP,
    Link,
    InputComponent,
)
from llama_index.experimental.query_engine.pandas import (
    PandasInstructionParser,
)
from llama_index.llms.openai import OpenAI

In [None]:
!wget 'https://raw.githubusercontent.com/jerryjliu/llama_index/main/docs/docs/examples/data/csv/titanic_train.csv' -O 'titanic_train.csv'

--2024-11-07 20:15:36--  https://raw.githubusercontent.com/jerryjliu/llama_index/main/docs/docs/examples/data/csv/titanic_train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57726 (56K) [text/plain]
Saving to: ‘titanic_train.csv’


2024-11-07 20:15:36 (5.54 MB/s) - ‘titanic_train.csv’ saved [57726/57726]



In [None]:
# Load the downloaded Titanic CSV; `df` is the frame the prompt preview and the
# pandas output parser are built from further down.
df = pd.read_csv(filepath_or_buffer="./titanic_train.csv")

In [10]:
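# Optional: analyse your own dataset instead of the Titanic sample by
# uncommenting the line below. The prompt and the output parser capture `df`
# when they are created, so re-run the cells after this one if you change it.
# df = pd.read_csv("/content/sample_data/OTL_edited.csv")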

In [11]:
# Mounting Google Drive only makes sense inside Colab. Guard the import so the
# notebook still runs top-to-bottom in a plain Jupyter kernel, where the
# `google.colab` package does not exist.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running in Colab; nothing to mount

if drive is not None:
    # Exposes the user's Drive under /content/drive (prompts for authorisation).
    drive.mount('/content/drive')

Mounted at /content/drive


In [12]:

# Never paste the key into the notebook: take it from the OPENAI_API_KEY
# environment variable, or prompt for it without echoing it to the output.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
if not openai.api_key:
    # Fail here with a clear message instead of on the first LLM call.
    raise ValueError("An OpenAI API key is required to run the query pipeline.")

In [13]:
# Numbered rules that fill the `{instruction_str}` slot of the pandas prompt:
# the model must answer with a single, unquoted, eval()-able expression.
instruction_str = "".join(
    [
        "1. Convert the query to executable Python code using Pandas.\n",
        "2. The final line of code should be a Python expression that can be called with the `eval()` function.\n",
        "3. The code should represent a solution to the query.\n",
        "4. PRINT ONLY THE EXPRESSION.\n",
        "5. Do not quote the expression.\n",
    ]
)

# Template for the first LLM call. Placeholders: `{df_str}` (preview of the
# dataframe), `{instruction_str}` (the rules above) and `{query_str}` (the
# user's question).
pandas_prompt_str = "".join(
    [
        "You are working with a pandas dataframe in Python.\n",
        "The name of the dataframe is `df`.\n",
        "This is the result of `print(df.head())`:\n",
        "{df_str}\n\n",
        "Follow these instructions:\n",
        "{instruction_str}\n",
        "Query: {query_str}\n\n",
        "Expression:",
    ]
)

# Template for the second LLM call. Placeholders: `{query_str}`,
# `{pandas_instructions}` and `{pandas_output}`.
response_synthesis_prompt_str = "".join(
    [
        "Given an input question, synthesize a response from the query results.\n",
        "Query: {query_str}\n\n",
        "Pandas Instructions (optional):\n{pandas_instructions}\n\n",
        "Pandas Output: {pandas_output}\n\n",
        "Response: ",
    ]
)

# One LLM handle; the pipeline registers it under two module names.
llm = OpenAI(model="gpt-3.5-turbo")

# First-stage prompt: the rules and a five-row preview of `df` are bound now,
# leaving only `{query_str}` to be supplied when the pipeline runs.
pandas_prompt = PromptTemplate(template=pandas_prompt_str).partial_format(
    df_str=df.head(5),
    instruction_str=instruction_str,
)

# Parser constructed around this same `df` instance; it receives the first
# LLM's reply in the pipeline.
pandas_output_parser = PandasInstructionParser(df=df)

# Second-stage prompt: all of its variables are filled in by pipeline links.
response_synthesis_prompt = PromptTemplate(template=response_synthesis_prompt_str)

In [14]:
# Two-stage pipeline:
#   input -> pandas_prompt -> llm1 -> pandas_output_parser
# followed by a synthesis prompt that gathers the question, llm1's reply and
# the parser's output, and hands the filled prompt to llm2.
qp = QP(
    verbose=True,
    modules=dict(
        input=InputComponent(),
        pandas_prompt=pandas_prompt,
        llm1=llm,
        pandas_output_parser=pandas_output_parser,
        response_synthesis_prompt=response_synthesis_prompt,
        llm2=llm,
    ),
)

# Linear first stage.
qp.add_chain(["input", "pandas_prompt", "llm1", "pandas_output_parser"])

# Fan-in: each link feeds one named variable of the synthesis prompt.
qp.add_links(
    [
        Link(
            src="input",
            dest="response_synthesis_prompt",
            dest_key="query_str",
        ),
        Link(
            src="llm1",
            dest="response_synthesis_prompt",
            dest_key="pandas_instructions",
        ),
        Link(
            src="pandas_output_parser",
            dest="response_synthesis_prompt",
            dest_key="pandas_output",
        ),
    ]
)

# The completed synthesis prompt goes to the second LLM module.
qp.add_link(src="response_synthesis_prompt", dest="llm2")

In [None]:
# Run the whole pipeline on one question; `query_str` is the variable both
# prompt templates consume.
response = qp.run(query_str="What is the correlation between survival and gender")

[1;3;38;2;155;135;227m> Running module input with input: 
query_str: What is the correlation between survival and gender

[0m[1;3;38;2;155;135;227m> Running module pandas_prompt with input: 
query_str: What is the correlation between survival and gender

[0m[1;3;38;2;155;135;227m> Running module llm1 with input: 
messages: You are working with a pandas dataframe in Python.
The name of the dataframe is `df`.
This is the result of `print(df.head())`:
   survived  pclass                                               name  ...

[0m[1;3;38;2;155;135;227m> Running module pandas_output_parser with input: 
input: assistant: df['survived'].corr(df['sex'].astype('category').cat.codes)

[0m[1;3;38;2;155;135;227m> Running module response_synthesis_prompt with input: 
query_str: What is the correlation between survival and gender
pandas_instructions: assistant: df['survived'].corr(df['sex'].astype('category').cat.codes)
pandas_output: -0.5433513806577553

[0m[1;3;38;2;155;135;227m> Runni

In [None]:
# Show only the text of the final answer, i.e. the message content of what the
# pipeline returned (its last module is `llm2`).
print(response.message.content)

The correlation between survival and gender is -0.54, indicating a moderate negative correlation. This suggests that gender does have an impact on survival rates, with females more likely to survive compared to males.


In [15]:
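# Example query for the alternative OTL dataset (the output kept below appears
# to come from a run against it). To use it, load that dataset into `df`,
# rebuild the prompt, parser and pipeline, then uncomment:
# response = qp.run(
#     query_str="What is the mean of copyright of books",
# )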

[1;3;38;2;155;135;227m> Running module input with input: 
query_str: What is the mean of copyright of books

[0m[1;3;38;2;155;135;227m> Running module pandas_prompt with input: 
query_str: What is the mean of copyright of books

[0m[1;3;38;2;155;135;227m> Running module llm1 with input: 
messages: You are working with a pandas dataframe in Python.
The name of the dataframe is `df`.
This is the result of `print(df.head())`:
   OTL ID                               Title  Copyright Year  \
0     4...

[0m[1;3;38;2;155;135;227m> Running module pandas_output_parser with input: 
input: assistant: df['Copyright Year'].mean()

[0m[1;3;38;2;155;135;227m> Running module response_synthesis_prompt with input: 
query_str: What is the mean of copyright of books
pandas_instructions: assistant: df['Copyright Year'].mean()
pandas_output: 2013.3578595317726

[0m[1;3;38;2;155;135;227m> Running module llm2 with input: 
messages: Given an input question, synthesize a response from the query resu

In [16]:
# Uncomment together with the commented-out OTL query above to print its answer:
# print(response.message.content)

The mean copyright year of books is approximately 2013.36.
