In [4]:
import pandas as pd
import pyreadstat
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import utils

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Read the SPSS .sav file; read_sav hands back the data frame together with a
# metadata object, which later cells pass to the utils helpers.
# (File name suggests the PISA 2022 school questionnaire — confirm.)
sch_sav_path = 'data/PISA2022_SCH_QQQ.SAV'
df_sch, meta_sch = pyreadstat.read_sav(sch_sav_path)

In [6]:
import os

# Read the OpenAI API key from a local file and expose it through the
# environment variable that the langchain_openai clients read.
with open('data/openai.api.key', 'r') as filek:
    # Strip surrounding whitespace: key files usually end with a newline, and
    # a key containing "\n" is not a valid credential / header value.
    openai_key = filek.read().strip()
os.environ["OPENAI_API_KEY"] = openai_key

In [7]:
# Build the documents to index from the file metadata via utils.meta2docs,
# passing the same exclusion list as before.
cols = utils.meta2docs(meta_sch, excluded=['test', 'CNTRYID'])

cols_db_dir = "./chroma_db_cols_vectorstore_sch1"
cols_embeddings = OpenAIEmbeddings()

# Chroma.from_documents adds the documents to the persisted collection each
# time it runs, so re-executing this cell would store duplicates and pay for
# the embeddings again. Reuse the on-disk store when it already exists.
# To rebuild the index (e.g. after changing `excluded`), delete the directory.
if os.path.isdir(cols_db_dir) and os.listdir(cols_db_dir):
    cols_vectorstore = Chroma(
        persist_directory=cols_db_dir,
        embedding_function=cols_embeddings,
    )
else:
    cols_vectorstore = Chroma.from_documents(
        cols,
        cols_embeddings,
        persist_directory=cols_db_dir,
    )

# Retriever handed to utils.pipeline in the cells below.
cols_retriever = cols_vectorstore.as_retriever()

In [8]:
from langchain_openai import ChatOpenAI

# Name the chat model explicitly instead of relying on the library default:
# the recorded outputs in this notebook report model_name 'gpt-3.5-turbo', and
# pinning it keeps re-runs on the same model if that default ever changes.
# temperature=0 asks for the least random completions.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

In [9]:
# Baseline phrasing of the country-count question. In the recorded run the
# pipeline returned a dict with 'question', 'result' and 'hint_cols'.
utils.pipeline(
    llm,
    'how many different countries were involved in the survey?',
    df_sch,
    meta_sch,
    cols_retriever,
)

- Country
hint - Country
['REGION', 'OECD', 'SCHLTYPE', 'CNT']
content="def run(df):\n    return float(df['CNT'].dropna().nunique())" response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 168, 'total_tokens': 185}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-57fb3c9d-7227-4ef8-912c-e452262af065-0' usage_metadata={'input_tokens': 168, 'output_tokens': 17, 'total_tokens': 185}
def run(df):
    return float(df['CNT'].dropna().nunique())


{'question': 'how many different countries were involved in the survey?',
 'result': 80.0,
 'hint_cols': ['REGION', 'OECD', 'SCHLTYPE', 'CNT']}

In [10]:
# Same question without the word "different", to see whether the phrasing
# changes the hinted columns or the generated code.
utils.pipeline(
    llm,
    'how many countries were involved in the survey?',
    df_sch,
    meta_sch,
    cols_retriever,
)

- Country Name
hint - Country Name
['CNT', 'REGION', 'OECD', 'NatCen']
content="def run(df):\n    return float(df['CNT'].dropna().nunique())" response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 170, 'total_tokens': 187}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-96b274e3-875b-491e-9afc-9fc76875424f-0' usage_metadata={'input_tokens': 170, 'output_tokens': 17, 'total_tokens': 187}
def run(df):
    return float(df['CNT'].dropna().nunique())


{'question': 'how many countries were involved in the survey?',
 'result': 80.0,
 'hint_cols': ['CNT', 'REGION', 'OECD', 'NatCen']}

In [11]:
# Shorter phrasing of the country-count question. In the recorded run this
# produced two hints, and 'hint_cols' contained repeated column names.
utils.pipeline(
    llm,
    'how many countries are in the survey?',
    df_sch,
    meta_sch,
    cols_retriever,
)

- Country Name
- Country Code
hint - Country Name
['CNT', 'REGION', 'OECD', 'NatCen']
hint - Country Code
['CNT', 'NatCen', 'SUBNATIO', 'OECD']
content="def run(df):\n    return float(df['CNT'].nunique())" response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 278, 'total_tokens': 292}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-0bf73dab-b228-417e-a2bb-06053aa4ff38-0' usage_metadata={'input_tokens': 278, 'output_tokens': 14, 'total_tokens': 292}
def run(df):
    return float(df['CNT'].nunique())


{'question': 'how many countries are in the survey?',
 'result': 80.0,
 'hint_cols': ['CNT',
  'REGION',
  'OECD',
  'NatCen',
  'CNT',
  'NatCen',
  'SUBNATIO',
  'OECD']}

In [12]:
# Variant using "distinct" instead of "different"; the recorded run gave the
# same result (80.0) as the baseline phrasing.
utils.pipeline(
    llm,
    'how many distinct countries were involved in the survey?',
    df_sch,
    meta_sch,
    cols_retriever,
)

- Country
hint - Country
['REGION', 'OECD', 'SCHLTYPE', 'CNT']
content="def run(df):\n    return float(df['CNT'].dropna().nunique())" response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 168, 'total_tokens': 185}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-8fb601f9-6281-4b89-bcff-f9615c6ab6d9-0' usage_metadata={'input_tokens': 168, 'output_tokens': 17, 'total_tokens': 185}
def run(df):
    return float(df['CNT'].dropna().nunique())


{'question': 'how many distinct countries were involved in the survey?',
 'result': 80.0,
 'hint_cols': ['REGION', 'OECD', 'SCHLTYPE', 'CNT']}

In [13]:
# Question that needs a filter (OECD members) before counting countries.
# NOTE(review): in the recorded run the generated code was
# df['OECD'].dropna().nunique(), i.e. it counted the distinct values of the
# OECD column (result 2.0) rather than the member countries.
utils.pipeline(
    llm,
    'how many OECD member countries are in the survey?',
    df_sch,
    meta_sch,
    cols_retriever,
)

- Country Name
- OECD Member Status
hint - Country Name
['CNT', 'REGION', 'OECD', 'NatCen']
hint - OECD Member Status
['OECD', 'CNTSCHID', 'SC053D11TA', 'PROPAT7']
content="def run(df):\n    return float(df['OECD'].dropna().nunique())" response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 301, 'total_tokens': 319}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-2e81ab56-3a04-4313-be76-8043374e5612-0' usage_metadata={'input_tokens': 301, 'output_tokens': 18, 'total_tokens': 319}
def run(df):
    return float(df['OECD'].dropna().nunique())


{'question': 'how many OECD member countries are in the survey?',
 'result': 2.0,
 'hint_cols': ['CNT',
  'REGION',
  'OECD',
  'NatCen',
  'OECD',
  'CNTSCHID',
  'SC053D11TA',
  'PROPAT7']}

In [14]:
# Rephrased OECD-member question (question text kept exactly as recorded).
# NOTE(review): the recorded run again generated
# df['OECD'].dropna().nunique() and returned 2.0, the number of distinct
# OECD values rather than a count of member countries.
utils.pipeline(
    llm,
    'how many countries that are OECD memebers are in the survey?',
    df_sch,
    meta_sch,
    cols_retriever,
)

- Country Name
- OECD Member (Yes/No)
hint - Country Name
['CNT', 'REGION', 'OECD', 'NatCen']
hint - OECD Member (Yes/No)
['OECD', 'SC053D11TA', 'SC195Q01JA', 'SC034Q01NA']
content="def run(df):\n    return float(df['OECD'].dropna().nunique())" response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 328, 'total_tokens': 346}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-48b837ec-0dd9-4ea2-b8c6-ccc0e1840277-0' usage_metadata={'input_tokens': 328, 'output_tokens': 18, 'total_tokens': 346}
def run(df):
    return float(df['OECD'].dropna().nunique())


{'question': 'how many countries that are OECD memebers are in the survey?',
 'result': 2.0,
 'hint_cols': ['CNT',
  'REGION',
  'OECD',
  'NatCen',
  'OECD',
  'SC053D11TA',
  'SC195Q01JA',
  'SC034Q01NA']}

In [15]:
 # Third phrasing of the OECD-member question (text kept exactly as recorded);
 # the recorded run returned 2.0 from df['OECD'].dropna().nunique().
 # NOTE(review): this cell's source is indented by one space. The recorded
 # output shows IPython accepted it, but consider removing the indent so the
 # code is also valid when exported as a plain Python script.
 utils.pipeline(
     llm,
     'how many countries that are OECD memeber were involved in the survey?',
     df_sch,
     meta_sch,
     cols_retriever,
 )

- Country
- OECD Member (Yes/No)
hint - Country
['REGION', 'OECD', 'SCHLTYPE', 'CNT']
hint - OECD Member (Yes/No)
['OECD', 'SC053D11TA', 'SC195Q01JA', 'SC169Q01JA']
content="def run(df):\n    return float(df['OECD'].dropna().nunique())" response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 326, 'total_tokens': 344}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-bf4f8cea-c2a9-4224-8afc-7b1569cdf26b-0' usage_metadata={'input_tokens': 326, 'output_tokens': 18, 'total_tokens': 344}
def run(df):
    return float(df['OECD'].dropna().nunique())


{'question': 'how many countries that are OECD memeber were involved in the survey?',
 'result': 2.0,
 'hint_cols': ['REGION',
  'OECD',
  'SCHLTYPE',
  'CNT',
  'OECD',
  'SC053D11TA',
  'SC195Q01JA',
  'SC169Q01JA']}

In [None]:
# Manual walk-through of the code-generation step for a single question:
# build the prompt by hand, send it to the LLM and show the returned code.
question = 'how many countries that are OECD memeber were involved in the survey?'

# `rel_col_docs` is not assigned by any visible cell of this notebook, so on a
# fresh kernel this cell failed with NameError. Fetch the candidate column
# documents here so the cell is self-contained.
# NOTE(review): assumes the retrieved documents carry 'original_col_name'
# metadata, and that querying the retriever with the full question is an
# acceptable stand-in for how utils.pipeline selects columns — confirm.
rel_col_docs = cols_retriever.invoke(question)

data_explanation = utils.docs2explanation(rel_col_docs, meta_sch)
columns = [doc.metadata['original_col_name'] for doc in rel_col_docs]
prompt2 = (
    f"Given a dataframe with the following columns {columns}, "
    f"column meaning: {data_explanation}, "
    "can you generate a python code, without sample data, which can answer the following question? "
    "the code must contain only one function called 'run', that returns an exact number. "
    f"\nQuestion: {question}"
)
res = llm.invoke(prompt2)
# Remove Markdown code fences from the reply so only the Python source remains.
code = res.content.replace('```python','').replace('```','')
code