In [1]:
import pandas as pd
import pyreadstat


In [2]:
df_sch, meta_sch = pyreadstat.read_sav('data/PISA2022_SCH_QQQ.SAV')

In [3]:
from langchain.chains.query_constructor.base import AttributeInfo

def meta2attrinfo(spss_df, spss_meta):
    metadata = []
    for col in spss_df.columns:        
        if col in spss_meta.variable_value_labels:                
            values = spss_meta.variable_value_labels[col].values()
            values_text = ". One of ['" + ','.join(values) + "']"
        else:
            values_text = ''
        metadata.append(
            AttributeInfo(
                name=col,  
                description=spss_meta.column_names_to_labels[col] + values_text,
                type=spss_meta.readstat_variable_types[col]))
    return metadata

attrinfo_sch=meta2attrinfo(df_sch, meta_sch)
#attrinfo_stu=meta2attrinfo(df_stu, meta_stu)


In [4]:
import os

with open('data/openai.api.key', 'r') as filek: 
    openai_key = filek.read()
os.environ["OPENAI_API_KEY"] =  openai_key 

In [13]:
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

def meta2docs(spss_df, spss_meta):
    docs = []
    for col in  spss_df.columns:
        if col in spss_meta.variable_value_labels:
            docs.append(
                Document(
                    page_content=spss_meta.column_names_to_labels[col],
                    metadata={"year": 2022, "oroginal_col_name": col},
                ),
            )
    return docs
    
cols = meta2docs(df_sch, meta_sch)

cols_vectorstore = Chroma.from_documents(cols, OpenAIEmbeddings())

In [14]:
cols_retriever = cols_vectorstore.as_retriever()

In [15]:
#BAD, school siye is not included
cols_retriever.invoke('What is average teacher-student ration in mid size schools?')

[Document(page_content='Student-teacher ratio', metadata={'year': 2022}),
 Document(page_content='Student-teacher ratio', metadata={'oroginal_col_name': 'STRATIO', 'year': 2022}),
 Document(page_content='Student-mathematics teacher ratio', metadata={'year': 2022}),
 Document(page_content='Student-mathematics teacher ratio', metadata={'oroginal_col_name': 'SMRATIO', 'year': 2022})]

In [8]:
from langchain_openai import ChatOpenAI
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

document_content_description = "Description of a data table column"
llm = ChatOpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    cols_vectorstore,
    document_content_description,
    attrinfo_sch,
)

In [9]:
#pip install lark

In [10]:
retriever.invoke('What columns needed to answer this question: What is average teacher-student ration in mid size schools?')

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 53276 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [11]:
llm.invoke('Please list the tipical database column fields, that needed to answer the following wuestion: What is average teacher-student ration in mid size schools?')

AIMessage(content='1. School ID\n2. School Name\n3. Number of Teachers\n4. Number of Students\n5. School Size\n6. Teacher-Student Ratio', response_metadata={'token_usage': {'completion_tokens': 33, 'prompt_tokens': 37, 'total_tokens': 70}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-c96463b4-3fd5-438f-ac25-931842763a53-0', usage_metadata={'input_tokens': 37, 'output_tokens': 33, 'total_tokens': 70})

In [12]:
cols_retriever.invoke('1. School ID\n2. School Name\n3. Number of Teachers\n4. Number of Students\n5. School Size\n6. Teacher-Student Ratio')

[Document(page_content='Student-teacher ratio', metadata={'year': 2022}),
 Document(page_content='School size (Sum)', metadata={'year': 2022}),
 Document(page_content='Student-mathematics teacher ratio', metadata={'year': 2022}),
 Document(page_content='Total number of all teachers at school (Sum)', metadata={'year': 2022})]