In [1]:
import pandas as pd
import pyreadstat


In [2]:
df_sch, meta_sch = pyreadstat.read_sav('data/PISA2022_SCH_QQQ.SAV')

In [3]:
from langchain.chains.query_constructor.base import AttributeInfo

def meta2attrinfo(spss_df, spss_meta):
    metadata = []
    for col in spss_df.columns:        
        if col in spss_meta.variable_value_labels:                
            values = spss_meta.variable_value_labels[col].values()
            values_text = ". One of ['" + ','.join(values) + "']"
        else:
            values_text = ''
        metadata.append(
            AttributeInfo(
                name=col,  
                description=spss_meta.column_names_to_labels[col] + values_text,
                type=spss_meta.readstat_variable_types[col]))
    return metadata

attrinfo_sch=meta2attrinfo(df_sch, meta_sch)
#attrinfo_stu=meta2attrinfo(df_stu, meta_stu)


In [4]:
import os

os.environ["OPENAI_API_KEY"] = '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'

In [5]:
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

def meta2docs(spss_df, spss_meta):
    docs = []
    for col in  spss_df.columns:
        if col in spss_meta.variable_value_labels:
            docs.append(
                Document(
                    page_content=spss_meta.column_names_to_labels[col],
                    metadata={"year": 2022, "original_col_name": col},
                ),
            )
    return docs
    
cols = meta2docs(df_sch, meta_sch)

cols_vectorstore = Chroma.from_documents(cols, OpenAIEmbeddings())

In [6]:
cols_retriever = cols_vectorstore.as_retriever()

In [None]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0)

In [25]:
question = 'What is average teacher-student ration in mid size schools?'
prompt1 = f"Please list the typical database column fields, that needed to answer the following question: {question}"
relevant_col_list_msg = llm.invoke(prompt1)
relevant_col_list = relevant_col_list_msg.content
relevant_col_list

'1. School ID\n2. School Name\n3. Number of Teachers\n4. Number of Students\n5. School Size\n6. Teacher-Student Ratio'

In [9]:
rel_col_docs = cols_retriever.invoke(relevant_col_list)
rel_col_docs

[Document(page_content='Student-teacher ratio', metadata={'original_col_name': 'STRATIO', 'year': 2022}),
 Document(page_content='School size (Sum)', metadata={'original_col_name': 'SCHSIZE', 'year': 2022}),
 Document(page_content='Student-mathematics teacher ratio', metadata={'original_col_name': 'SMRATIO', 'year': 2022}),
 Document(page_content='Total number of all teachers at school (Sum)', metadata={'original_col_name': 'TOTAT', 'year': 2022})]

In [23]:
def docs2structure(docs):
    t = '[{\n'
    for doc in rel_col_docs:
        t = t + '"' + doc.metadata['original_col_name'] + '", #' + doc.page_content + '\n'
    t = t + '}]'
    return t
print(docs2structure(rel_col_docs))



[{
"STRATIO", #Student-teacher ratio
"SCHSIZE", #School size (Sum)
"SMRATIO", #Student-mathematics teacher ratio
"TOTAT", #Total number of all teachers at school (Sum)
}]


In [27]:
question = "What is average teacher-student ration in mid size schools?"
data_structure = docs2structure(rel_col_docs)
prompt2 = f"Given the following formatted list of data {data_structure}, can you generate a python code, which can answer the following question? the code must return an exact number. \nQuestion: {question}"
res = llm.invoke(prompt2)

In [29]:
print(res.content)

Here is a Python code that calculates the average teacher-student ratio in mid-size schools:

```python
data = [{
    "STRATIO": 20,
    "SCHSIZE": 500,
    "SMRATIO": 25,
    "TOTAT": 30
},
{
    "STRATIO": 15,
    "SCHSIZE": 400,
    "SMRATIO": 20,
    "TOTAT": 25
},
{
    "STRATIO": 18,
    "SCHSIZE": 450,
    "SMRATIO": 22,
    "TOTAT": 27
}
]

mid_size_schools = [school for school in data if school["SCHSIZE"] >= 400 and school["SCHSIZE"] <= 500]

total_teacher_student_ratio = sum(school["TOTAT"] / school["SCHSIZE"] for school in mid_size_schools)
average_teacher_student_ratio = total_teacher_student_ratio / len(mid_size_schools)

print(average_teacher_student_ratio)
```

This code will calculate the average teacher-student ratio in mid-size schools based on the provided data.


In [42]:
rel_cols = [c.metadata['original_col_name'] for c in rel_col_docs]
filtered_df = df_sch[rel_cols]
data = filtered_df.to_dict('records')

In [49]:
mid_size_schools = [school for school in data if school["SCHSIZE"] >= 400 and school["SCHSIZE"] <= 500]

total_teacher_student_ratio = sum([school["TOTAT"] / school["SCHSIZE"] for school in mid_size_schools])
average_teacher_student_ratio = total_teacher_student_ratio / len(mid_size_schools)

print(average_teacher_student_ratio)

nan


In [50]:
total_teacher_student_ratio

nan

In [53]:
def docs2structure(docs):
    t = '[\n'
    for doc in rel_col_docs:
        t = t + '"' + doc.metadata['original_col_name'] + '", #' + doc.page_content + '\n'
    t = t + ']'
    return t
print(docs2structure(rel_col_docs))


[
"STRATIO", #Student-teacher ratio
"SCHSIZE", #School size (Sum)
"SMRATIO", #Student-mathematics teacher ratio
"TOTAT", #Total number of all teachers at school (Sum)
]


In [57]:
question = "What is average teacher-student ration in mid size schools?"
data_structure = docs2structure(rel_col_docs)
prompt2 = f"Given a dataframe with the following columns {data_structure}, and with name 'data_df', can you generate a python code, without sample data, which can answer the following question? the code must return an exact number. \nQuestion: {question}"
res = llm.invoke(prompt2)

In [58]:
print(res.content)

You can use the following Python code to calculate the average teacher-student ratio in mid-size schools:

```python
# Filter the dataframe to include only mid-size schools
mid_size_schools = data_df[data_df['SCHSIZE'] == 'mid']

# Calculate the average teacher-student ratio in mid-size schools
average_teacher_student_ratio = mid_size_schools['TOTAT'].sum() / mid_size_schools['SCHSIZE'].sum()

print(average_teacher_student_ratio)
```

This code first filters the dataframe to include only mid-size schools based on the 'SCHSIZE' column. Then, it calculates the average teacher-student ratio by summing the total number of teachers ('TOTAT') and dividing it by the school size ('SCHSIZE'). Finally, it prints the average teacher-student ratio for mid-size schools.


In [56]:
df = filtered_df
# Filter mid-size schools
mid_size_schools = df[(df["SCHSIZE"] >= 250) & (df["SCHSIZE"] <= 350)]

# Calculate average teacher-student ratio in mid-size schools
average_teacher_student_ratio = mid_size_schools["TOTAT"].sum() / mid_size_schools["SCHSIZE"].sum()

print(average_teacher_student_ratio)

0.09580842682373887


In [60]:
data_df = filtered_df

# Filter the dataframe to include only mid-size schools
mid_size_schools = data_df[data_df['SCHSIZE'] == 'mid']

# Calculate the average teacher-student ratio in mid-size schools
average_teacher_student_ratio = mid_size_schools['TOTAT'].sum() / mid_size_schools['SCHSIZE'].sum()

print(average_teacher_student_ratio)

nan


  average_teacher_student_ratio = mid_size_schools['TOTAT'].sum() / mid_size_schools['SCHSIZE'].sum()


In [61]:
mid_size_schools

Unnamed: 0,STRATIO,SCHSIZE,SMRATIO,TOTAT
