In [1]:
# import pyodide_kernel
# pyodide_kernel.__version__


In [2]:
# Install the third-party packages this notebook needs into the running kernel's
# environment (quiet mode): openpyxl, pandasai and plotly.
# NOTE(review): versions are unpinned, so a fresh run may pull newer releases than
# the ones this notebook was written against — consider pinning.
%pip install -q openpyxl pandasai plotly


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Consolidating 2021-2022 TAPR databases

### Databases:

- Staff
- Student
- District Reference
- Accountability Summary

In [None]:
import os
from string import ascii_uppercase as letters

import numpy as np
import pandas as pd
# import pygwalker as pyg
from IPython.display import display
from pandasai import PandasAI

# Instantiate a LLM
from pandasai.llm.openai import OpenAI
# Take the OpenAI key from the OPENAI_API_KEY environment variable so a real
# secret never has to be typed into (and saved with) the notebook. The original
# "XXXX" placeholder is kept as the fallback, so behaviour is unchanged when the
# variable is not set.
llm = OpenAI(api_token=os.environ.get("OPENAI_API_KEY", "XXXX"))

# School year of the TAPR release; it is also the name of the folder that holds
# the downloaded CSV reports and their *_reference.xlsx legends.
SCHOOL_YEAR = "2021-2022"

# Setting up the paths of the TAPR data
district_reference_path = f"{SCHOOL_YEAR}/DREF.csv"
district_reference_reference = f"{SCHOOL_YEAR}/DREF_reference.xlsx"

# The district reference report is the base frame that the other reports are
# merged onto further down.
df = pd.read_csv(district_reference_path)

def gen_header_column_legend(path):
    """Build a ``{column code: readable label}`` mapping from a reference workbook.

    The first sheet of the workbook at ``path`` is read; its ``NAME`` column
    supplies the keys and its ``LABEL`` column the values. Trailing
    non-breaking spaces (``\\xa0``) are removed from both.

    Rows whose ``NAME`` or ``LABEL`` cell is empty are skipped, and non-string
    cells are converted with ``str`` before cleaning, so a blank or numeric
    cell no longer raises ``AttributeError``.

    Args:
        path: Location of the Excel workbook (anything ``pd.ExcelFile`` accepts).

    Returns:
        dict mapping each cleaned ``NAME`` to its cleaned ``LABEL``. If a name
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If the first sheet lacks a ``NAME`` or ``LABEL`` column.
    """
    # The context manager guarantees the workbook handle is closed, even if
    # parsing fails.
    with pd.ExcelFile(path) as xl:
        legend = xl.parse(xl.sheet_names[0])

    # Make sure the required columns exist
    if 'NAME' not in legend.columns or 'LABEL' not in legend.columns:
        raise ValueError("The provided Excel file doesn't have the required 'NAME' or 'LABEL' columns.")

    # A row without both a code and a label cannot contribute a rename entry.
    legend = legend.dropna(subset=['NAME', 'LABEL'])

    # Strip trailing non-breaking spaces from keys and values.
    return {
        str(name).rstrip('\xa0'): str(label).rstrip('\xa0')
        for name, label in zip(legend['NAME'], legend['LABEL'])
    }

def pad_district_number(dist_num):
    """Return ``dist_num`` as text, zero-padded to six characters and prefixed with ``'``.

    The value is converted with ``str``, every apostrophe and backtick is
    removed, and zeros are added on the left until the text is six characters
    long (longer values are left as they are). A single leading apostrophe is
    then put in front of the result.
    """
    cleaned = "".join(ch for ch in str(dist_num) if ch not in "'`")
    return "'" + cleaned.rjust(6, "0")

# Swap the coded headers of the district reference frame for their readable labels.
district_reference_dict = gen_header_column_legend(district_reference_reference)
df = df.rename(columns=district_reference_dict)

# Put the merge key into one canonical text form before any join.
df["District Number"] = df["District Number"].apply(pad_district_number)

# Each remaining report is a CSV plus the workbook that explains its headers.
staff_path = f"{SCHOOL_YEAR}/DSTAF.csv"
staff_reference = f"{SCHOOL_YEAR}/DSTAF_reference.xlsx"

student_path = f"{SCHOOL_YEAR}/DSTUD.csv"
student_reference = f"{SCHOOL_YEAR}/DSTUD_reference.xlsx"

accountability_summary_path = f"{SCHOOL_YEAR}/DISTRATE.csv"
accountability_summary_reference = f"{SCHOOL_YEAR}/DISTRATE_reference.xlsx"

tapr_reports = [
    (staff_path, staff_reference),
    (student_path, student_reference),
    (accountability_summary_path, accountability_summary_reference),
]

# Left-join every report onto the district reference frame, one at a time.
for report_path, reference_path in tapr_reports:
    df_local = pd.read_csv(report_path)

    reference_dict = gen_header_column_legend(reference_path)
    df_local = df_local.rename(columns=reference_dict)

    df_local["District Number"] = df_local["District Number"].apply(pad_district_number)

    # Columns that df already has are NOT dropped from df_local here, so any
    # label present on both sides comes out of the merge as "<label>_x" /
    # "<label>_y" (pandas' default suffixes).
    df = df.merge(df_local, how="left", on="District Number")

df


In [1]:
# Write the merged frame next to the notebook. The file name is built from
# SCHOOL_YEAR so it always matches the data that was actually loaded (for
# "2021-2022" this is the same name as before).
df.to_csv(f"{SCHOOL_YEAR} TAPR_Merged Selected Reports.csv", index=False)

NameError: name 'df' is not defined

Shareable Link to Consolidated 2021-2022 TAPR Reports: https://adpena.github.io/improved-train/lab?path=data%2FTAPR%2F2021-2022+TAPR_Merged+Selected+Reports.csv
  

In [None]:
# gwalker = pyg.walk(df)

In [None]:
import numpy as np

# Charter-operator flag column as it is named in the merged frame.
charter_flag_column = 'District 2022 Flag - Charter Operator (Y/N)_x'

# Identify numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

# Split the numeric data into charter operators ('Y') and other districts ('N').
# .loc selects rows and columns in one step instead of chained indexing.
charter_schools = df.loc[df[charter_flag_column] == 'Y', numeric_columns]
public_schools = df.loc[df[charter_flag_column] == 'N', numeric_columns]

# Min-max scale BOTH groups with the same per-column bounds, taken from the
# rows of the two groups together. Scaling each group by its own min/max would
# place the two means on different scales and make their difference
# meaningless (a group whose values are all ten times larger would still get
# the same normalized mean).
compared_rows = df.loc[df[charter_flag_column].isin(['Y', 'N']), numeric_columns]
column_min = compared_rows.min()
# A column that is constant across both groups has range 0; 0/0 yields NaN and
# the column is removed by dropna() below.
column_range = compared_rows.max() - column_min

charter_schools_normalized = (charter_schools - column_min) / column_range
public_schools_normalized = (public_schools - column_min) / column_range

# Calculate means
charter_schools_means = charter_schools_normalized.mean()
public_schools_means = public_schools_normalized.mean()

# Combine into one dataframe; metrics without a mean for both groups are dropped
mean_comparison = pd.DataFrame({
    'Charter Schools': charter_schools_means,
    'Public Schools': public_schools_means
}).dropna()

# Sort by the difference between the two types of schools
mean_comparison['Difference'] = mean_comparison['Charter Schools'] - mean_comparison['Public Schools']
mean_comparison = mean_comparison.sort_values(by='Difference', ascending=False)

# Display the first few rows of the comparison
mean_comparison.head()


In [None]:
import plotly.graph_objects as go
from plotly.offline import plot

# Create the interactive chart: one bar per metric and group, taken from the
# rows of mean_comparison.
fig = go.Figure(data=[
    go.Bar(name='Charter Schools', x=mean_comparison.index, y=mean_comparison['Charter Schools']),
    go.Bar(name='Public Schools', x=mean_comparison.index, y=mean_comparison['Public Schools'])
])

# Place the two groups side by side and label the figure so it can be read on
# its own (e.g. in the exported HTML file).
fig.update_layout(
    barmode='group',
    title='Charter Schools vs. Public Schools: normalized means of numeric TAPR columns',
    xaxis_title='TAPR column',
    yaxis_title='Mean of min-max normalized value',
)

# Show the figure
fig.show()

# Also save the chart as a standalone HTML file
plot(fig, filename='comparison_chart.html')
