In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sqlalchemy import create_engine
from utils import db_connect

In [None]:
# To avoid data type conversion errors, every column in all five of our data sources was forced to text.
# The sources are combined in a database view called combined_data, which can now be loaded into a DataFrame without error.
# Columns will need to be converted to their correct data types before further work.
engine = db_connect()

main_df = pd.read_sql('SELECT * FROM combined_data', con=engine)
main_df.head(1)

In [None]:
main_df.info()

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
main_df.head(1)

In [None]:
# LOOK THROUGH ALL TODOs PRIOR TO DROPPING ANY COLS
# Candidate columns to drop. Every entry must be a real column name: an empty-string
# placeholder would raise KeyError if this list is passed to DataFrame.drop
# (assuming no column is literally named '').
redundant_cols = [
    'X', 'Y', 'OBJECTID', 'ST_LEAID', 'LEA_NAME', 'LSTREET1', 'LSTREET2', 'LSTREET3',
    'LZIP4', 'PHONE', 'CHARTER_TEXT', 'MAGNET_TEXT', 'AMALM', 'AMALF', 'ASALM', 'ASALF',
    'BLALM', 'BLALF', 'HPALM', 'HPALF', 'HIALM', 'HIALF', 'TRALM', 'TRALF',
    'WHALM', 'WHALF',
]

In [None]:
main_df.shape

In [None]:
main_df['OBJECTID'].nunique()

In [None]:
# OBJECTID is not a stable school identifier: rows that share an OBJECTID
# are different schools across survey years. Keeping it would be confusing, so it will be dropped.
main_df[main_df['OBJECTID'] == '1']

In [None]:
# LEAID and ST_LEAID seem to point to the same subsets, so only one of them is needed as an identifier.

In [None]:
# Exclude all charter schools (and rows where charter status is missing) from our set. We are focused on publicly funded schools.

In [None]:
main_df[main_df['CHARTER_TEXT'] == 'Yes'].shape

In [None]:
main_df[main_df['CHARTER_TEXT'] == 'Missing'].shape

In [None]:
# TODO: ONCE DATAFRAME IS CLEANED WITH CORRECT DATATYPES AND REDUNDANT COLUMNS DROPPED: 
    # Drop all tables and views in the DB, replace with cleaned data. Store raw original files locally

In [None]:
main_df.head(1)

In [None]:
main_df['MAGNET_TEXT'].nunique()

In [None]:
main_df['MAGNET_TEXT'].unique()

In [None]:
main_df[main_df['MAGNET_TEXT'] == 'Yes'].shape

In [None]:
main_df['MAGNET_TEXT'].isna().sum()

In [None]:
main_df[main_df['MAGNET_TEXT'] == 'M'].shape

In [None]:
main_df[main_df['MAGNET_TEXT'] == 'Missing'].shape

In [None]:
main_df[main_df['MAGNET_TEXT'] == 'Not applicable'].shape

In [None]:
main_df[main_df['MAGNET_TEXT'] == 'Not Applicable'].shape

In [None]:
# TODO: convert all text placeholders for missing values (e.g. 'Missing') to actual NaN

In [None]:
main_df['VIRTUAL'].nunique()

In [None]:
main_df['VIRTUAL'].unique()

In [None]:
# TODO revisit below
main_df[main_df['VIRTUAL'].isin(['Not Virtual', 'Not a virtual school', 'N', 'Not Applicable'])].shape

In [None]:
main_df['SY_STATUS_TEXT'].nunique()

In [None]:
main_df['SY_STATUS_TEXT'].unique()

In [None]:
# Count the rows whose status is 'Currently operational'. The raw values appear to carry a
# trailing space, and mark_always_operational later strips SY_STATUS_TEXT in place, so strip
# before comparing: the cell then gives the same answer regardless of execution order.
main_df[main_df['SY_STATUS_TEXT'].str.strip() == 'Currently operational'].shape

In [None]:
# A function that tells us whether or not a single NCESSCH (ID for a specific school) was under the 
# status of 'Currently operational ' in the SY_STATUS_TEXT column every single survey year in our dataset.
def mark_always_operational(main_df):
    """Flag schools that were 'Currently operational' in every survey year.

    Works on ``main_df`` in place: ``SY_STATUS_TEXT`` is overwritten with its
    whitespace-stripped values and a boolean ``concurrently_operational``
    column is added. A school (``NCESSCH``) is flagged True on all of its rows
    when the number of distinct ``SURVYEAR`` values in which its status is
    'Currently operational' equals the number of distinct ``SURVYEAR`` values
    in the whole frame.

    Parameters
    ----------
    main_df : pandas.DataFrame
        Must contain the columns ``NCESSCH``, ``SURVYEAR`` and ``SY_STATUS_TEXT``.

    Returns
    -------
    pandas.DataFrame
        The same ``main_df`` object that was passed in.
    """
    # Strip surrounding whitespace so values such as 'Currently operational ' match below.
    status = main_df['SY_STATUS_TEXT'].str.strip()
    main_df['SY_STATUS_TEXT'] = status

    n_survey_years = main_df['SURVYEAR'].nunique()

    # For each school, count the distinct survey years in which it was operational.
    operational_rows = main_df[status == 'Currently operational']
    years_operational = operational_rows.groupby('NCESSCH')['SURVYEAR'].nunique()

    # Keep the IDs of schools that were operational in every survey year present in the data.
    every_year = years_operational == n_survey_years
    always_operational_ids = years_operational[every_year].index

    main_df['concurrently_operational'] = main_df['NCESSCH'].isin(always_operational_ids)
    return main_df

In [None]:
# mark_always_operational mutates main_df in place and returns the same object; binding the
# result keeps the lineage explicit and avoids rendering the whole frame as the cell output.
main_df = mark_always_operational(main_df=main_df)

In [None]:
main_df['concurrently_operational'].unique()

In [None]:
main_df[main_df['concurrently_operational'] == 1][213943:223943].head(5)

In [None]:
main_df[main_df['NCESSCH'] == '391000205134']

In [None]:
main_df[main_df['NCESSCH'] == '391000301634']

In [None]:
main_df[main_df['concurrently_operational'] == 0].head(5)

In [None]:
main_df[main_df['NCESSCH'] == '10000901403']

In [None]:
main_df[main_df['NCESSCH'] == '40181003073']

In [None]:
main_df.head(1)

In [None]:
# Drop schools that were not 'Currently operational' in every survey year, to preserve the
# integrity of the data.
# 'concurrently_operational' is a boolean column, so use it directly as the row mask.
# Comparing it with the string 'False' is True for every row and would drop nothing.
main_df = main_df[main_df['concurrently_operational']]


In [None]:
# By dropping charter schools from our data set, we make the CHARTER_TEXT feature redundant,
# so it can then be dropped.

# CHARTER_TEXT values listed during exploration:
# ['No', 'Yes', 'N', 'Not applicable', 'Missing', 'Not Applicable']
# Keep only the non-charter values; this drops 'Yes' and 'Missing' (and NaN, which isin() does not match here).
# NOTE(review): 'N' is kept on the assumption that it is a non-charter code, consistent with how
# 'N' is treated by the MAGNET_TEXT and VIRTUAL filters in this notebook - confirm against the data dictionary.
main_df = main_df[main_df['CHARTER_TEXT'].isin(['No', 'N', 'Not applicable', 'Not Applicable'])]

In [None]:
# Remove the CHARTER_TEXT column by rebinding main_df to the reduced frame.
main_df = main_df.drop(columns=['CHARTER_TEXT'])

In [None]:
main_df.head(1)

In [None]:
# TODO: Drop all magnet-program schools from the data set, then drop the MAGNET_TEXT feature
main_df['MAGNET_TEXT'].unique()

In [None]:
main_df.shape

In [None]:
main_df[main_df["MAGNET_TEXT"].isna()].shape

In [None]:
# Keep only rows whose MAGNET_TEXT is one of the values treated here as "not a magnet school";
# every other row (including rows where MAGNET_TEXT is NaN) is dropped.
main_df = main_df.loc[
    main_df['MAGNET_TEXT'].isin(['No', 'Not applicable', 'Not Applicable', 'N'])
]

In [None]:
main_df.shape

In [None]:
main_df.columns

In [None]:
# TODO finish cleaning dataframe, drop columns, etc.
# TODO decide what to do with NaNs
# TODO Save raw csvs locally
# TODO purge DB
# TODO Move cleaned dataframe into DB