In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sqlalchemy import create_engine
from utils import db_connect

pd.set_option('display.max_columns', None)

engine = db_connect()

# bring in the data
main_df = pd.read_sql('SELECT * FROM combined_data', con=engine)

Connection successful


In [2]:
# function to mark whether a school was operational all five years of data
# creates our 'Currently operational' feature
def mark_always_operational(main_df):
    
    total_years = main_df['SURVYEAR'].nunique()
    
    main_df['SY_STATUS_TEXT'] = main_df['SY_STATUS_TEXT'].str.strip()
    
    operational_counts = (
        main_df[main_df['SY_STATUS_TEXT'] == 'Currently operational']
        .groupby('NCESSCH')['SURVYEAR']
        .nunique()
    )
    
    always_operational_schools = operational_counts[operational_counts == total_years].index
    
    main_df['concurrently_operational'] = main_df['NCESSCH'].isin(always_operational_schools)
    
    return main_df

In [3]:
# apply the function
mark_always_operational(main_df=main_df)

Unnamed: 0,X,Y,OBJECTID,NCESSCH,SURVYEAR,STABR,LEAID,ST_LEAID,LEA_NAME,SCH_NAME,...,HI,TRALM,TRALF,TR,WHALM,WHALF,WH,LATCOD,LONCOD,concurrently_operational
0,-111.68205,33.3086670000001,2901,40088403264,2020-2021,AZ,400884,AZ-92327,ASU Preparatory Academy (92327),ASU Preparatory Academy-Polytechnic Middle School,...,155,14,12,26,71,55,126,33.308667,-111.682045,True
1,-112.40877,33.4552610000001,2902,40088503410,2020-2021,AZ,400885,AZ-91326,Incito Schools (91326),Incito Schools,...,125,7,5,12,55,73,128,33.455261,-112.408765,True
2,-111.88974,33.541458,2903,40088603407,2020-2021,AZ,400886,AZ-92314,Archway Classical Academy Cicero (92314),Great Hearts Academies - Archway Cicero,...,82,19,17,36,180,190,370,33.541458,-111.889741,True
3,-112.08183,33.43949,2904,40088703415,2020-2021,AZ,400887,AZ-91948,Vista College Preparatory Inc. (91948),Vista College Preparatory,...,299,2,3,5,7,6,13,33.43949,-112.081829,True
4,-112.15654,33.466285,2905,40088703561,2020-2021,AZ,400887,AZ-91948,Vista College Preparatory Inc. (91948),Vista College Prep - Maryvale,...,333,0,2,2,3,2,5,33.466285,-112.156535,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504833,-76.3945109998101,37.0199299997691,100647,510180003048,2017-2018,VA,5101800,VA-112,HAMPTON CITY PBLC SCHS,Campus at Lee,...,,,,,,,,37.01993,-76.394511,False
504834,-77.5677569998528,38.9262839996828,100648,510225003046,2017-2018,VA,5102250,VA-053,LOUDOUN CO PBLC SCHS,Goshen Post Elementary,...,,,,,,,,38.926284,-77.567757,False
504835,-77.5896399996999,38.9275770001582,100649,510225003059,2017-2018,VA,5102250,VA-053,LOUDOUN CO PBLC SCHS,Willard Intermediate School,...,,,,,,,,38.927577,-77.58964,False
504836,-77.4676410002145,38.7472899997469,100650,510236003047,2017-2018,VA,5102360,VA-143,MANASSAS CITY PBLC SCHS,Governor's STEM Academy at Osburn High School,...,,,,,,,,38.74729,-77.467641,False


In [4]:
# drop records that were not fully operational across all five years
main_df = main_df[main_df['concurrently_operational'] != 'False']

In [5]:
# strip whitespace
for col in main_df.select_dtypes(include=["string"]).columns:
    main_df[col] = main_df[col].str.strip()

In [6]:
# begin defining data type conversion processes
# Change columns to floats
float_cols = ["X", "Y", "LATCOD", "LONCOD", "FTE", "STUTERATIO"]

# change columns to int
int_cols = [
    "OBJECTID", "GSLO", "GSHI",
    "TOTFRL", "FRELCH", "REDLCH", "DIRECTCERT",
    "PK", "KG", "G01", "G02", "G03", "G04", "G05", "G06",
    "G07", "G08", "G09", "G10", "G11", "G12", "G13",
    "UG", "AE",
    "TOTMENROL", "TOTFENROL", "TOTAL", "MEMBER",
    "AMALM", "AMALF", "AM",
    "ASALM", "ASALF", "AS",
    "BLALM", "BLALF", "BL",
    "HPALM", "HPALF", "HP",
    "HIALM", "HIALF", "HI",
    "TRALM", "TRALF", "TR",
    "WHALM", "WHALF", "WH"
]

# Change columns to strings
string_cols = [
    "NCESSCH", "SURVYEAR", "STABR", "LEAID", "ST_LEAID",
    "LEA_NAME", "SCH_NAME",
    "LSTREET1", "LSTREET2", "LCITY", "LSTATE",
    "LZIP", "LZIP4", "PHONE",
    "VIRTUAL", "SCHOOL_LEVEL", "SCHOOL_TYPE_TEXT",
    "STATUS", "SY_STATUS_TEXT", "ULOCALE", "NMCNTY",
    "CHARTER_TEXT", "LSTREET3", "TITLEI", "STITLEI", "MAGNET_TEXT"
]

In [7]:
# function to clean NCES error codes
def clean_nces_error_codes(main_df, cols):
    error_values = [
    "M", "N", "-1", "-2", "-9",
    "Missing", "Not applicable", "Not Applicable",
    -1, -2, -9
]
    main_df[cols] = main_df[cols].replace(error_values, np.nan)
    return main_df

In [18]:
# clean ALL columns 
cols = float_cols + int_cols + string_cols
main_df = clean_nces_error_codes(main_df, cols)

In [9]:
# convert floats safely
for col in float_cols:
    main_df[col] = pd.to_numeric(main_df[col], errors="coerce")

In [10]:
# convert ints safely
for col in int_cols:
    main_df[col] = pd.to_numeric(main_df[col], errors="coerce").astype("Int64")

In [11]:
# convert strings
for col in string_cols:
    main_df[col] = main_df[col].astype("string")

In [12]:
# round coordinates
main_df["LATCOD"] = main_df["LATCOD"].round(4)
main_df["LONCOD"] = main_df["LONCOD"].round(4)

In [13]:
# extract start Year - convert to int for sorting
main_df['SURVYEAR'] = main_df['SURVYEAR'].str[:4].astype(int)

In [15]:
# drop all non-concurrently operational
main_df = main_df[main_df['concurrently_operational'] != 'False']

In [20]:
# removing charter schools
main_df = main_df[main_df['CHARTER_TEXT'].isin(['No'])]

In [22]:
# drop the charter text feature
main_df.drop(columns='CHARTER_TEXT', inplace=True)

In [24]:
# removing magnet schools
main_df = main_df[main_df['MAGNET_TEXT'].isin(['No'])]

In [25]:
# drop the magnet text feature
main_df.drop(columns='MAGNET_TEXT', inplace=True)

In [32]:
# removing virtual schools
main_df = main_df[main_df['VIRTUAL'].isin(['Not Virtual', 'Not a virtual school'])]

In [33]:
# drop the virtual feature
main_df.drop(columns='VIRTUAL', inplace=True)

In [42]:
# only keeping 'regular' public schools, removing: [ 'Career and Technical School',
# 'Special education school', 'Alternative Education School',
# 'Alternative/other school', 'Vocational school']
main_df = main_df[main_df['SCHOOL_TYPE_TEXT'].isin(['Regular school'])]

In [43]:
# drop the SCHOOL_TYPE_TEXT feature
main_df.drop(columns='SCHOOL_TYPE_TEXT', inplace=True)

In [45]:
main_df.head(1)

Unnamed: 0,X,Y,OBJECTID,NCESSCH,SURVYEAR,STABR,LEAID,ST_LEAID,LEA_NAME,SCH_NAME,LSTREET1,LSTREET2,LSTREET3,LCITY,LSTATE,LZIP,LZIP4,PHONE,GSLO,GSHI,SCHOOL_LEVEL,TITLEI,STITLEI,STATUS,SY_STATUS_TEXT,ULOCALE,NMCNTY,TOTFRL,FRELCH,REDLCH,DIRECTCERT,PK,KG,G01,G02,G03,G04,G05,G06,G07,G08,G09,G10,G11,G12,G13,UG,AE,TOTFENROL,TOTMENROL,TOTAL,MEMBER,FTE,STUTERATIO,AMALM,AMALF,AM,ASALM,ASALF,AS,BLALM,BLALF,BL,HPALM,HPALF,HP,HIALM,HIALF,HI,TRALM,TRALF,TR,WHALM,WHALF,WH,LATCOD,LONCOD,concurrently_operational
2934,-104.03011,38.127589,16993,80411000568,2020,CO,804110,CO-2540,Fowler School District No. R4J,Fowler High School,600 WEST GRANT,,,FOWLER,CO,81039,1502,(719)263-4279,9,12,High,2-No,,1,Currently operational,43-Rural: Remote,Otero County,64,41,23,,,,,,,,,,,,28,27,28,30,,,,,,113,113,11.23,10.06,,,,,,,,,,,,,10,15,25,1,,1,45,42,87,38.1276,-104.0301,True


In [49]:
# define additional redundant columns
redundant_cols = ['X', 'Y', 'OBJECTID', 'ST_LEAID', 'LSTREET1', 'LSTREET2', 'LSTREET3', 
                  'LZIP4', 'PHONE', 'AMALM', 'AMALF', 'ASALM', 'ASALF', 
                  'BLALM', 'BLALF', 'HPALM', 'HPALF', 'HIALM', 'HIALF', 'TRALM', 'TRALF', 
                  'WHALM', 'WHALF', 'STABR', 'LCITY', 'LSTATE', 'LZIP', 'SCHOOL_LEVEL', 'GSLO', 'GSHI'
                  , 'TITLEI', 'STITLEI', 'STATUS', 'SY_STATUS_TEXT', 'NMCNTY', 'TOTFRL', 'FRELCH'
                  ,'REDLCH', 'DIRECTCERT', 'AE', 'TOTFENROL', 'TOTMENROL']

In [50]:
main_df = main_df.drop(columns=redundant_cols)

In [None]:
# replace na values with 0
main_df = main_df.fillna(0)

In [None]:
# ensure all records contain at minimum 5 years


(180247, 36)