In [21]:
import sys, subprocess
from io import StringIO
import pandas as pd
from importlib import reload
# import mdb_pandas_api

# verbose = True
# reload(mdb_pandas_api)

In [22]:
# Read mdb into a dictionary of pandas tables
def mdb_to_pandas(database_path):
    subprocess.call(["mdb-schema", database_path, "mysql"])
    # Get the list of table names with "mdb-tables"
    table_names = subprocess.Popen(["mdb-tables", "-1", database_path],
                                   stdout=subprocess.PIPE).communicate()[0]
    tables = table_names.splitlines()
    sys.stdout.flush()

    # Dump each table as a stringio using "mdb-export",
    out_tables = {}
    for rtable in tables:
        table = rtable.decode()
        if verbose: print('running table:',table)
        if table != '':
            if verbose: print("Dumping " + table)
            contents = subprocess.Popen(["mdb-export", database_path, table],
                                        stdout=subprocess.PIPE).communicate()[0]
            temp_io = StringIO(contents.decode())
            print(table, temp_io)
            out_tables[table] = pd.read_csv(temp_io)
    return out_tables

# %% Merge staff qualification tables for 2015-2019
def clean_2015_2017_data(df, year):
    df.rename({'SCHOOL_NAME': 'ENTITY_NAME'}, axis='columns', inplace=True)
    df.rename({'NUM_FEWER_3YRS_EXP': 'NUM_TEACH_INEXP'}, axis='columns', inplace=True)
    df.rename({'NUM_TEACH_OUT_CERT': 'NUM_OUT_CERT'}, axis='columns', inplace=True)
    df = df[['ENTITY_CD', 'NUM_TEACH', 'NUM_TEACH_INEXP', 'NUM_OUT_CERT']]
    df = df.groupby(['ENTITY_CD']).sum()
    df['YEAR'] = year
    df['PER_TEACH_INEXP'] = 100 * df['NUM_TEACH_INEXP'] / df['NUM_TEACH']
    df['PER_OUT_CERT'] = 100 * df['NUM_OUT_CERT'] / df['NUM_TEACH']
    df.update(df[['PER_TEACH_INEXP','PER_OUT_CERT']].fillna(0))
    df = df.round(decimals=2)
    return df

def clean_2018_2019_data(df, year):
    df = df[['ENTITY_CD', 'NUM_TEACH', 'NUM_TEACH_INEXP', 'NUM_TEACH_OC', 'NUM_OUT_CERT']]
    df = df.groupby(['ENTITY_CD']).sum()
    df['YEAR'] = year
    df['PER_TEACH_INEXP'] = 100 * df['NUM_TEACH_INEXP'] / df['NUM_TEACH']
    df['PER_OUT_CERT'] = 100 * df['NUM_OUT_CERT'] / df['NUM_TEACH_OC']
    df.update(df[['PER_TEACH_INEXP','PER_OUT_CERT']].fillna(0))
    df.drop(['NUM_TEACH_OC'], axis=1, inplace=True)
    df = df.round(decimals=2)
    return df


## Grad Rate Tables Merge
This code appears to take all the graduation rate tables and merge them into a single df with all the years.  Apparently, we are using all uppercase names for all columns for some reason.

#### Important Fields
* YEAR = The year the graduation data came from
* AGGREGATION_INDEX (/NAME / TYPE /CODE) - The level of aggregation for the data set (e.g. state, county, etc.)
* NRC_CODE = This is the Needs to Resource 

In [26]:
%%time
# Merge graduation rate tables for 2015-2019
filepath = '../Data/df_grad_rate_15.pkl'
df_grad_rate_15 = pd.read_pickle(filepath)
df_grad_rate_15['YEAR'] = 2015

# print(df_grad_rate_15.columns)
# print('--------------------------------')

filepath = '../Data/df_grad_rate_16.pkl'
df_grad_rate_16 = pd.read_pickle(filepath)
df_grad_rate_16['YEAR'] = 2016

# print(df_grad_rate_16.columns)
# print('--------------------------------')

filepath = '../Data/df_grad_rate_17.pkl'
df_grad_rate_17 = pd.read_pickle(filepath)
df_grad_rate_17['YEAR'] = 2017

# print(df_grad_rate_17.columns)
# print('--------------------------------')

filepath = '../Data/df_grad_rate_18.pkl'
df_grad_rate_18 = pd.read_pickle(filepath)
df_grad_rate_18['YEAR'] = 2018

# print(df_grad_rate_18.columns)
# print('--------------------------------')

filepath = '../Data/df_grad_rate_19.pkl'
df_grad_rate_19 = pd.read_pickle(filepath)
df_grad_rate_19.columns = df_grad_rate_19.columns.str.upper()
df_grad_rate_19['YEAR'] = 2019

# print(df_grad_rate_19.columns)
# print('--------------------------------')

dataframe_array = [df_grad_rate_15, df_grad_rate_16, df_grad_rate_17, df_grad_rate_18, \
                   df_grad_rate_19]
df_grad_rate_all = pd.concat(dataframe_array)
df_grad_rate_all.to_pickle('../df_grad_rate_all.pkl')



CPU times: user 1.53 s, sys: 542 ms, total: 2.07 s
Wall time: 2.09 s


In [25]:
df_grad_rate_all.head()

Unnamed: 0,REPORT_SCHOOL_YEAR,AGGREGATION_INDEX,AGGREGATION_TYPE,AGGREGATION_CODE,AGGREGATION_NAME,ENTITY_INACTIVE_DATE,LEA_BEDS,LEA_NAME,NRC_CODE,NRC_DESC,...,REG_ADV_PCT,NON_DIPLOMA_CREDENTIAL_CNT,NON_DIPLOMA_CREDENTIAL_PCT,STILL_ENR_CNT,STILL_ENR_PCT,GED_CNT,GED_PCT,DROPOUT_CNT,DROPOUT_PCT,YEAR
0,2014-15,0.0,Statewide,0,All Districts and Charters,,,,,,...,31%,3339,2%,7304,3%,2001,1%,24284,11%,2015
1,2014-15,0.0,Statewide,0,All Districts and Charters,,,,,,...,34%,1186,1%,3020,3%,769,1%,10329,10%,2015
2,2014-15,0.0,Statewide,0,All Districts and Charters,,,,,,...,28%,2153,2%,4284,4%,1232,1%,13955,13%,2015
3,2014-15,0.0,Statewide,0,All Districts and Charters,,,,,,...,17%,27,2%,40,4%,25,2%,231,21%,2015
4,2014-15,0.0,Statewide,0,All Districts and Charters,,,,,,...,10%,900,2%,2394,6%,476,1%,7080,17%,2015


In [None]:
filepath = '../Data/df_grad_rate_19.pkl'
df_grad_rate_19 = pd.read_pickle(filepath)

filepath = '../Data/df_staff_qualifications_19_20.pkl'
df_staff_qualifications_19_20 = pd.read_pickle(filepath)

In [19]:
# Query tables
"""
# Graduation Rate and Outcomes 2019
database_path = 'gradrate_2019/GRAD_RATE_AND_OUTCOMES_2019.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_grad_rate_19 = out_tables['GRAD_RATE_AND_OUTCOMES_2019']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_grad_rate_19.columns)
print('\ntable head = ', df_grad_rate_19.head())
print('unique nrc_desc', df_grad_rate_19['nrc_desc'].unique())
print('unique subgroup_name', df_grad_rate_19['subgroup_name'])
"""
"""
# Graduation Rate and Outcomes 2018
database_path = 'gradrate_2018/GRAD_RATE_AND_OUTCOMES_2018.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_grad_rate_18 = out_tables['GRAD_RATE_AND_OUTCOMES_2018']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_grad_rate_18.columns)
print('\ntable head = ', df_grad_rate_18.head())

# Graduation Rate and Outcomes 2017
database_path = 'gradrate_2017/GRAD_RATE_AND_OUTCOMES_2017.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_grad_rate_17 = out_tables['GRAD_RATE_AND_OUTCOMES_2017']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_grad_rate_17.columns)
print('\ntable head = ', df_grad_rate_17.head())

# Graduation Rate and Outcomes 2016
database_path = 'gradrate_2016/GRAD_RATE_AND_OUTCOMES_2016.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_grad_rate_16 = out_tables['GRAD_RATE_AND_OUTCOMES_2016']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_grad_rate_16.columns)
print('\ntable head = ', df_grad_rate_16.head())

# Graduation Rate and Outcomes 2015
database_path = 'gradrate_2015/GRAD_RATE_AND_OUTCOMES_2015.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_grad_rate_15 = out_tables['GRAD_RATE_AND_OUTCOMES_2015']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_grad_rate_15.columns)
print('\ntable head = ', df_grad_rate_15.head())


# Report Card Staff Qualifications 2019
database_path = '../Data/SRC2019_20200703.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_staff_qualifications_19_20 = out_tables['Staff Qualifications']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_staff_qualifications_19_20.columns)
print('\ntable head = ', df_staff_qualifications_19_20.head())

df_staff_qualifications_19_20.to_pickle('../Data/df_staff_qualifications_19_20.pkl')

# Report Card Staff Qualifications 2018
database_path = 'SRC2018/SRC2018_20190627.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_staff_qualifications_18_19 = out_tables['Staff Qualifications']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_staff_qualifications_18_19.columns)
print('\ntable head = ', df_staff_qualifications_18_19.head())

# Report Card Staff Qualifications 2017
database_path = 'SRC2017/SRC2017GroupIIRelease.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_staff_qualifications_17 = out_tables['Staff']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_staff_qualifications_17.columns)
print('\ntable head = ', df_staff_qualifications_17.head())

# Report Card Staff Qualifications 2016
database_path = 'SRC2016/SRC2016_GroupIII.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_staff_qualifications_16 = out_tables['Staff']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_staff_qualifications_16.columns)
print('\ntable head = ', df_staff_qualifications_16.head())

# Report Card Staff Qualifications 2015
database_path = 'SRC2015/SRC2015.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_staff_qualifications_15 = out_tables['Staff']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_staff_qualifications_15.columns)
print('\ntable head = ', df_staff_qualifications_15.head())
"""

"\n# Graduation Rate and Outcomes 2018\ndatabase_path = 'gradrate_2018/GRAD_RATE_AND_OUTCOMES_2018.mdb'\nout_tables = mdb_to_pandas(database_path)\ntable_names = list(out_tables.keys())\ndf_grad_rate_18 = out_tables['GRAD_RATE_AND_OUTCOMES_2018']\n\nprint('-------')\nprint('\ntable names = ', table_names)\nprint('\ntable columns = ', df_grad_rate_18.columns)\nprint('\ntable head = ', df_grad_rate_18.head())\n\n# Graduation Rate and Outcomes 2017\ndatabase_path = 'gradrate_2017/GRAD_RATE_AND_OUTCOMES_2017.mdb'\nout_tables = mdb_to_pandas(database_path)\ntable_names = list(out_tables.keys())\ndf_grad_rate_17 = out_tables['GRAD_RATE_AND_OUTCOMES_2017']\n\nprint('-------')\nprint('\ntable names = ', table_names)\nprint('\ntable columns = ', df_grad_rate_17.columns)\nprint('\ntable head = ', df_grad_rate_17.head())\n\n# Graduation Rate and Outcomes 2016\ndatabase_path = 'gradrate_2016/GRAD_RATE_AND_OUTCOMES_2016.mdb'\nout_tables = mdb_to_pandas(database_path)\ntable_names = list(out_tables.

In [17]:
# Report Card Staff Qualifications 2019
database_path = '../Data/SRC2019_20200703.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_staff_qualifications_19_20 = out_tables['Staff Qualifications']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_staff_qualifications_19_20.columns)
print('\ntable head = ', df_staff_qualifications_19_20.head())

df_staff_qualifications_19_20.to_pickle('../Data/df_staff_qualifications_19_20.pkl')

KeyError: 'Staff Qualifications'

In [16]:
# Report Card Staff Qualifications 2018
database_path = 'SRC2018/SRC2018_20190627.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_staff_qualifications_18_19 = out_tables['Staff Qualifications']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_staff_qualifications_18_19.columns)
print('\ntable head = ', df_staff_qualifications_18_19.head())

NameError: name 'df_staff_qualifications' is not defined

In [None]:
# Report Card Staff Qualifications 2017
database_path = 'SRC2017/SRC2017GroupIIRelease.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_staff_qualifications_17 = out_tables['Staff']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_staff_qualifications_17.columns)
print('\ntable head = ', df_staff_qualifications_17.head())

In [None]:
# Report Card Staff Qualifications 2016
database_path = 'SRC2016/SRC2016_GroupIII.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_staff_qualifications_16 = out_tables['Staff']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_staff_qualifications_16.columns)
print('\ntable head = ', df_staff_qualifications_16.head())

In [18]:
# Report Card Staff Qualifications 2015
database_path = 'SRC2015/SRC2015.mdb'
out_tables = mdb_to_pandas(database_path)
table_names = list(out_tables.keys())
df_staff_qualifications_15 = out_tables['Staff']

print('-------')
print('\ntable names = ', table_names)
print('\ntable columns = ', df_staff_qualifications_15.columns)
print('\ntable head = ', df_staff_qualifications_15.head())

KeyError: 'Staff'