In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Get 2013 middle school MCAS performance
# Performance is the average advanced or proficient % per grade weighted by the number of students in the grade
#
# Reads every workbook in 'xlsx' whose name contains 'mcas_2013', keeps the
# MATHEMATICS rows, and produces `performance`: a one-column DataFrame
# ('performance') indexed by 'school_code'.

dfs = []
# Sorted so the rows are concatenated (and summed) in the same order on every
# machine; os.listdir returns names in arbitrary order.
for file in sorted(os.listdir('xlsx')):
    if 'mcas_2013' not in file:
        continue
    # Everything is read as text so school codes keep their leading zeros.
    df = pd.read_excel(f'xlsx/{file}', header=1, dtype=str).astype(str)
    df = df[df['Subject'] == 'MATHEMATICS']
    df = df.rename(columns={'School Code':'school_code'})
    df = df.set_index('school_code')
    df = df[['P+A %', 'No. of Students Included']]
    df = df.astype(float)
    dfs.append(df)

df = pd.concat(dfs, axis=0)
df = df.rename(columns={'P+A %': 'x', 'No. of Students Included': 'w'})

# A grade only contributes to the weighted mean when it reports a percentage.
# Without this mask the students of a grade with a missing 'x' would still be
# counted in the denominator (sum() skips the NaN product but not the weight),
# pulling the school's average towards zero.
df['w'] = df['w'].where(df['x'].notna())
df['xw'] = df['x'] * df['w']

group = df.groupby('school_code')
# min_count=1 makes a school with no usable rows come out as NaN rather than
# as a misleading 0.
performance = group['xw'].sum(min_count=1) / group['w'].sum(min_count=1)
performance = performance.rename('performance').to_frame()
# '00000000' is excluded: it is not treated as an individual school here.
performance = performance[performance.index != '00000000']

print('Get average performance for each middle school')
print(performance.shape)

Get average performance for each middle school
(632, 1)


In [3]:
# Combine all the demographics for the 2013 school year
#
# Builds `demographics`: one row per school in `performance`, one column per
# demographic measure found in the non-MCAS 2013 workbooks, plus 0/1
# '<column>_is_nan' indicator columns for missing values.


def _drop_perfectly_correlated(frame):
    """Return `frame` without columns that duplicate an earlier column.

    Columns are scanned left to right. A column is dropped when its Pearson
    correlation with an earlier column that is still kept equals 1 up to
    floating-point round-off. Correlations that are NaN never match.
    (Kept local to this cell so the cell can be run on its own.)
    """
    corr_matrix = frame.corr()
    columns = list(frame.columns)
    columns_to_remove = set()
    for i, a in enumerate(columns):
        if a in columns_to_remove:
            continue
        # The matrix is symmetric, so only columns to the right of `a` need checking.
        for b in columns[i + 1:]:
            if b in columns_to_remove:
                continue
            # Exact `== 1` misses results such as 0.9999999999999999, so compare
            # with a tight absolute tolerance instead.
            if np.isclose(corr_matrix.loc[a, b], 1.0, rtol=0.0, atol=1e-12):
                columns_to_remove.add(b)
    return frame.drop(columns=[c for c in columns if c in columns_to_remove])


def _clean_numeric_text(column):
    """Strip spaces, thousands separators and 'to1' from a text column; map '######' to 'NaN'."""
    return (
        column.str.replace(' ', '', regex=False)
        .str.replace(',', '', regex=False)
        .str.replace('to1', '', regex=False)
        .str.replace('######', 'NaN', regex=False)
    )


demographics = pd.DataFrame(index=performance.index)
# Sorted because the first file that provides a column name wins; os.listdir
# order is arbitrary, which would make the result machine-dependent.
for file in sorted(os.listdir('xlsx')):
    if '2013' in file and 'mcas' not in file:
        df = pd.read_excel(f'xlsx/{file}', header=1, dtype=str).astype(str)
        # Drop the first column and use the next one as the school code.
        df = df.iloc[:, 1:]
        df = df.rename(columns={df.columns[0]:'school_code'})
        df = df.set_index('school_code')
        # Column-wise string methods replace the deprecated DataFrame.applymap.
        df = df.apply(_clean_numeric_text)
        df = df.astype(float)
        # Skip measures that an earlier workbook already supplied.
        df = df[[c for c in df.columns if c not in demographics.columns]]
        demographics = demographics.merge(df, how='outer', left_index=True, right_index=True)

print('Remove demographics from non middle schools')
demographics = demographics[demographics.index.isin(performance.index)]
print(demographics.shape)

print('Remove unchanging demographics')
demographics = demographics[demographics.columns[demographics.nunique(dropna=False) > 1]]
print(demographics.shape)

print('Remove perfectly correlated demographics')
demographics = _drop_perfectly_correlated(demographics)
print(demographics.shape)

print('Create dataframe of whether or not the other dataframe is nan')
nan_demographics = demographics.isna().astype(int)
nan_demographics.columns = [f'{c}_is_nan' for c in nan_demographics.columns]
print(nan_demographics.shape)

print('Remove unchanging nan demographics')
nan_demographics = nan_demographics[nan_demographics.columns[nan_demographics.nunique(dropna=False) > 1]]
print(nan_demographics.shape)

print('Remove perfectly correlated nan demographics')
nan_demographics = _drop_perfectly_correlated(nan_demographics)
print(nan_demographics.shape)

print('Combine demographics with nan demographics')
demographics = pd.concat([demographics, nan_demographics], axis=1)
print(demographics.shape)

Remove demographics from non middle schools
(632, 68)
Remove unchanging demographics
(632, 68)
Remove perfectly correlated demographics
(632, 68)
Create dataframe of whether or not the other dataframe is nan
(632, 68)
Remove unchanging nan demographics
(632, 6)
Remove perfectly correlated nan demographics
(632, 1)
Combine demographics with nan demographics
(632, 69)


In [33]:
# Get prior middle school MCAS results
#
# Builds `priors`: a Series named 'prior_performance', indexed by
# 'school_code', whose values are lists with one row of features per year
# (ordered by year) taken from the grade 6-8 MATHEMATICS results.


def _drop_perfectly_correlated(frame):
    """Return `frame` without columns that duplicate an earlier column.

    Columns are scanned left to right. A column is dropped when its Pearson
    correlation with an earlier column that is still kept equals 1 up to
    floating-point round-off. Correlations that are NaN never match.
    (Kept local to this cell so the cell can be run on its own.)
    """
    corr_matrix = frame.corr()
    columns = list(frame.columns)
    columns_to_remove = set()
    for i, a in enumerate(columns):
        if a in columns_to_remove:
            continue
        # The matrix is symmetric, so only columns to the right of `a` need checking.
        for b in columns[i + 1:]:
            if b in columns_to_remove:
                continue
            # Exact `== 1` misses results such as 0.9999999999999999, so compare
            # with a tight absolute tolerance instead.
            if np.isclose(corr_matrix.loc[a, b], 1.0, rtol=0.0, atol=1e-12):
                columns_to_remove.add(b)
    return frame.drop(columns=[c for c in columns if c in columns_to_remove])


yearly_frames = []
for year in [2007, 2008, 2009, 2010, 2011]:
    # Start from the target schools so every one of them gets a row per year,
    # even when a workbook has no entry for it.
    year_df = pd.DataFrame(index=performance.index)
    year_df['year'] = year
    for grade in [6, 7, 8]:
        grade_df = pd.read_excel(f'xlsx/mcas_{year}-{grade}.xlsx', header=1, dtype=str).astype(str)
        grade_df = grade_df[grade_df['Subject'] == 'MATHEMATICS']
        grade_df = grade_df.drop(columns=['School Name', 'Subject'])
        grade_df = grade_df.rename(columns={'School Code':'school_code'})
        grade_df = grade_df.set_index('school_code')
        # Missing cells arrive as the text 'nan'; blank them so to_numeric
        # yields NaN. Column-wise string methods replace the deprecated
        # DataFrame.applymap.
        grade_df = grade_df.apply(
            lambda col: pd.to_numeric(col.str.replace('nan', '', regex=False).str.strip())
        )
        grade_df = grade_df.astype(float)
        grade_df.columns = [f'Grade {grade} {c}' for c in grade_df.columns]
        year_df = year_df.merge(grade_df, how='outer', left_index=True, right_index=True)
    yearly_frames.append(year_df)

priors = pd.concat(yearly_frames, axis=0)

print('Remove priors from non middle schools')
priors = priors[priors.index.isin(performance.index)]
print(priors.shape)

print('Remove unchanging priors')
priors = priors[priors.columns[priors.nunique(dropna=False) > 1]]
print(priors.shape)

print('Remove perfectly correlated priors')
priors = _drop_perfectly_correlated(priors)
print(priors.shape)

print('Create dataframe of whether or not the other dataframe is nan')
nan_priors = priors.isna().astype(int)
nan_priors.columns = [f'{c}_is_nan' for c in nan_priors.columns]
print(nan_priors.shape)

print('Remove unchanging nan priors')
nan_priors = nan_priors[nan_priors.columns[nan_priors.nunique(dropna=False) > 1]]
print(nan_priors.shape)

print('Remove perfectly correlated nan priors')
nan_priors = _drop_perfectly_correlated(nan_priors)
print(nan_priors.shape)

print('Combine priors with nan priors')
priors = pd.concat([priors, nan_priors], axis=1)
print(priors.shape)

print('Reshape priors into series of years')
# Move 'year' into the index so it orders the rows but is not part of the
# per-year feature vectors collected below.
priors = priors.set_index('year', append=True)
priors = priors.groupby('school_code').apply(lambda x: x.sort_values('year').values.tolist())
priors = priors.rename('prior_performance')
print(priors.shape)

Remove priors from non middle schools
(3160, 43)
Remove unchanging priors
(3160, 43)
Remove perfectly correlated priors
(3160, 43)
Create dataframe of whether or not the other dataframe is nan
(3160, 43)
Remove unchanging nan priors
(3160, 42)
Remove perfectly correlated nan priors
(3160, 9)
Combine priors with nan priors
(3160, 52)
Reshape priors into series of years
(632,)


In [34]:
# Join the 2013 target, the demographics and the prior-year results on
# school_code (inner joins: only schools present in all three are kept) and
# write the combined table to disk.
df = performance.merge(demographics, how='inner', left_index=True, right_index=True)
df = df.merge(priors, how='inner', left_index=True, right_index=True)
# Create the output folder if needed so a fresh checkout does not fail here.
os.makedirs('csv', exist_ok=True)
# NOTE(review): 'prior_performance' holds nested Python lists, so it is
# written as its string representation - confirm the consumer parses that.
df.to_csv('csv/mass_doe_data.csv')