In [1]:
import numpy as np
import pandas as pd

In [2]:
print('Load the data')

# Every field is read as text and the conversion to float happens only after
# school_id has been moved to the index, so the index labels stay as strings
# while all remaining columns become numeric.
df = (
    pd.read_csv('mass_doe_data.csv', dtype=str)
    .set_index('school_id')
    .astype(float)
)
print(df.shape)

print('Drop state features')
# Any column whose name contains "state" (case-insensitive) is discarded.
state_columns = [name for name in df.columns if 'state' in name.lower()]
df = df.drop(columns=state_columns)
print(df.shape)

Load the data
(636, 2775)
Drop state features
(636, 1868)


In [3]:
print('Demographics')

# Every column that is not an MCAS column is treated as a demographic feature.
non_mcas_columns = [name for name in df.columns if 'MCAS' not in name]
demographics = df[non_mcas_columns]
print(demographics.shape)

print('Remove unchanging demographics')
# NaN counts as a value here, so a column is kept as soon as it holds at
# least two distinct entries (one of which may be NaN).
varies = demographics.nunique(dropna=False) > 1
demographics = demographics[demographics.columns[varies]]
print(demographics.shape)

print('Remove perfectly correlated demographics')
# Greedy pass: every column that is still kept removes each other still-kept
# column whose correlation with it is exactly 1.
columns_to_remove = []
corr_matrix = demographics.corr()
for kept in demographics.columns:
    if kept in columns_to_remove:
        continue
    for candidate in demographics.columns:
        if candidate in columns_to_remove or candidate == kept:
            continue
        if corr_matrix.loc[kept, candidate] == 1:
            columns_to_remove.append(candidate)
demographics = demographics.drop(columns=columns_to_remove)
print(demographics.shape)

print('Create dataframe of whether or not the other dataframe is nan')
# 1 where the demographic value is missing, 0 otherwise; one indicator per
# surviving demographic column, named '<column>_is_nan'.
nan_demographics = demographics.isna().astype(int).add_suffix('_is_nan')
print(nan_demographics.shape)

print('Remove unchanging nan demographics')
indicator_varies = nan_demographics.nunique(dropna=False) > 1
nan_demographics = nan_demographics[nan_demographics.columns[indicator_varies]]
print(nan_demographics.shape)

print('Remove perfectly correlated nan demographics')
# Same greedy pass as above, applied to the missingness indicators.
columns_to_remove = []
corr_matrix = nan_demographics.corr()
for kept in nan_demographics.columns:
    if kept in columns_to_remove:
        continue
    for candidate in nan_demographics.columns:
        if candidate in columns_to_remove or candidate == kept:
            continue
        if corr_matrix.loc[kept, candidate] == 1:
            columns_to_remove.append(candidate)
nan_demographics = nan_demographics.drop(columns=columns_to_remove)
print(nan_demographics.shape)

print('Combine demographics with nan demographics')
# Side-by-side join: values first, then their missingness indicators.
demographics = pd.concat([demographics, nan_demographics], axis=1)
print(demographics.shape)

print('Save demographics')
demographics.to_csv('outcome_and_demographic_covariates.csv')

Demographics
(636, 392)
Remove unchanging demographics
(636, 358)
Remove perfectly correlated demographics
(636, 348)
Create dataframe of whether or not the other dataframe is nan
(636, 348)
Remove unchanging nan demographics
(636, 151)
Remove perfectly correlated nan demographics
(636, 25)
Combine demographics with nan demographics
(636, 373)
Save demographics


In [4]:
print('Prior Performance')

# Keep only the MCAS mathematics columns for grades 6-8. school_id is the
# index of df rather than a column, so it needs no clause of its own here.
middle_school_grades = ('GRADE 06', 'GRADE 07', 'GRADE 08')
mcas = df[[c for c in df.columns
           if 'MCAS' in c
           and 'MATHEMATICS' in c
           and any(grade in c for grade in middle_school_grades)]]
mcas = mcas.reindex(sorted(mcas.columns), axis=1)
print(mcas.shape)

print('Fix web scraping typo')
# Some values sit in a column spelled 'MATHEMATICS3'. Wherever the correctly
# spelled column is empty, take the value from its typo twin; the typo
# columns themselves are discarded afterwards.
typo_columns = [c for c in mcas.columns if 'MATHEMATICS3' in c]
for typo_column in typo_columns:
    if mcas[typo_column].isna().all():
        continue  # nothing to recover from this column
    correct_column = typo_column.replace('MATHEMATICS3', 'MATHEMATICS')
    # fillna with a Series fills only the missing entries, matched by index.
    mcas[correct_column] = mcas[correct_column].fillna(mcas[typo_column])
mcas = mcas.drop(columns=typo_columns)
print(mcas.shape)

print('Stack by year')
years = ['2011', '2012', '2013', '2014', '2015']
# The column layout is identical for every school, so determine once which
# columns belong to each year and what they are called after the first
# '__'-separated field has been stripped from the name.
columns_by_year = {}
for year in years:
    year_columns = [c for c in mcas.columns if 'MCAS' in c and year in c]
    stripped_names = ['__'.join(c.split('__')[1:]) for c in year_columns]
    columns_by_year[year] = (year_columns, stripped_names)

new_dfs = []
for _, row in mcas.iterrows():
    year_serieses = []
    for year in years:
        year_columns, stripped_names = columns_by_year[year]
        year_series = row.loc[year_columns]
        year_series.index = stripped_names
        year_series['year'] = year
        year_serieses.append(year_series)
    # Build the per-school frame once, after all years are collected: one row
    # per year. Each series carries the school id as its name, so after the
    # transpose the ids form the index, which reset_index exposes as 'index'.
    new_df = pd.concat(year_serieses, axis=1).T
    new_df = new_df.reset_index().rename(columns={'index': 'school_id'})
    new_dfs.append(new_df)
mcas = pd.concat(new_dfs, axis=0).set_index('school_id')
mcas = mcas.astype(float)
print(mcas.shape)

print('Remove unchanging mcas')
# NaN counts as a value, so a column needs two distinct entries to be kept.
mcas = mcas[mcas.columns[mcas.nunique(dropna=False) > 1]]
print(mcas.shape)

print('Remove perfectly correlated mcas')
# Greedy pass: every column that is still kept removes each other still-kept
# column whose correlation with it is exactly 1.
columns_to_remove = []
corr_matrix = mcas.corr()
for a in mcas.columns:
    if a in columns_to_remove:
        continue
    for b in mcas.columns:
        if b in columns_to_remove or a == b:
            continue
        if corr_matrix.loc[a, b] == 1:
            columns_to_remove.append(b)
mcas = mcas.drop(columns=columns_to_remove)
print(mcas.shape)

print('Create dataframe of whether or not the other dataframe is nan')
# 1 where the score is missing, 0 otherwise, named '<column>_is_nan'.
nan_mcas = mcas.isna().astype(int)
nan_mcas.columns = [f'{c}_is_nan' for c in nan_mcas.columns]
print(nan_mcas.shape)

print('Remove unchanging nan mcas')
nan_mcas = nan_mcas[nan_mcas.columns[nan_mcas.nunique(dropna=False) > 1]]
print(nan_mcas.shape)

print('Remove perfectly correlated nan mcas')
# Same greedy pass as above, applied to the missingness indicators.
columns_to_remove = []
corr_matrix = nan_mcas.corr()
for a in nan_mcas.columns:
    if a in columns_to_remove:
        continue
    for b in nan_mcas.columns:
        if b in columns_to_remove or a == b:
            continue
        if corr_matrix.loc[a, b] == 1:
            columns_to_remove.append(b)
nan_mcas = nan_mcas.drop(columns=columns_to_remove)
print(nan_mcas.shape)

print('Combine mcas with nan mcas')
mcas = pd.concat([mcas, nan_mcas], axis=1)
print(mcas.shape)

print('Save mcas data')
# Write the MCAS frame built in this cell (previously the demographics frame
# was written to this file by mistake).
mcas.to_csv('temporal_prior_performance_covariates.csv')

Prior Performance
(636, 144)
Fix web scraping typo
(636, 135)
Stack by year
(3180, 28)
Remove unchanging mcas
(3180, 28)
Remove perfectly correlated mcas
(3180, 28)
Create dataframe of whether or not the other dataframe is nan
(3180, 28)
Remove unchanging nan mcas
(3180, 27)
Remove perfectly correlated nan mcas
(3180, 12)
Combine mcas with nan mcas
(3180, 40)
Save mcas data


In [5]:
print('Convert to series of numpy arrays')


def _year_ordered_values(school_rows):
    """Return one school's rows as a 2-D array ordered by year, without the year column."""
    return school_rows.sort_values('year').drop(columns='year').values


# One entry per school_id, each holding that school's array of yearly rows.
mcas = mcas.groupby('school_id').apply(_year_ordered_values)
print(mcas.shape)

print('Combine with demographics dataframe')
# Column assignment aligns the per-school arrays to demographics by index label.
demographics['prior_performance'] = mcas
print(demographics.shape)

print('Save full dataframe as pickle')
demographics.to_pickle('clean_data.pkl')

Convert to series of numpy arrays
(636,)
Combine with demographics dataframe
(636, 374)
Save full dataframe as pickle
