In [16]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import keras
import tensorflow as tf
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.layers import Input, RepeatVector, Dense, Bidirectional, LSTM, Concatenate

# Hide GPU from visible devices
tf.config.set_visible_devices([], 'GPU')


In [18]:
# Get groups of schools

matches = pd.read_csv('input_data/matches.csv', dtype=str)
treatment_schools = matches[matches['trt'] == '1']['school_code'].tolist()
control_schools = matches[(matches['trt'] != '1') & (~matches['pairmatch'].isna())]['school_code'].tolist()
remnant_schools = matches[matches['pairmatch'].isna()]['school_code'].tolist()
print(len(matches), len(treatment_schools), len(control_schools), len(remnant_schools), len(treatment_schools) + len(control_schools) + len(remnant_schools))

521 37 37 447 521


In [21]:
# Prepare data

mass_doe_data = pd.read_csv('input_data/mass_doe_data.csv', dtype=str).astype(str)
mass_doe_data = mass_doe_data.set_index('school_code')
float_columns = [c for c in mass_doe_data if c != 'prior_performance']
mass_doe_data[float_columns] = mass_doe_data[float_columns].astype(float)
mass_doe_data['prior_performance'] = mass_doe_data['prior_performance'].apply(lambda x: np.array(eval(x.replace('nan', 'np.nan'))))
mass_doe_data = mass_doe_data.sort_values('school_code')

performance_data = pd.read_csv('input_data/performance.csv', dtype=str).astype(str)
performance_data = performance_data.set_index('school_code')
performance_data = performance_data.astype(float)
performance_data = performance_data.sort_values('school_code')

# Remnant data

remnant_mass_doe_data = mass_doe_data[mass_doe_data.index.isin(remnant_schools)]
remnant_performance_data = performance_data[performance_data.index.isin(remnant_schools)]
remnant_demographics = remnant_mass_doe_data.drop(columns=['prior_performance']).values
remnant_prior_performance = np.array(remnant_mass_doe_data['prior_performance'].tolist())
remnant_performance = remnant_performance_data['performance'].values
remnant_school_codes = np.array(remnant_performance_data.index)
print(remnant_demographics.shape, remnant_prior_performance.shape, remnant_performance.shape, remnant_school_codes.shape)

# Experiment data

experiment_mass_doe_data = mass_doe_data[mass_doe_data.index.isin(treatment_schools + control_schools)]
experiment_performance_data = performance_data[performance_data.index.isin(treatment_schools + control_schools)]
experiment_demographics = experiment_mass_doe_data.drop(columns=['prior_performance']).values
experiment_prior_performance = np.array(experiment_mass_doe_data['prior_performance'].tolist())
experiment_performance = experiment_performance_data['performance'].values
experiment_school_codes = np.array(experiment_performance_data.index)
print(experiment_demographics.shape, experiment_prior_performance.shape, experiment_performance.shape, experiment_school_codes.shape)


(447, 69) (447, 5, 51) (447,) (447,)
(74, 69) (74, 5, 51) (74,) (74,)


In [30]:
# Get model quality using k-fold on remnant

ff_X = remnant_demographics
lstm_X = remnant_prior_performance
y = remnant_performance
i = remnant_school_codes

results = []
for train_index, test_index in KFold(n_splits=10, shuffle=True).split(ff_X, y):

    # Clear session so models don't pile up
    keras.backend.clear_session()

    # Split data into training and testing splits
    train_ff_X, test_ff_X = ff_X[train_index], ff_X[test_index]
    train_lstm_X, test_lstm_X = lstm_X[train_index], lstm_X[test_index]
    train_y, test_y = y[train_index], y[test_index]
    train_i, test_i = i[train_index], i[test_index]

    # Normalize the input data based on the training data distribution
    ff_scaler = StandardScaler().fit(train_ff_X)
    train_ff_X = np.nan_to_num(ff_scaler.transform(train_ff_X))
    test_ff_X = np.nan_to_num(ff_scaler.transform(test_ff_X))

    train_lstm_X_shape = train_lstm_X.shape
    train_stacked_lstm_X = train_lstm_X.reshape(-1, train_lstm_X_shape[-1])
    lstm_scaler = StandardScaler().fit(train_stacked_lstm_X)
    train_lstm_X = np.nan_to_num(lstm_scaler.transform(train_stacked_lstm_X)).reshape(train_lstm_X_shape)
    test_lstm_X_shape = test_lstm_X.shape
    test_stacked_lstm_X = test_lstm_X.reshape(-1, test_lstm_X_shape[-1])
    test_lstm_X = np.nan_to_num(lstm_scaler.transform(test_stacked_lstm_X)).reshape(test_lstm_X_shape)

    # Create the neural network
    ff_input_layer = Input(shape=train_ff_X[0].shape)
    lstm_input_layer = Input(shape=train_lstm_X[0].shape)
    combined_input_layer = RepeatVector(train_lstm_X.shape[1])(ff_input_layer)
    combined_input_layer = Concatenate()([combined_input_layer, lstm_input_layer])

    model = Bidirectional(LSTM(units=128, return_sequences=True, activation='tanh', dropout=0.5, recurrent_dropout=0.5))(combined_input_layer)
    model = Bidirectional(LSTM(units=64, return_sequences=False, activation='tanh', dropout=0.5, recurrent_dropout=0.5))(model)
    output_layer = Dense(units=1, activation='linear')(model)

    model = Model([ff_input_layer, lstm_input_layer], output_layer)
    model.compile(optimizer='adam', loss='mse')

    # Train the neural network
    es = [EarlyStopping(monitor='val_loss', patience=10, min_delta=0, restore_best_weights=True)]
    model.fit(x=[train_ff_X, train_lstm_X], y=train_y, batch_size=16, epochs=1000, validation_split=0.25, callbacks=es, verbose=1)

    # Use the neural network to predict the held-out fold
    pred_y = model.predict([test_ff_X, test_lstm_X]).flatten()

    # Update predictions
    results.append(pd.DataFrame([test_y, pred_y], columns=test_i, index=['actual_performance', 'predicted_performance']).T)

results = pd.concat(results)
results.to_csv('results/kfold_remnant_predictions.csv')
print(f'MSE: {mean_squared_error(results.actual_performance, results.predicted_performance)}')

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1

In [None]:
'''
Mean Prediction MSE: 433.9361577972904

(layers)     , dropout , batch size: MSE

(64, 32)     , 0       , 32        : 261.0934134399656
(32)         , 0       , 32        : 408.9117164445174
(64, 32)     , 0.5     , 32        : 227.8827418178931
(64, 64)     , 0.25    , 32        : 213.6578965631596
(64, 64, 64) , 0.25    , 32        : 264.9091609712526
(128, 64)    , 0.5     , 32        : 170.0072276619494
(128, 64)    , 0.5     , 16        : 163.5255262041347 / 199.23285354558072***
(128, 64)    , 0.5     , 8         : 183.8387565630688
(256, 256)   , 0.5     , 16        : 163.1883336295512
(128, 128)   , 0.5     , 16        : 172.0166126039364
(256, 128)   , 0.5     , 16        : 177.0513539259982
'''

In [31]:
# Get predictions for experiment data

train_ff_X = remnant_demographics
train_lstm_X = remnant_prior_performance
train_y = remnant_performance
train_i = remnant_school_codes

test_ff_X = experiment_demographics
test_lstm_X = experiment_prior_performance
test_y = experiment_performance
test_i = experiment_school_codes

# Clear session so models don't pile up
keras.backend.clear_session()

# Normalize the input data based on the training data distribution
ff_scaler = StandardScaler().fit(train_ff_X)
train_ff_X = np.nan_to_num(ff_scaler.transform(train_ff_X))
test_ff_X = np.nan_to_num(ff_scaler.transform(test_ff_X))

train_lstm_X_shape = train_lstm_X.shape
train_stacked_lstm_X = train_lstm_X.reshape(-1, train_lstm_X_shape[-1])
lstm_scaler = StandardScaler().fit(train_stacked_lstm_X)
train_lstm_X = np.nan_to_num(lstm_scaler.transform(train_stacked_lstm_X)).reshape(train_lstm_X_shape)
test_lstm_X_shape = test_lstm_X.shape
test_stacked_lstm_X = test_lstm_X.reshape(-1, test_lstm_X_shape[-1])
test_lstm_X = np.nan_to_num(lstm_scaler.transform(test_stacked_lstm_X)).reshape(test_lstm_X_shape)

# Create the neural network
ff_input_layer = Input(shape=train_ff_X[0].shape)
lstm_input_layer = Input(shape=train_lstm_X[0].shape)
combined_input_layer = RepeatVector(train_lstm_X.shape[1])(ff_input_layer)
combined_input_layer = Concatenate()([combined_input_layer, lstm_input_layer])

model = Bidirectional(LSTM(units=128, return_sequences=True, activation='tanh', dropout=0.5, recurrent_dropout=0.5))(combined_input_layer)
model = Bidirectional(LSTM(units=64, return_sequences=False, activation='tanh', dropout=0.5, recurrent_dropout=0.5))(model)
output_layer = Dense(units=1, activation='linear')(model)

model = Model([ff_input_layer, lstm_input_layer], output_layer)
model.compile(optimizer='adam', loss='mse')

# Train the neural network
es = [EarlyStopping(monitor='val_loss', patience=10, min_delta=0, restore_best_weights=True)]
model.fit(x=[train_ff_X, train_lstm_X], y=train_y, batch_size=16, epochs=1000, validation_split=0.25, callbacks=es, verbose=1)

# Use the neural network to predict the held-out fold
pred_y = model.predict([test_ff_X, test_lstm_X]).flatten()

# Update predictions
results = pd.DataFrame([test_y, pred_y], columns=test_i, index=['actual_performance', 'predicted_performance']).T
results.to_csv('results/experiment_predictions.csv')
print(f'MSE: {mean_squared_error(results.actual_performance, results.predicted_performance)}')

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
MSE: 100.23964504026503
