In [176]:
from sklearn import preprocessing
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
from tqdm import tqdm

from pathlib import Path
import random

In [4]:
# Directory holding the per-state processed data files, relative to the notebook's working directory.
PROCESSED_DATA_DIR_PATH = Path('..', 'data', 'processed_data')

<a id='baselines'></a>

# BASELINE MODELS

We average the entire year of data for each district. The same model is used for all states.

This yields only ~173 rows, so we use k-fold Cross Validation to measure model performance.

### Test Score Baselines

Each baseline trains 4 Ridge regression models ($\alpha = 0.5$), one for each combination of grade (4th, 8th) and subject (math, reading) test scores, using both 5- and 10-fold cross-validation. We pick whichever of the two cross-validation settings performs better on the training folds and report its corresponding test-fold results.

- ___Baseline 1___: Uses engagement data only. ([Model](#baseline1_model) / [Results](#baseline1_results))
- ___Baseline 2___: Uses locale type only. Outperformed Baseline 1. ([Model](#baseline2_model) / [Results](#baseline2_results))

In [37]:
# Columns kept as model inputs: two engagement measures followed by two
# district descriptors. Names use the '<table>;<column>' convention.
INPUT_FEATURES = (
    ['engagement;pct_access', 'engagement;engagement_index']
    + ['districts;locale', 'districts;state']
)

# Regression targets: 2022 test scores for every subject/grade pair, ordered
# math 4th, math 8th, reading 4th, reading 8th.
TARGET_FEATURES = [
    'testscores;{}_{}_2022'.format(subject, grade)
    for subject in ('math', 'reading')
    for grade in (4, 8)
]

In [253]:
# Build `dataset`: one row per district holding the selected input/target
# columns, the district's engagement averaged over all of its days, an
# `n_days` count, and one-hot locale indicator columns.

# Engagement columns that are averaged, first per (day, district) and then
# over all of a district's days.
columns_to_average = ['engagement;pct_access', 'engagement;engagement_index']
# Product columns that are blanked out on the aggregated per-day frame.
columns_to_reset = ['products;Sector(s)', 'products;Primary Essential Function']

# Sort the paths so the row order of `dataset` does not depend on the order in
# which the filesystem happens to list the files. Row order matters downstream
# because integer `cv=` splits in scikit-learn are not shuffled.
state_data_paths = sorted(PROCESSED_DATA_DIR_PATH.glob('*.gz'))

# Single-row frames are collected here and concatenated once at the end,
# instead of re-copying a growing DataFrame on every district.
district_rows = []

# Passing a list lets tqdm infer the total instead of hard-coding it.
for state_data_path in tqdm(state_data_paths):
    # NOTE(review): read_pickle can execute arbitrary code while loading;
    # only point this at files produced by our own preprocessing.
    state_df = pd.read_pickle(state_data_path)

    daily_groups = state_df.groupby(['time', 'district_id'])

    # One row per (day, district): first non-null value of every column ...
    df = daily_groups.first()
    # ... except the engagement columns, which are averaged within the group.
    # Select the columns *before* calling mean(): averaging the whole frame
    # raises TypeError on pandas >= 2.0 when non-numeric columns are present.
    df[columns_to_average] = daily_groups[columns_to_average].mean()
    df[columns_to_reset] = None

    for district_id in df.index.levels[1]:
        # All daily rows belonging to this district.
        df_temp = df.loc[(slice(None), district_id), :]
        # Copy so the assignments below write to an independent Series rather
        # than to a slice of `df_temp`.
        row = df_temp.iloc[0][INPUT_FEATURES + TARGET_FEATURES].copy()
        row.loc[columns_to_average] = df_temp[columns_to_average].mean()
        row['n_days'] = len(df_temp)
        # reset_index(level=0) moves the leading index level into a column
        # named 'level_0', which is dropped once all rows are assembled.
        district_rows.append(pd.DataFrame(row).T.reset_index(level=0))

if not district_rows:
    raise FileNotFoundError(
        'No district rows could be built from {}'.format(PROCESSED_DATA_DIR_PATH))

dataset = pd.concat(district_rows)

# One-hot encode the locale type into 0/1 indicator columns.
for locale_name in ['City', 'Suburb', 'Rural', 'Town']:
    indicator_column = 'districts;locale_{}'.format(locale_name.lower())
    dataset[indicator_column] = (dataset['districts;locale'] == locale_name).astype('int64')
dataset = dataset.drop(['level_0', 'districts;locale'], axis=1)


100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:19<00:00,  1.17it/s]


In [242]:
def print_results(results_dict, target_feature, scoring_metrics_list,
                  splits=['train']):
    """Display the mean cross-validation score of each metric as a small table.

    Parameters
    ----------
    results_dict : mapping
        Maps '<split>_<metric>' keys to a sequence of per-fold scores.
    target_feature : str
        Used as the column label of the displayed table.
    scoring_metrics_list : list of str
        Metric names to report. Metrics whose name starts with 'neg' have
        their sign flipped and the leading 'neg_' removed from the row label.
    splits : list of str
        Key prefixes to report, e.g. 'train' and/or 'test'.

    Raises
    ------
    KeyError
        If a '<split>_<metric>' key is missing from ``results_dict``.
    """
    summary = {}
    for metric in scoring_metrics_list:
        sign_flipped = metric.startswith('neg')
        # Row label drops the four-character 'neg_' prefix for negated metrics.
        metric_label = metric[4:] if sign_flipped else metric
        for split in splits:
            mean_score = round(np.mean(results_dict['{}_{}'.format(split, metric)]), 4)
            summary['{}_{}'.format(split, metric_label)] = {
                target_feature: -mean_score if sign_flipped else mean_score
            }
    # Transpose so that each '<split>_<metric>' becomes a row.
    display(pd.DataFrame(summary).T)
        

<a id='baseline1_model'></a>

## Baseline 1: Predict Math and Reading Scores from Engagement Data Only

[(Back to top)](#baselines)

In [243]:
scoring_metrics_list = ['r2', 'neg_root_mean_squared_error']

# Baseline 1 predictors: the engagement columns only.
engagement_columns = ['engagement;pct_access', 'engagement;engagement_index']

# Each list collects one (cv results, target name, metric names) tuple per target.
five_fold_results = []
ten_fold_results = []

for target in TARGET_FEATURES:

    X = np.array(dataset[engagement_columns]).astype(float)
    Y = np.array(dataset[[target]]).astype(float)[:, 0]

    # Keep only the rows whose target score is present.
    has_target = ~np.isnan(Y)
    X = X[has_target]
    Y = Y[has_target]

    model = make_pipeline(preprocessing.StandardScaler(),
                          Ridge(alpha=0.5))

    # Evaluate the same pipeline with 5 folds, then with 10 folds.
    for n_folds, collected in ((5, five_fold_results), (10, ten_fold_results)):
        results_dict = cross_validate(model, X, Y,
                                      cv=n_folds,
                                      scoring=scoring_metrics_list,
                                      return_train_score=True)
        collected.append((results_dict, target, scoring_metrics_list))
    

### 5-Fold Training Results

In [244]:
# One table per target: mean training-fold metrics of the 5-fold runs.
for fold_scores, target_name, metric_names in five_fold_results:
    print_results(fold_scores, target_name, metric_names)

Unnamed: 0,testscores;math_4_2022
train_r2,0.0289
train_root_mean_squared_error,7.0879


Unnamed: 0,testscores;math_8_2022
train_r2,0.0298
train_root_mean_squared_error,6.8963


Unnamed: 0,testscores;reading_4_2022
train_r2,0.021
train_root_mean_squared_error,7.4763


Unnamed: 0,testscores;reading_8_2022
train_r2,0.0207
train_root_mean_squared_error,5.8421


### 10-Fold Training Results

In [245]:
# One table per target: mean training-fold metrics of the 10-fold runs.
for fold_scores, target_name, metric_names in ten_fold_results:
    print_results(fold_scores, target_name, metric_names)

Unnamed: 0,testscores;math_4_2022
train_r2,0.0189
train_root_mean_squared_error,7.1278


Unnamed: 0,testscores;math_8_2022
train_r2,0.0253
train_root_mean_squared_error,6.9224


Unnamed: 0,testscores;reading_4_2022
train_r2,0.0139
train_root_mean_squared_error,7.5127


Unnamed: 0,testscores;reading_8_2022
train_r2,0.0161
train_root_mean_squared_error,5.863


Five-fold outperformed ten-fold (though neither did very well). We check the test performance:

<a id='baseline1_results'></a>

### Baseline 1 Testing Results

[(Back to top)](#baselines)

In [246]:
# One table per target: mean held-out (test-fold) metrics of the 5-fold runs.
for fold_scores, target_name, metric_names in five_fold_results:
    print_results(fold_scores, target_name, metric_names, splits=['test'])

Unnamed: 0,testscores;math_4_2022
test_r2,-0.0974
test_root_mean_squared_error,7.2542


Unnamed: 0,testscores;math_8_2022
test_r2,-0.105
test_root_mean_squared_error,7.0224


Unnamed: 0,testscores;reading_4_2022
test_r2,-0.1797
test_root_mean_squared_error,7.8074


Unnamed: 0,testscores;reading_8_2022
test_r2,-0.1293
test_root_mean_squared_error,5.9837


<a id='baseline2_model'></a>

## Baseline 2: Predict Math and Reading Scores from Locale Type Only

[(Back to top)](#baselines)

In [249]:
scoring_metrics_list = ['r2', 'neg_root_mean_squared_error']

# Baseline 2 predictors: the one-hot locale indicator columns only.
locale_columns = ['districts;locale_city',
                  'districts;locale_suburb',
                  'districts;locale_town',
                  'districts;locale_rural']

# Each list collects one (cv results, target name, metric names) tuple per target.
five_fold_results = []
ten_fold_results = []

for target in TARGET_FEATURES:

    X = np.array(dataset[locale_columns]).astype(float)
    Y = np.array(dataset[[target]]).astype(float)[:, 0]

    # Keep only the rows whose target score is present.
    has_target = ~np.isnan(Y)
    X = X[has_target]
    Y = Y[has_target]

    model = make_pipeline(preprocessing.StandardScaler(),
                          Ridge(alpha=0.5))

    # Evaluate the same pipeline with 5 folds, then with 10 folds.
    for n_folds, collected in ((5, five_fold_results), (10, ten_fold_results)):
        results_dict = cross_validate(model, X, Y,
                                      cv=n_folds,
                                      scoring=scoring_metrics_list,
                                      return_train_score=True)
        collected.append((results_dict, target, scoring_metrics_list))
    

### 5-Fold Training Results

In [250]:
# One table per target: mean training-fold metrics of the 5-fold runs.
for fold_scores, target_name, metric_names in five_fold_results:
    print_results(fold_scores, target_name, metric_names)

Unnamed: 0,testscores;math_4_2022
train_r2,0.4656
train_root_mean_squared_error,5.2219


Unnamed: 0,testscores;math_8_2022
train_r2,0.5177
train_root_mean_squared_error,4.854


Unnamed: 0,testscores;reading_4_2022
train_r2,0.3897
train_root_mean_squared_error,5.8842


Unnamed: 0,testscores;reading_8_2022
train_r2,0.5095
train_root_mean_squared_error,4.125


### 10-Fold Training Results

In [251]:
# One table per target: mean training-fold metrics of the 10-fold runs.
for fold_scores, target_name, metric_names in ten_fold_results:
    print_results(fold_scores, target_name, metric_names)

Unnamed: 0,testscores;math_4_2022
train_r2,0.4589
train_root_mean_squared_error,5.2873


Unnamed: 0,testscores;math_8_2022
train_r2,0.5142
train_root_mean_squared_error,4.8834


Unnamed: 0,testscores;reading_4_2022
train_r2,0.3811
train_root_mean_squared_error,5.9485


Unnamed: 0,testscores;reading_8_2022
train_r2,0.508
train_root_mean_squared_error,4.1417


Five-fold still outperforms ten-fold _(also note that performance is substantially better than using the engagement data)_. We check the test performance:

<a id='baseline2_results'></a>

### Baseline 2 Testing Results

[(Back to top)](#baselines)

In [252]:
# One table per target: mean held-out (test-fold) metrics of the 5-fold runs.
for fold_scores, target_name, metric_names in five_fold_results:
    print_results(fold_scores, target_name, metric_names, splits=['test'])

Unnamed: 0,testscores;math_4_2022
test_r2,-0.0552
test_root_mean_squared_error,5.9152


Unnamed: 0,testscores;math_8_2022
test_r2,0.2371
test_root_mean_squared_error,5.2873


Unnamed: 0,testscores;reading_4_2022
test_r2,-0.0838
test_root_mean_squared_error,6.7581


Unnamed: 0,testscores;reading_8_2022
test_r2,0.2338
test_root_mean_squared_error,4.4496


-----------