In [None]:
import numpy as np

# Seed NumPy's legacy global random state before anything else draws from it.
from numpy.random import seed
seed(1)

# Seed TensorFlow's global random generator as well (separate from NumPy's).
import tensorflow
tensorflow.random.set_seed(2)

# Keras pieces used by the training helpers below.
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
from tensorflow.keras.backend import clear_session
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Splitting utilities; the helpers below pass their `split` argument as random_state.
from sklearn.model_selection import KFold, train_test_split

In [None]:
import os
import import_ipynb

from copy import deepcopy
import matplotlib.pyplot as plt

from build_models import *
from evaluation import performance_test_CV
from bias_inspection import get_protected_attr
from bias_mitigation import *

In [None]:
def filter_students(data):
    """Return the students that have both a pretest and a posttest value.

    A student is dropped when either ``'pretest'`` or ``'posttest'`` equals ``-1``
    (the value this notebook treats as "missing").

    Parameters
    ----------
    data : dict
        Mapping of student id -> record; each record is indexed with the keys
        ``'pretest'`` and ``'posttest'``.

    Returns
    -------
    dict
        New mapping holding a deep copy of every retained record, in the
        iteration order of ``data``. The input is not modified.
    """
    return {
        student: deepcopy(record)
        for student, record in data.items()
        # De Morgan form of "not (pretest missing or posttest missing)"
        if record['pretest'] != -1 and record['posttest'] != -1
    }

In [None]:
def get_set(data, ids):
    """Return deep copies of the records in ``data`` whose key appears in ``ids``.

    Parameters
    ----------
    data : dict
        Mapping of student id -> record.
    ids : iterable
        Ids to keep. Callers in this notebook pass NumPy arrays of student ids;
        lists, sets and other containers are accepted too.

    Returns
    -------
    dict
        New mapping, in the iteration order of ``data``, with a deep copy of each
        selected record. Ids that are not keys of ``data`` are ignored.
    """
    # `s_id in <array/list>` is a linear scan per student; hashing the ids once
    # makes the selection linear overall.
    if isinstance(ids, (str, bytes, dict, set, frozenset)):
        # Already hashed, or a string whose `in` means substring search: leave as is.
        wanted = ids
    else:
        try:
            # tolist() turns NumPy scalars into plain Python values before hashing.
            wanted = set(ids.tolist() if hasattr(ids, "tolist") else ids)
        except TypeError:
            # Unhashable or non-iterable contents: keep the original membership test.
            wanted = ids

    return {s_id: deepcopy(record) for s_id, record in data.items() if s_id in wanted}

In [None]:
def create_train_test_split(data, split=12, test_size=0.20):
    """Split the students of ``data`` into a train and a held-out test mapping.

    Parameters
    ----------
    data : dict
        Mapping of student id -> record.
    split : int
        Passed to ``train_test_split`` as ``random_state`` so the partition is
        repeatable.
    test_size : float or int
        Passed to ``train_test_split`` as ``test_size``; defaults to the 20 %
        share this function has always used.

    Returns
    -------
    (dict, dict)
        ``(train_data, test_data)``, each holding deep copies of the selected
        records. ``data`` itself is left untouched.
    """
    # get_set already deep-copies every record it returns, so copying the whole
    # dataset up front as well would only duplicate that work.
    student_ids = np.array(list(data))
    ids_train, ids_test = train_test_split(student_ids, test_size=test_size, random_state=split)

    return get_set(data, ids_train), get_set(data, ids_test)

In [None]:
def fit_model(train_data, val_data, get_required_model, epochs=50, split=12, folds=5, seq_length=20,
              monitor='val_loss', baseline=False, model_unique_filename="", extension=None):
    """Train one model on a fixed train/validation split and print its results.

    Parameters
    ----------
    train_data, val_data : dict
        Raw student records for training and validation.
    get_required_model : callable
        Called as ``get_required_model(raw_data, seq_length=seq_length)`` and
        expected to return ``(model, data, labels, ids)``, where the last three
        are indexed by ``'Train'`` / ``'Val'``.
    epochs : int
        Maximum number of training epochs.
    split, folds, baseline
        Accepted for signature parity with ``fit_model_CV``; not used here.
    seq_length : int
        Forwarded to ``get_required_model`` and ``performance_test_CV``.
    monitor : str
        Metric watched by both early stopping and the best-model checkpoint.
    model_unique_filename : str
        Inserted into the checkpoint file name ``model_overall_<name>.h5``.
    extension : str or None
        Forwarded to ``performance_test_CV``. When left as ``None`` the
        notebook-level global ``extension`` is used if one exists (this function
        used to rely on that global implicitly), otherwise ``""``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if extension is None:
        # Previously `extension` was an undeclared free variable here and raised
        # NameError unless some cell had defined it; keep honouring such a global.
        extension = globals().get('extension', "")

    # pack data to send for preprocessing; the 'Test' slot is filled with a copy of
    # the validation data (presumably because get_required_model expects all three
    # keys -- TODO confirm)
    raw_data = {'Train': train_data,
                'Val': val_data,
                'Test': deepcopy(val_data)}

    # get model and preprocessed data and labels according to the required model
    model, data, labels, ids = get_required_model(raw_data, seq_length=seq_length)
    train_ids, val_ids = ids['Train'], ids['Val']

    model.summary()

    # The best model (according to `monitor`) is checkpointed to this file;
    # a stale file from an earlier run must not be mistaken for this run's best.
    model_filename = 'model_overall_'+model_unique_filename+'.h5'
    if os.path.exists(model_filename):
        os.remove(model_filename)

    # Both callbacks watch the same metric so that "best" and "stop" agree.
    callbacks = [EarlyStopping(monitor=monitor, patience=10),
                 ModelCheckpoint(model_filename,
                                 monitor=monitor,
                                 save_best_only=True,
                                 save_weights_only=False)]

    # Train model
    model.fit(data['Train'], labels['Train'], validation_data=(data['Val'], labels['Val']),
              epochs=epochs, verbose=1, callbacks=callbacks, shuffle=True)

    try:
        # Prefer the checkpointed best model over the last-epoch weights.
        model_trained = load_model(model_filename)
        train_pred = model_trained.predict(data['Train'])
        val_pred = model_trained.predict(data['Val'])
    except Exception as err:
        # Best-effort fallback, but say so: the numbers below then come from the
        # final epoch rather than the best checkpoint.
        print("WARNING: could not use checkpoint", model_filename,
              "- falling back to the in-memory model:", repr(err))
        train_pred = model.predict(data['Train'])
        val_pred = model.predict(data['Val'])

    clear_session()

    train_results = performance_test_CV(data['Train'], train_pred, labels['Train'], train_ids, "TRAIN",
                                        seq_length=seq_length, extension=extension)
    val_results = performance_test_CV(data['Val'], val_pred, labels['Val'], val_ids, "VAL",
                                      seq_length=seq_length, extension=extension)

    print("TRAIN SET: ", train_results)
    print("VAL SET: ", val_results)

    return

In [None]:
def get_instance_counts(data, protected_label, value, set_name):
    """Print how many records of ``data`` hold ``value`` under ``protected_label``.

    The stored attribute is compared against ``str(value)``, so ``value`` may be
    given as a number. Nothing is returned; the count is only printed.

    Parameters
    ----------
    data : dict
        Mapping of student id -> record; each record is indexed with
        ``protected_label``.
    protected_label : str
        Key of the attribute to inspect.
    value
        Attribute value to count (stringified before comparison).
    set_name : str
        Label echoed in the printed header.
    """
    wanted = str(value)
    matches = sum(1 for s_id in data if data[s_id][protected_label] == wanted)

    print("Set name: ", set_name, "\tProtected_attr: ", protected_label, "\tValue: ", value)
    print("Instance_count: ", matches, "\n")

In [None]:
def get_stats(copy_data, set_name):
    """Print per-group instance counts for the protected attributes of a data set.

    For each attribute, ``get_protected_attr`` is run on a fresh deep copy of
    ``copy_data`` and ``get_instance_counts`` is then called once per value:
    ``"Gender"`` for values 1 and 2, ``"Prior_exp"`` for values 1, 2 and 3.

    Parameters
    ----------
    copy_data : dict
        Student records; never modified here because only deep copies are
        handed on.
    set_name : str
        Label forwarded to ``get_instance_counts`` for the printed header.
    """
    # attribute -> values to count, in the order they are reported
    attribute_values = (
        ("Gender", (1, 2)),
        ("Prior_exp", (1, 2, 3)),
    )

    for label, values in attribute_values:
        labelled = get_protected_attr(deepcopy(copy_data), protected_label=label)
        for group_value in values:
            get_instance_counts(labelled, protected_label=label, value=group_value, set_name=set_name)

    return

In [None]:
def fit_model_CV(inp_data, get_required_model, epochs=50, split=12, folds=5, seq_length=20,
                 monitor='val_loss', baseline=False, model_unique_filename="", extension="", 
                 bias_mitigation=False):
    """Run K-fold cross-validation over students and print the results of every fold.

    For each fold the students are divided into test ids (from ``KFold``) and the
    remainder, which is split again 80/20 into train and validation ids. A fresh
    model is obtained from ``get_required_model``, trained with early stopping
    and best-model checkpointing, and evaluated on all three sets. After the last
    fold the accumulated test ratings are printed and shown as a box plot.

    Parameters
    ----------
    inp_data : dict
        Mapping of student id -> record. A deep copy is used internally.
    get_required_model : callable
        Called as ``get_required_model(raw_data, seq_length=seq_length)`` and
        expected to return ``(model, data, labels, ids)``, where the last three
        are indexed by ``'Train'`` / ``'Val'`` / ``'Test'``.
    epochs : int
        Maximum number of training epochs per fold.
    split : int
        ``random_state`` for both ``KFold`` and the train/validation split.
    folds : int
        Number of cross-validation folds.
    seq_length : int
        Forwarded to ``get_required_model`` and ``performance_test_CV``.
    monitor : str
        Metric watched by both early stopping and the best-model checkpoint.
    baseline, bias_mitigation
        Accepted but not used by this function.
    model_unique_filename : str
        Inserted into the checkpoint file name ``model_<name>cv<fold>.h5``.
    extension : str
        Forwarded to ``performance_test_CV``.

    Returns
    -------
    None
        Results are printed and plotted, not returned.
    """
    copy_data = deepcopy(inp_data)

    kf = KFold(n_splits=folds, shuffle=True, random_state=split)
    student_ids = np.array([s_id for s_id in copy_data])
    ratings = {'correct': [], 'incorrect': []}

    for fold, (train_index, test_index) in enumerate(kf.split(student_ids)):
        ids_train, ids_test = student_ids[train_index], student_ids[test_index]
        ids_train, ids_val = train_test_split(ids_train, test_size=0.20, random_state=split)

        # Debug aid: get_stats(get_set(copy_data, ids_train), set_name="Train")
        # prints the protected-group counts of a fold's split.

        # pack data to send for preprocessing
        raw_data = {'Train': get_set(copy_data, ids_train),
                    'Val': get_set(copy_data, ids_val),
                    'Test': get_set(copy_data, ids_test)}

        # get model and preprocessed data and labels according to the required model
        model, data, labels, ids = get_required_model(raw_data, seq_length=seq_length)
        train_ids, val_ids, test_ids = ids['Train'], ids['Val'], ids['Test']

        model.summary()

        # The best model (according to `monitor`) is checkpointed per fold;
        # a stale file from an earlier run must not be mistaken for this run's best.
        model_filename = 'model_' + model_unique_filename + 'cv' + str(fold) + '.h5'
        if os.path.exists(model_filename):
            os.remove(model_filename)

        # Both callbacks watch the same metric so that "best" and "stop" agree.
        callbacks = [EarlyStopping(monitor=monitor, patience=10),
                     ModelCheckpoint(model_filename,
                                     monitor=monitor,
                                     save_best_only=True,
                                     save_weights_only=False)]

        # Train model
        model.fit(data['Train'], labels['Train'], validation_data=(data['Val'], labels['Val']),
                  epochs=epochs, verbose=1, callbacks=callbacks, shuffle=True)

        try:
            # Prefer the checkpointed best model over the last-epoch weights.
            model_trained = load_model(model_filename)
            train_pred = model_trained.predict(data['Train'])
            val_pred = model_trained.predict(data['Val'])
            test_pred = model_trained.predict(data['Test'])
        except Exception as err:
            # Best-effort fallback, but say so: this fold's numbers then come from
            # the final epoch rather than the best checkpoint.
            print("WARNING: could not use checkpoint", model_filename,
                  "- falling back to the in-memory model:", repr(err))
            train_pred = model.predict(data['Train'])
            val_pred = model.predict(data['Val'])
            test_pred = model.predict(data['Test'])

        # Release the Keras graph/session state before the next fold builds a model.
        clear_session()

        # NOTE(review): the TEST call is unpacked as (results, ratings) while the
        # TRAIN/VAL results are printed as returned -- confirm what
        # performance_test_CV returns for each set name.
        train_results = performance_test_CV(data['Train'], train_pred, labels['Train'], train_ids, "TRAIN",
                                            seq_length=seq_length, extension=extension)
        val_results = performance_test_CV(data['Val'], val_pred, labels['Val'], val_ids, "VAL",
                                          seq_length=seq_length, extension=extension)
        test_results, ratings_test = performance_test_CV(data['Test'], test_pred, labels['Test'], test_ids, "TEST",
                                                         seq_length=seq_length, extension=extension)

        print("FOLD: ", fold)
        print("TRAIN SET: ", train_results)
        print("VAL SET: ", val_results)
        print("TEST SET: ", test_results)

        # Pool the test ratings of all folds for the summary plot.
        ratings['correct'].extend(ratings_test['correct'])
        ratings['incorrect'].extend(ratings_test['incorrect'])

    print(ratings)
    fig, ax = plt.subplots()
    # Hand matplotlib real sequences rather than dict views.
    ax.boxplot(list(ratings.values()))
    ax.set_xticklabels(list(ratings.keys()))
    plt.show()

    return

In [None]:
def check_dimensions(data, desired=60):
    """Print the length of every element of ``data``, one per line.

    Debugging helper for eyeballing sequence lengths. ``desired`` is accepted
    but not used; no comparison against it is performed.

    Parameters
    ----------
    data : iterable
        Items that each support ``len()``.
    desired : int
        Unused.
    """
    for item_length in map(len, data):
        print(item_length)

    return None