In [9]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import brier_score_loss
from sklearn.metrics import accuracy_score

import re
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [10]:
pd.set_option('display.max_columns', None)

In [11]:
path_to_data = '../data/'

np.random.seed(42)

In [12]:
# uncomment for women's analyses:

m_or_w = 'WOMEN'

raw_reg_szn = pd.read_csv(os.path.join(path_to_data, 'WRegularSeasonCompactResults.csv'))
raw_conf_tourn = pd.read_csv(os.path.join(path_to_data, 'WConferenceTourneyGames.csv'))
team_data = pd.read_csv(os.path.join(path_to_data, 'WTeams.csv'))
raw_mm_tourn = pd.read_csv(os.path.join(path_to_data, 'WNCAATourneyCompactResults.csv'))
raw_secondary_tourn = pd.read_csv(os.path.join(path_to_data, 'WSecondaryTourneyCompactResults.csv'))
conf_data = pd.read_csv(os.path.join(path_to_data, 'WTeamConferences.csv'))
tourney_slots = pd.read_csv(os.path.join(path_to_data, 'WNCAATourneySlots.csv'))

# does not exist for women, but should be the same as for men
seed_slots = pd.read_csv(os.path.join(path_to_data, 'MNCAATourneySeedRoundSlots.csv'))

tourney_seeds = pd.read_csv(os.path.join(path_to_data, 'WNCAATourneySeeds.csv'))

train_reg_szn_diffs_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'WOMENS_train_diffs.csv'), index_col= 0)
val_reg_szn_diffs_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'WOMENS_val_diffs.csv'), index_col= 0)
reg_szn_diffs_2025_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'WOMENS_test_diffs.csv'), index_col= 0)


In [13]:
# # uncomment for men's analyses

# m_or_w = 'MEN'

# raw_reg_szn = pd.read_csv(os.path.join(path_to_data, 'MRegularSeasonCompactResults.csv'))
# raw_conf_tourn = pd.read_csv(os.path.join(path_to_data, 'MConferenceTourneyGames.csv'))
# team_data = pd.read_csv(os.path.join(path_to_data, 'MTeams.csv'))
# raw_mm_tourn = pd.read_csv(os.path.join(path_to_data, 'MNCAATourneyCompactResults.csv'))
# raw_secondary_tourn = pd.read_csv(os.path.join(path_to_data, 'MSecondaryTourneyCompactResults.csv'))
# conf_data = pd.read_csv(os.path.join(path_to_data, 'MTeamConferences.csv'))
# tourney_slots = pd.read_csv(os.path.join(path_to_data, 'MNCAATourneySlots.csv'))
# seed_slots = pd.read_csv(os.path.join(path_to_data, 'MNCAATourneySeedRoundSlots.csv'))
# tourney_seeds = pd.read_csv(os.path.join(path_to_data, 'MNCAATourneySeeds.csv'))

# train_reg_szn_diffs_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'MENS_train_diffs.csv'), index_col= 0)
# val_reg_szn_diffs_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'MENS_val_diffs.csv'), index_col= 0)
# reg_szn_diffs_2025_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'MENS_test_diffs.csv'), index_col= 0)


In [14]:
train_years = [2003, 2004, 2006, 2007, 2008, 2010, 2011, 2013, 2014, 2015, 2017, 2018, 2021, 2022, 2024]
val_years = [2005, 2009, 2012, 2016, 2019, 2023]

In [15]:
# function definitions

def is_conference_game(team1, team2, season, conf_df = conf_data):
    team1_conf = conf_df['ConfAbbrev'].loc[(conf_df['Season'] == season) & (conf_df['TeamID'] == team1)].values[0]
    team2_conf = conf_df['ConfAbbrev'].loc[(conf_df['Season'] == season) & (conf_df['TeamID'] == team2)].values[0]

    if team1_conf == team2_conf:
        return True
    return False

def get_elo_win_prob(elo1, elo2, home_court_advantage_boost_1 = 0, home_court_advantage_boost_2 = 0):
    '''
    returns probability of team1 (with elo1) beating team 2 (with elo2)
    '''

    elo1 += home_court_advantage_boost_1
    elo2 += home_court_advantage_boost_2

    return  1 / (1 + 10 ** ((elo2 - elo1) / 400))



def update_elo(prev_elo1, prev_elo2, k, winner, hc_boost_1, hc_boost_2):
    '''
    returns updated elos for team 1 (with prev_elo1) and team 2 (prev_elo2) based on k
    we don't care about margin of victory, just wins and losses:
    '''

    team1_winprob = get_elo_win_prob(prev_elo1, prev_elo2, hc_boost_1, hc_boost_2)
    team2_winprob = 1 - team1_winprob

    if winner == 1:

        new_elo1 = prev_elo1 + k*(1 - team1_winprob)
        new_elo2 = prev_elo2 + k*(0 - team1_winprob)

    elif winner == 2:

        new_elo1 = prev_elo1 + k*(0 - team1_winprob)
        new_elo2 = prev_elo2 + k*(1 - team1_winprob)

    return new_elo1, new_elo2 



def make_final_elo_df(year, score_result_df, k_scheme, home_team_adjustment = 0, fix_k = False, team_elo_dict = None):
    '''
    Make a df of teams' elos at the end of the regular season
    
    '''

    season_df = score_result_df.loc[score_result_df['Season'] == year]

    # get all unique team ids in the season:
    all_team_ids = np.unique(season_df[['WTeamID', 'LTeamID']].values)

    if team_elo_dict is None:
        # initialize all teams' elo to 1500 to begin the season
        team_elo_dict = {int(team):[1500] for team in all_team_ids}

    for rownum, rowvals in tqdm(season_df.iterrows()):
        
        winning_team = int(rowvals['WTeamID'])
        losing_team = int(rowvals['LTeamID'])

        winning_team_prev_elo = team_elo_dict[winning_team][-1]
        losing_team_prev_elo = team_elo_dict[losing_team][-1]

        conf_game = is_conference_game(team1 = winning_team, 
                                       team2 = losing_team,
                                       season = year, 
                                       conf_df = conf_data)
        if fix_k:
            this_k = k_scheme['fixed']
        else:
            if conf_game:
                this_k = k_scheme['conf']
            else:
                this_k = k_scheme['ooc']

        if home_team_adjustment != 0:
            # if the winning team was at home ... 
            if rowvals['WLoc'] == 'H':
                winning_elo_adj = home_team_adjustment
                losing_elo_adj = 0
            
            # if the losing team was at home ... 
            elif rowvals['WLoc'] == 'A':
                winning_elo_adj = 0
                losing_elo_adj = home_team_adjustment
            
            # if neutral site, neither team gets an elo boost
            elif rowvals['WLoc'] == 'N':
                winning_elo_adj = 0
                losing_elo_adj = 0

        # useful for conference tourney games that lack this data ... 
        else:
            winning_elo_adj = 0
            losing_elo_adj = 0

            


        new_winner_elo, new_loser_elo = update_elo(prev_elo1=winning_team_prev_elo,
                                                prev_elo2 = losing_team_prev_elo,
                                                k = this_k,
                                                winner = 1,
                                                hc_boost_1 = winning_elo_adj,
                                                hc_boost_2= losing_elo_adj
                                                )
        
        team_elo_dict[winning_team].append(new_winner_elo)
        team_elo_dict[losing_team].append(new_loser_elo)

    final_elos = {team_id:elo_scores[-1] for team_id, elo_scores in team_elo_dict.items()}
    final_elo_df = pd.DataFrame.from_dict(final_elos, orient = 'index')
    final_elo_df.columns = ['elo']
    final_elo_df = pd.merge(left = final_elo_df, right = team_data, left_index=True, right_on = 'TeamID')
    final_elo_df = final_elo_df.sort_values('elo', ascending = False)

    return team_elo_dict, final_elo_df


def create_x_y_data(season, reg_szn_data, final_elo_df_szn, stat_feats, include_winner_in_x = False, 
                    include_secondary_tourney_data = True,
                    include_conf_tourney_data = False,
                    m_or_w = m_or_w):
    
    
    '''
    if include_conf_tourney_data, we will NOT adjust ELO from conference tournament results; we will INCLUDE it in the training process
    '''
    
    # train optimal mod:
    if m_or_w == 'MEN':
        prefix = 'M'
    elif m_or_w == 'WOMEN':
        prefix = 'W'


    raw_mm_tourn = pd.read_csv(os.path.join(path_to_data, f'{prefix}NCAATourneyCompactResults.csv'))
    raw_secondary_tourn = pd.read_csv(os.path.join(path_to_data, f'{prefix}SecondaryTourneyCompactResults.csv'))
    raw_conf_tourn = pd.read_csv(os.path.join(path_to_data, f'{prefix}ConferenceTourneyGames.csv'))

    my_regszn_szn = reg_szn_data.loc[reg_szn_data['Season'] == season]
    conf_data_szn = conf_data.loc[conf_data['Season'] == season]
    regszn_avgs_szn = my_regszn_szn.groupby('TeamID')[stat_feats].mean()

    if include_secondary_tourney_data:
        

        raw_mm_tourn_szn = raw_mm_tourn.loc[raw_mm_tourn['Season'] == season]
        raw_secondary_tourn_szn = raw_secondary_tourn.loc[raw_secondary_tourn['Season'] == season]
        postseason_games = pd.concat([raw_mm_tourn_szn, raw_secondary_tourn_szn], axis = 0, ignore_index=True)
    else:
        postseason_games = raw_mm_tourn.loc[raw_mm_tourn['Season'] == season]

    if include_conf_tourney_data:
        raw_conf_szn = raw_conf_tourn.loc[raw_conf_tourn['Season'] == season]
        postseason_games = pd.concat([postseason_games, raw_conf_szn], axis = 0, ignore_index=True)
        

    
        

    regszn_avgs_szn = pd.merge(left = regszn_avgs_szn, right = conf_data_szn[['TeamID', 'ConfAbbrev']], on = 'TeamID')

    team_feats_szn = pd.merge(left = regszn_avgs_szn, right = final_elo_df_szn[['elo', 'TeamID']], on = 'TeamID')
    

    all_pairwise_data = []
    all_pairwise_diffs_data = []
    winner_list = []


    for rownum, rowvals in postseason_games.iterrows():



        # team1 will always be the team with the lower team ID

        team_ids = [rowvals['WTeamID'], rowvals['LTeamID']]
        team1 = min(team_ids)
        team2 = max(team_ids)

        if team1 == rowvals['WTeamID']:
            winner = 0
        elif team2 == rowvals['WTeamID']:
            winner = 1
        
        team1_vals = team_feats_szn.loc[team_feats_szn['TeamID'] == team1].values.tolist()[0]
        team1_dict = dict(zip([f'{col}_1' for col in team_feats_szn.columns], team1_vals))

        team2_vals = team_feats_szn.loc[team_feats_szn['TeamID'] == team2].values.tolist()[0]
        team2_dict = dict(zip([f'{col}_2' for col in team_feats_szn.columns], team2_vals))

        diffs_dict = {}
        for team_feat in team1_dict.keys():
            
            # remove the _# so that we can use for team2 as well
            team_feat_stem = team_feat[:-2]

            # if this is a stat-related column, find the difference between team 1 and team 2
            if team_feat_stem in stat_feats:
                diffs_dict[f'{team_feat_stem}_diff'] = team1_dict[f'{team_feat_stem}_1'] - team2_dict[f'{team_feat_stem}_2']

            # if it's not a numeric stat column, add existing elements to dict
            else:
                diffs_dict[f'{team_feat_stem}_1'] = team1_dict[f'{team_feat_stem}_1']
                diffs_dict[f'{team_feat_stem}_2'] = team2_dict[f'{team_feat_stem}_2']
        
        if include_winner_in_x:
            diffs_dict.update({'winner': winner})
            team1_dict.update({'winner':winner})

        all_pairwise_diffs_data.append(diffs_dict)

        team1_dict.update(team2_dict)
        all_pairwise_data.append(team1_dict)

        
        winner_list.append(winner)

        


    pairwise_df_szn = pd.DataFrame(all_pairwise_data)
    pairwise_diffs_df_szn = pd.DataFrame(all_pairwise_diffs_data)

    # concat confs, then drop team ids and conference data
    concat_confs = pairwise_df_szn['ConfAbbrev_1']+ '_' + pairwise_df_szn['ConfAbbrev_2']
    pairwise_df_szn.insert(column = 'merged_conf', value = concat_confs, loc = pairwise_df_szn.shape[1])

    pairwise_df_szn.drop(columns = ['ConfAbbrev_1', 'ConfAbbrev_2', 'TeamID_1', 'TeamID_2'], inplace = True)

    # concat confs, then drop team ids and conference data
    concat_confs = pairwise_diffs_df_szn['ConfAbbrev_1']+ '_' + pairwise_diffs_df_szn['ConfAbbrev_2']
    pairwise_diffs_df_szn.insert(column = 'merged_conf', value = concat_confs, loc = pairwise_diffs_df_szn.shape[1])

    pairwise_diffs_df_szn.drop(columns = ['ConfAbbrev_1', 'ConfAbbrev_2', 'TeamID_1', 'TeamID_2'], inplace = True)

    return pairwise_df_szn, pairwise_diffs_df_szn, winner_list


def model_train_workflow(train_x, train_y, val_x, val_y, params_to_categorize = [], scaler = None, grid_search=True, 
                         model_type = 'xgb', one_hot_encode_cat = False):
    
    
    if len(params_to_categorize) > 0:
        for param in params_to_categorize:
            train_x[param] = train_x[param].astype('category')
            val_x[param] = val_x[param].astype('category')
    scale_cols = [col for col in train_x.columns if col not in params_to_categorize]


    if one_hot_encode_cat:
        # Define transformers
        preprocessor = ColumnTransformer([
            ('num', scaler, scale_cols),  
            ('cat', OneHotEncoder(handle_unknown='ignore'), params_to_categorize) 
        ], remainder='passthrough') 
    else:
        # Define transformers
        preprocessor = ColumnTransformer([
            ('num', scaler, scale_cols)  
        ], remainder='passthrough')  


    if model_type == 'xgb':
        clf = xgb.XGBClassifier(tree_method="hist", enable_categorical=True)

       

        if grid_search:

            # Create a pipeline with preprocessing and model
            pipeline = Pipeline([
                ('preprocessor', preprocessor),  
                ('xgb', clf)  
            ])

            # Define hyperparameter grid
            param_grid = {
                'xgb__n_estimators': [50, 100, 200],  
                'xgb__max_depth': [3, 5, 7],  
                'xgb__learning_rate': [0.01, 0.1, 0.2],  
            }

            # Set up GridSearchCV
            grid_search = GridSearchCV(
                estimator=pipeline,
                param_grid=param_grid,
                scoring='accuracy',
                cv=5,
                verbose=1,
                n_jobs=-1
            )

            # Fit the grid search
            grid_search.fit(train_x, train_y)

            # Best parameters and score
            print("Best Parameters:", grid_search.best_params_)
            print("Best Cross-Validation Score:", grid_search.best_score_)

            # Evaluate on test set
            best_model = grid_search.best_estimator_
        else:
            # Define an XGBoost classifier with default hyperparameters
            if one_hot_encode_cat:
                clf = xgb.XGBClassifier(
                    use_label_encoder=False, 
                    eval_metric='logloss', 
                    n_estimators=100, 
                    max_depth=5, 
                    learning_rate=0.1
                )

                
            else:
                clf = xgb.XGBClassifier(
                    use_label_encoder=True, 
                    eval_metric='logloss', 
                    n_estimators=100, 
                    max_depth=5, 
                    learning_rate=0.1
                )

            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('xgb', clf)
            ])


            pipeline.fit(train_x, train_y)
            best_model = clf
    


    elif model_type == 'logreg':
        
        clf = LogisticRegression()


        if grid_search:
 
            
            # Create a pipeline with preprocessing and model
            pipeline = Pipeline([
                ('preprocessor', preprocessor),  
                ('logreg', clf)  
            ])

            # Define hyperparameter grid
            param_grid = {
                'logreg__C': [0.01, 0.1, 1, 10, 100],  
                'logreg__penalty': ['l1', 'l2'], 
                'logreg__solver': ['liblinear', 'saga'],
                'logreg__max_iter': [100, 500, 1000]  
            }

            # Set up GridSearchCV
            grid_search = GridSearchCV(
                estimator=pipeline,
                param_grid=param_grid,
                scoring='accuracy',
                cv=5,  
                verbose=1,
                n_jobs=-1
            )

            # Fit the grid search
            grid_search.fit(train_x, train_y)

            print("Best Parameters:", grid_search.best_params_)
            print("Best Cross-Validation Score:", grid_search.best_score_)

            best_model = grid_search.best_estimator_
        
        else:
            if one_hot_encode_cat:
                clf = LogisticRegression(solver = 'saga',
                                         penalty = 'l2')

                
            else:
                print('need one hot encoding for logistic regression')
                return

            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('xgb', clf)
            ])


            pipeline.fit(train_x, train_y)
            best_model = clf


    pred_probs = best_model.predict_proba(val_x)
    preds = best_model.predict(val_x)
    class_1_probas = [probs[1] for probs in pred_probs]
    brier_loss = brier_score_loss(y_true = val_y, y_proba = class_1_probas)
    accuracy = accuracy_score(y_true = val_y, y_pred = preds)

    print("Brier loss:", brier_loss)
    print("Accuracy:", accuracy)

    if grid_search:
        return brier_loss, accuracy, grid_search.best_params_
    else:
        return brier_loss, accuracy, 


def full_workflow(k_dict, reg_szn_data_type, include_conf_res, include_secondary_res,
                  feature_sets_to_include, run_name, mods_to_include = ['xgb', 'logreg'],
                  scalers_to_include = [MinMaxScaler(), StandardScaler()],
                  m_or_w = m_or_w):
    
    '''
    if include_conf_res, we will NOT adjust elo in conference tournaments; we WILL use conf tourney results to train
    '''
    

    
    if m_or_w == 'MEN':
        prefix = 'M'
    elif m_or_w == 'WOMEN':
        prefix = 'W'
    raw_reg_szn = pd.read_csv(os.path.join(path_to_data, f'{prefix}RegularSeasonCompactResults.csv'))
    raw_conf_tourn = pd.read_csv(os.path.join(path_to_data, f'{prefix}ConferenceTourneyGames.csv'))
    
    


    # if reg_szn_data_type == 'BIGSZN':
    #     reg_szn_train_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'big_reg_szn_train.csv'), index_col = 0)
    #     reg_szn_val_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'big_reg_szn_val.csv'), index_col = 0)
        
    if reg_szn_data_type == 'DIFFS':
        
        if m_or_w == 'MEN':
            reg_szn_train_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'MENS_train_diffs.csv'), index_col= 0)
            reg_szn_val_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'MENS_val_diffs.csv'), index_col= 0)
        elif m_or_w == 'WOMEN':
            reg_szn_train_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'WOMENS_train_diffs.csv'), index_col= 0)
            reg_szn_val_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'WOMENS_val_diffs.csv'), index_col= 0)
    stat_feats = list(reg_szn_train_data.columns[2:])
    
    ######################################## training data
    train_list_of_nondiffs_dfs = []
    train_list_of_diffs_dfs = []
    train_list_of_ys = []

    for season in reg_szn_train_data['Season'].unique():

        team_elo_dict, final_elo_df = make_final_elo_df(year = season, score_result_df=raw_reg_szn, team_elo_dict=None, k_scheme=k_dict, fix_k = False)

        # we only want to perform conference tournament ELO adjustments if we ARE NOT using conference data as trainable results
        if not include_conf_res:
            postcon_team_elo_dict, postcon_final_elo_df = make_final_elo_df(year = season, score_result_df=raw_conf_tourn, team_elo_dict=team_elo_dict, k_scheme=k_dict, fix_k= True)
            x_pairwise, x_pairwise_diffs, y = create_x_y_data(season = season, final_elo_df_szn = postcon_final_elo_df, reg_szn_data=reg_szn_train_data, 
                                                        stat_feats = stat_feats, include_winner_in_x = False, include_conf_tourney_data=False,
                                                        include_secondary_tourney_data=include_secondary_res)
        elif include_conf_res:  
            x_pairwise, x_pairwise_diffs, y = create_x_y_data(season = season, final_elo_df_szn = final_elo_df, reg_szn_data=reg_szn_train_data, 
                                                            stat_feats = stat_feats, include_winner_in_x = False, include_conf_tourney_data=True,
                                                            include_secondary_tourney_data=include_secondary_res)

        train_list_of_nondiffs_dfs.append(x_pairwise)
        train_list_of_diffs_dfs.append(x_pairwise_diffs)
        train_list_of_ys.extend(y)

    full_nondiffs_df_train = pd.concat(train_list_of_nondiffs_dfs, axis = 0, ignore_index=True)
    full_diffs_df_train = pd.concat(train_list_of_diffs_dfs, axis = 0, ignore_index=True)
    full_y_df_train = pd.DataFrame(train_list_of_ys)

    ######################################### validation data
    val_list_of_nondiffs_dfs = []
    val_list_of_diffs_dfs = []
    val_list_of_ys = []

    for season in reg_szn_val_data['Season'].unique():

        team_elo_dict, final_elo_df = make_final_elo_df(year = season, score_result_df=raw_reg_szn, team_elo_dict=None, k_scheme=k_dict, fix_k = False)

        if not include_conf_res:
            postcon_team_elo_dict, postcon_final_elo_df = make_final_elo_df(year = season, score_result_df=raw_conf_tourn, team_elo_dict=team_elo_dict, k_scheme=k_dict, fix_k= True)

            x_pairwise, x_pairwise_diffs, y = create_x_y_data(season = season, final_elo_df_szn = postcon_final_elo_df, reg_szn_data=reg_szn_val_data, stat_feats=stat_feats,
                                                            include_winner_in_x = False, include_conf_tourney_data=False,
                                                            include_secondary_tourney_data=include_secondary_res)
        elif include_conf_res:
            x_pairwise, x_pairwise_diffs, y = create_x_y_data(season = season, final_elo_df_szn = final_elo_df, reg_szn_data=reg_szn_val_data, stat_feats=stat_feats,
                                                            include_winner_in_x = False, include_conf_tourney_data=True,
                                                            include_secondary_tourney_data=include_secondary_res)

        val_list_of_nondiffs_dfs.append(x_pairwise)
        val_list_of_diffs_dfs.append(x_pairwise_diffs)
        val_list_of_ys.extend(y)

    full_nondiffs_df_val = pd.concat(val_list_of_nondiffs_dfs, axis = 0, ignore_index=True)
    full_diffs_df_val = pd.concat(val_list_of_diffs_dfs, axis = 0, ignore_index=True)
    full_y_df_val = pd.DataFrame(val_list_of_ys)

    ######################################### model testing

    dataset_dict_list = []

    if 'barebones_merged_diffs' in feature_sets_to_include:

        # matchup diffs
        barebones_diffs_merged_train = full_diffs_df_train[['elo_1', 'elo_2', 'merged_conf']].copy(deep = True)
        barebones_diffs_merged_val = full_diffs_df_val[['elo_1', 'elo_2', 'merged_conf']].copy(deep = True)
        barebones_diffs_merged_data_dict = {'name':'barebones_merged_diffs',
                        'x_train':barebones_diffs_merged_train,
                        'x_val': barebones_diffs_merged_val,
                        'y_train':full_y_df_train,
                        'y_val':full_y_df_val}
        dataset_dict_list.append(barebones_diffs_merged_data_dict)

    if 'barebones_merged_nondiffs' in feature_sets_to_include:

        barebones_nondiffs_merged_train = full_nondiffs_df_train[['elo_1', 'elo_2', 'merged_conf']].copy(deep = True)
        barebones_nondiffs_merged_val = full_nondiffs_df_val[['elo_1', 'elo_2', 'merged_conf']].copy(deep = True)
        barebones_nondiffs_merged_data_dict = {'name':'barebones_merged_nondiffs',
                        'x_train':barebones_nondiffs_merged_train,
                        'x_val': barebones_nondiffs_merged_val,
                        'y_train':full_y_df_train,
                        'y_val':full_y_df_val}
        dataset_dict_list.append(barebones_nondiffs_merged_data_dict)


    if 'barebones_sepconf_diffs' in feature_sets_to_include:
        barebones_diffs_train_sepconf = full_diffs_df_train[['elo_1', 'elo_2', 'merged_conf']].copy(deep = True)
        barebones_diffs_train_sepconf[['conf_1', 'conf_2']] = barebones_diffs_train_sepconf['merged_conf'].str.split('_', n = 1, expand = True)
        barebones_diffs_train_sepconf.drop(columns = 'merged_conf', inplace = True)
        barebones_diffs_val_sepconf = full_diffs_df_val[['elo_1', 'elo_2', 'merged_conf']].copy(deep = True)
        barebones_diffs_val_sepconf[['conf_1', 'conf_2']] = barebones_diffs_val_sepconf['merged_conf'].str.split('_', n = 1, expand = True)
        barebones_diffs_val_sepconf.drop(columns = 'merged_conf', inplace = True)

        barebones_diffs_sepconf_data_dict = {'name': 'barebones_sepconf_diffs',
                        'x_train':barebones_diffs_train_sepconf,
                        'x_val': barebones_diffs_val_sepconf,
                        'y_train':full_y_df_train,
                        'y_val':full_y_df_val}
        dataset_dict_list.append(barebones_diffs_sepconf_data_dict)

    if 'barebones_sepconf_nondiffs' in feature_sets_to_include:

        barebones_nondiffs_train_sepconf = full_nondiffs_df_train[['elo_1', 'elo_2', 'merged_conf']].copy(deep = True)
        barebones_nondiffs_train_sepconf[['conf_1', 'conf_2']] = barebones_nondiffs_train_sepconf['merged_conf'].str.split('_', n = 1, expand = True)
        barebones_nondiffs_train_sepconf.drop(columns = 'merged_conf', inplace = True)
        barebones_nondiffs_val_sepconf = full_nondiffs_df_val[['elo_1', 'elo_2', 'merged_conf']].copy(deep = True)
        barebones_nondiffs_val_sepconf[['conf_1', 'conf_2']] = barebones_nondiffs_val_sepconf['merged_conf'].str.split('_', n = 1, expand = True)
        barebones_nondiffs_val_sepconf.drop(columns = 'merged_conf', inplace = True)

        barebones_nondiffs_sepconf_data_dict = {'name': 'barebones_sepconf_nondiffs',
                        'x_train':barebones_nondiffs_train_sepconf,
                        'x_val': barebones_nondiffs_val_sepconf,
                        'y_train':full_y_df_train,
                        'y_val':full_y_df_val}
        dataset_dict_list.append(barebones_nondiffs_sepconf_data_dict)
    
    
    if 'full_diffs' in feature_sets_to_include:
        full_diffs_data_dict = {
            'name': f'{reg_szn_data_type}_full_diffs',
            'x_train':full_diffs_df_train,
            'x_val': full_diffs_df_val,
            'y_train':full_y_df_train,
            'y_val':full_y_df_val
        }
        dataset_dict_list.append(full_diffs_data_dict)

    if 'full_sepconf_diffs' in feature_sets_to_include:
        full_diffs_train_sepconf = full_diffs_df_train.copy(deep = True)
        full_diffs_train_sepconf[['conf_1', 'conf_2']] = full_diffs_train_sepconf['merged_conf'].str.split('_', n = 1, expand = True)
        full_diffs_train_sepconf.drop(columns = 'merged_conf', inplace = True)
        full_diffs_val_sepconf = full_diffs_df_val.copy(deep = True)
        full_diffs_val_sepconf[['conf_1', 'conf_2']] = full_diffs_val_sepconf['merged_conf'].str.split('_', n = 1, expand = True)
        full_diffs_val_sepconf.drop(columns = 'merged_conf', inplace = True)

        full_sepconf_data_dict = {'name': 'full_sepconf_diffs',
                        'x_train':full_diffs_train_sepconf,
                        'x_val': full_diffs_val_sepconf,
                        'y_train':full_y_df_train,
                        'y_val':full_y_df_val}
        dataset_dict_list.append(full_sepconf_data_dict)

    if 'full_nondiffs' in feature_sets_to_include:
        full_nondiffs_data_dict = {
            'name': f'{reg_szn_data_type}_full_nondiffs',
            'x_train':full_nondiffs_df_train,
            'x_val': full_nondiffs_df_val,
            'y_train':full_y_df_train,
            'y_val':full_y_df_val
        }
        dataset_dict_list.append(full_nondiffs_data_dict)


    # want to ensure scaling steps are not overwriting the values in the dataframe ...
    # will save each df as csv then read it in within each iteration
    # will slow it down tremendously but that's ok for now

    growing_model_results_list = []

    onehot_allowed_dict = {'xgb':[True],
                            'logreg':[True]}
    
    # now iterate through each data dictionary and build multiple models for each set of data dict data
    for data_dict in dataset_dict_list:
        
        this_datadict_path = os.path.join(path_to_data, 'custom_datasets', 'grid_search_res', run_name, k_dict['name'], reg_szn_data_type, data_dict['name'])
        if not os.path.exists(this_datadict_path):
            os.makedirs(this_datadict_path)
        
    #     
        data_dict['x_train'].to_csv(os.path.join(this_datadict_path, 'x_train.csv'), index = False)
        data_dict['x_val'].to_csv(os.path.join(this_datadict_path, 'x_val.csv'), index = False)
        data_dict['y_train'].to_csv(os.path.join(this_datadict_path, 'y_train.csv'), index = False)
        data_dict['y_val'].to_csv(os.path.join(this_datadict_path, 'y_val.csv'), index = False)
    
        for this_scaler in scalers_to_include:
            for this_model in mods_to_include:
                for this_onehot in onehot_allowed_dict[this_model]:
                    
                    x_train = pd.read_csv(os.path.join(this_datadict_path, 'x_train.csv'))
                    x_val = pd.read_csv(os.path.join(this_datadict_path, 'x_val.csv'))
                    y_train = pd.read_csv(os.path.join(this_datadict_path, 'y_train.csv'))
                    y_val = pd.read_csv(os.path.join(this_datadict_path, 'y_val.csv'))
                    
                    raveled_y_train = y_train.values.ravel()
                    raveled_y_val = y_val.values.ravel()

                    # this will dictate which cols are categorized
                    if 'sepconf' in data_dict['name']:
                        brier, acc, params = model_train_workflow(train_x = x_train, train_y = raveled_y_train, 
                                            val_x = x_val, val_y = raveled_y_val, 
                                            params_to_categorize = ['conf_1', 'conf_2'], 
                                            scaler = this_scaler, grid_search=True,
                                            model_type=this_model, one_hot_encode_cat=this_onehot)
                    else:
                        brier, acc, params = model_train_workflow(train_x = x_train, train_y = raveled_y_train, 
                                            val_x = x_val, val_y = raveled_y_val, 
                                            params_to_categorize = ['merged_conf'], 
                                            scaler = this_scaler, grid_search=True,
                                            model_type=this_model, one_hot_encode_cat=this_onehot)
                    
                    this_param_combo_dict = {'scaler': this_scaler,
                                             'model': this_model,
                                             'onehot': this_onehot}

                    this_mod_results_list = [k_dict['name'], data_dict['name'],  str(data_dict), str(this_param_combo_dict), str(params), brier, acc]
                    growing_model_results_list.append(this_mod_results_list)
    
    res_df = pd.DataFrame(growing_model_results_list)


    res_df.to_csv(os.path.join(path_to_data, 'custom_datasets', 'grid_search_res', run_name, k_dict['name'], reg_szn_data_type, 'grid_search_res.csv'))

                    
        
def make_2025_x():

    # create the test X dataframe:
    all_team_ids_2025 = [int(teamid) for teamid in reg_szn_diffs_2025_data['TeamID'].unique()]
    pairwise_teams_2025 = list(itertools.combinations(all_team_ids_2025, 2))
    stat_feats = list(reg_szn_diffs_2025_data.columns[2:])
    conf_data_szn = conf_data.loc[conf_data['Season'] == 2025]
    regszn_avgs_szn = reg_szn_diffs_2025_data.groupby('TeamID')[stat_feats].mean()


    opt_k_dict = {'ooc':20,
                'conf': 40}
    team_elo_dict, final_elo_df = make_final_elo_df(year = 2025, score_result_df=raw_reg_szn, team_elo_dict=None, 
                                                    k_scheme=opt_k_dict, fix_k = False)

    # merge and set default noconf values for the schools missing conf data
    regszn_avgs_szn = reg_szn_diffs_2025_data.groupby('TeamID')[stat_feats].mean()
    regszn_avgs_szn = pd.merge(left = regszn_avgs_szn, right = conf_data_szn[['TeamID', 'ConfAbbrev']], 
                            on = 'TeamID', how = 'left')
    regszn_avgs_szn['ConfAbbrev'] = regszn_avgs_szn['ConfAbbrev'].fillna('noconf')
    team_feats_szn = pd.merge(left = regszn_avgs_szn, right = final_elo_df[['elo', 'TeamID']], on = 'TeamID')

    all_pairwise_data = []
    all_pairwise_diffs_data = []

    for team1, team2 in tqdm(pairwise_teams_2025):

        team1_vals = team_feats_szn.loc[team_feats_szn['TeamID'] == team1].values.tolist()[0]
        team1_dict = dict(zip([f'{col}_1' for col in team_feats_szn.columns], team1_vals))

        team2_vals = team_feats_szn.loc[team_feats_szn['TeamID'] == team2].values.tolist()[0]
        team2_dict = dict(zip([f'{col}_2' for col in team_feats_szn.columns], team2_vals))

        diffs_dict = {}
        for team_feat in team1_dict.keys():
            
            # remove the _# so that we can use for team2 as well
            team_feat_stem = team_feat[:-2]

            # if this is a stat-related column, find the difference between team 1 and team 2
            if team_feat_stem in stat_feats:
                diffs_dict[f'{team_feat_stem}_diff'] = team1_dict[f'{team_feat_stem}_1'] - team2_dict[f'{team_feat_stem}_2']

            # if it's not a numeric stat column, add existing elements to dict
            else:
                diffs_dict[f'{team_feat_stem}_1'] = team1_dict[f'{team_feat_stem}_1']
                diffs_dict[f'{team_feat_stem}_2'] = team2_dict[f'{team_feat_stem}_2']

        all_pairwise_diffs_data.append(diffs_dict)

        team1_dict.update(team2_dict)
        all_pairwise_data.append(team1_dict)




    pairwise_df_szn = pd.DataFrame(all_pairwise_data)
    pairwise_diffs_df_szn = pd.DataFrame(all_pairwise_diffs_data)

    # concat confs, then drop team ids and conference data
    concat_confs = pairwise_df_szn['ConfAbbrev_1']+ '_' + pairwise_df_szn['ConfAbbrev_2']
    pairwise_df_szn.insert(column = 'merged_conf', value = concat_confs, loc = pairwise_df_szn.shape[1])

    pairwise_df_szn.drop(columns = ['ConfAbbrev_1', 'ConfAbbrev_2', 'TeamID_1', 'TeamID_2'], inplace = True)

    # concat confs, then drop team ids and conference data
    concat_confs = pairwise_diffs_df_szn['ConfAbbrev_1']+ '_' + pairwise_diffs_df_szn['ConfAbbrev_2']
    pairwise_diffs_df_szn.insert(column = 'merged_conf', value = concat_confs, loc = pairwise_diffs_df_szn.shape[1])

    pairwise_diffs_df_szn.drop(columns = ['ConfAbbrev_1', 'ConfAbbrev_2', 'TeamID_1', 'TeamID_2'], inplace = True)

    return pairwise_df_szn, pairwise_diffs_df_szn


def probs_to_df(savename, pairwise_probs, reg_szn_data = reg_szn_diffs_2025_data):

    all_team_ids_2025 = [int(teamid) for teamid in reg_szn_data['TeamID'].unique()]
    pairwise_teams_2025 = list(itertools.combinations(all_team_ids_2025, 2))

    growing_list_of_probs = []

    assert len(pairwise_teams_2025) == len(pairwise_probs)
    for obs_num in range(len(pairwise_teams_2025)):
        team1, team2 = pairwise_teams_2025[obs_num]
        team1_winprob, team2_winprob = pairwise_probs[obs_num]
        this_list = [team1, team1_winprob, team2, team2_winprob]
        growing_list_of_probs.append(this_list)

    pairwise_winprob_2025_df = pd.DataFrame(growing_list_of_probs)
    pairwise_winprob_2025_df.columns = ['team1', 'winprob1', 'team2', 'winprob2']
    pairwise_winprob_2025_df.to_csv(os.path.join(path_to_data, 'custom_datasets', savename), index=False)

    return pairwise_winprob_2025_df




def prob_df_to_bracket_results(savename,
                               pairwise_winprob_df,
                               tourney_seeds = tourney_seeds,
                               tourney_slots = tourney_slots):
    

    tourney_seeds_2025 = tourney_seeds.loc[tourney_seeds['Season'] == 2025].reset_index(drop = True)
    tourney_slots_2025 = tourney_slots.loc[tourney_slots['Season'] == 2025].reset_index(drop = True)

    # identify the play-in round games by their row number
    play_in_rownums = list(tourney_slots_2025.loc[tourney_slots_2025['StrongSeed'].str.endswith('a')].index)

    # move these rows to the top

    tourney_slots_2025 = pd.concat([tourney_slots_2025.iloc[play_in_rownums], tourney_slots_2025.drop(index=play_in_rownums)], axis = 0)

    tourney_seeds_2025_dict = dict(zip(tourney_seeds_2025['Seed'], tourney_seeds_2025['TeamID']))

    readable_matchup_winner_list = []

    for rownum, rowvals in tqdm(tourney_slots_2025.iterrows()):

        teama_seed = rowvals['StrongSeed']
        teama_id = tourney_seeds_2025_dict[teama_seed]
        teama_school = team_data['TeamName'].loc[team_data['TeamID'] == teama_id].values[0]

        teamb_seed = rowvals['WeakSeed']
        teamb_id = tourney_seeds_2025_dict[teamb_seed]
        teamb_school = team_data['TeamName'].loc[team_data['TeamID'] == teamb_id].values[0]

        winner_slot = rowvals['Slot']

        rownum = int(pairwise_winprob_df.loc[pairwise_winprob_df.apply(lambda row: frozenset([row['team1'], row['team2']]) == frozenset([teama_id, teamb_id]), axis = 1)].index.values[0])
        this_row = pairwise_winprob_df.iloc[rownum,:]

        if this_row['team1'] == teama_id:

            teama_winprob = this_row['winprob1']
            teamb_winprob = this_row['winprob2']

            
        elif this_row['team1'] == teamb_id:

            teama_winprob = this_row['winprob2']
            teamb_winprob = this_row['winprob1']

        
        if teama_winprob > teamb_winprob:
            winner = teama_id
        elif teamb_winprob > teama_winprob:
            winner = teamb_id

        winner_school = team_data['TeamName'].loc[team_data['TeamID'] == winner].values[0]

        # update the seed dict with the winner adopting the new slot
        tourney_seeds_2025_dict[winner_slot] = winner
        
        this_matchup_entry = [teama_school, teamb_school, winner_school, np.round(teama_winprob, 3), np.round(teamb_winprob, 3)]

        readable_matchup_winner_list.append(this_matchup_entry)




    bracket_res_df = pd.DataFrame(readable_matchup_winner_list)
    bracket_res_df.to_csv(os.path.join(path_to_data, 'custom_datasets', savename), index = False)

    return bracket_res_df


def train_optimal_logreg_mod(savename_stem,
                            opt_k_dict,
                            clf_param_dict,
                            feature_set_to_include,
                            opt_scaler = StandardScaler(),
                            m_or_w = m_or_w):

    # train optimal mod:
    if m_or_w == 'MEN':
        prefix = 'MENS'
    elif m_or_w == 'WOMEN':
        prefix = 'WOMENS'

    train_reg_szn_diffs_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', f'{prefix}_train_diffs.csv'), index_col= 0)
    val_reg_szn_diffs_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', f'{prefix}_val_diffs.csv'), index_col= 0)
    reg_szn_diffs_2025_data = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', f'{prefix}_test_diffs.csv'), index_col= 0)
    

    stat_feats = list(reg_szn_diffs_2025_data.columns[2:])

    train_list_of_nondiffs_dfs = []
    train_list_of_diffs_dfs = []
    train_list_of_ys = []

    for season in train_reg_szn_diffs_data['Season'].unique():

        team_elo_dict, final_elo_df = make_final_elo_df(year = season, score_result_df=raw_reg_szn, team_elo_dict=None, k_scheme=opt_k_dict, fix_k = False)



        x_pairwise, x_pairwise_diffs, y = create_x_y_data(season = season, final_elo_df_szn = final_elo_df, reg_szn_data=train_reg_szn_diffs_data, 
                                                        stat_feats = stat_feats, include_winner_in_x = False, include_conf_tourney_data=True,
                                                        include_secondary_tourney_data=False)

        train_list_of_nondiffs_dfs.append(x_pairwise)
        train_list_of_diffs_dfs.append(x_pairwise_diffs)
        train_list_of_ys.extend(y)

    full_nondiffs_df_train = pd.concat(train_list_of_nondiffs_dfs, axis = 0, ignore_index=True)
    full_diffs_df_train = pd.concat(train_list_of_diffs_dfs, axis = 0, ignore_index=True)
    full_y_df_train = pd.DataFrame(train_list_of_ys)

    barebones_diffs_train_sepconf = full_diffs_df_train[['elo_1', 'elo_2', 'merged_conf']].copy(deep = True)
    barebones_diffs_train_sepconf[['conf_1', 'conf_2']] = barebones_diffs_train_sepconf['merged_conf'].str.split('_', n = 1, expand = True)
    barebones_diffs_train_sepconf.drop(columns = 'merged_conf', inplace = True)

    full_diffs_train_sepconf = full_diffs_df_train.copy(deep = True)
    full_diffs_train_sepconf[['conf_1', 'conf_2']] = full_diffs_train_sepconf['merged_conf'].str.split('_', n = 1, expand = True)
    full_diffs_train_sepconf.drop(columns = 'merged_conf', inplace = True)


    ######################################### validation data
    val_list_of_nondiffs_dfs = []
    val_list_of_diffs_dfs = []
    val_list_of_ys = []

    for season in val_reg_szn_diffs_data['Season'].unique():

        team_elo_dict, final_elo_df = make_final_elo_df(year = season, score_result_df=raw_reg_szn, team_elo_dict=None, k_scheme=opt_k_dict, fix_k = False)

        x_pairwise, x_pairwise_diffs, y = create_x_y_data(season = season, final_elo_df_szn = final_elo_df, reg_szn_data=val_reg_szn_diffs_data, stat_feats=stat_feats,
                                                        include_winner_in_x = False, include_conf_tourney_data=True,
                                                        include_secondary_tourney_data=False)

        val_list_of_nondiffs_dfs.append(x_pairwise)
        val_list_of_diffs_dfs.append(x_pairwise_diffs)
        val_list_of_ys.extend(y)

    full_nondiffs_df_val = pd.concat(val_list_of_nondiffs_dfs, axis = 0, ignore_index=True)
    full_diffs_df_val = pd.concat(val_list_of_diffs_dfs, axis = 0, ignore_index=True)
    full_y_df_val = pd.DataFrame(val_list_of_ys)

    barebones_diffs_val_sepconf = full_diffs_df_val[['elo_1', 'elo_2', 'merged_conf']].copy(deep = True)
    barebones_diffs_val_sepconf[['conf_1', 'conf_2']] = barebones_diffs_val_sepconf['merged_conf'].str.split('_', n = 1, expand = True)
    barebones_diffs_val_sepconf.drop(columns = 'merged_conf', inplace = True)

    full_diffs_val_sepconf = full_diffs_df_val.copy(deep = True)
    full_diffs_val_sepconf[['conf_1', 'conf_2']] = full_diffs_val_sepconf['merged_conf'].str.split('_', n = 1, expand = True)
    full_diffs_val_sepconf.drop(columns = 'merged_conf', inplace = True)


    ######################################### 2025 data
    full_nondiffs_df_test, full_diffs_df_test = make_2025_x()

    barebones_diffs_test_sepconf = full_diffs_df_test[['elo_1', 'elo_2', 'merged_conf']].copy(deep = True)
    barebones_diffs_test_sepconf[['conf_1', 'conf_2']] = barebones_diffs_test_sepconf['merged_conf'].str.split('_', n = 1, expand = True)
    barebones_diffs_test_sepconf.drop(columns = 'merged_conf', inplace = True)

    full_diffs_test_sepconf = full_diffs_df_test.copy(deep = True)
    full_diffs_test_sepconf[['conf_1', 'conf_2']] = full_diffs_test_sepconf['merged_conf'].str.split('_', n = 1, expand = True)
    full_diffs_test_sepconf.drop(columns = 'merged_conf', inplace = True)


    ######################################### modeling
    
    clf = LogisticRegression(**clf_param_dict)

    if 'sepconf' in feature_set_to_include:
        params_to_categorize = ['conf_1', 'conf_2']
    else:
        params_to_categorize = ['merged_conf']

    

    if feature_set_to_include == 'full_sepconf_diffs':

        for param in params_to_categorize:
            full_diffs_test_sepconf[param] = full_diffs_test_sepconf[param].astype('category')

        scale_cols = [col for col in full_diffs_test_sepconf.columns if col not in params_to_categorize]

        preprocessor = ColumnTransformer([
            ('num', opt_scaler, scale_cols), 
            ('cat', OneHotEncoder(handle_unknown='ignore'), params_to_categorize)
        ], remainder='passthrough')  
            
        pipeline = Pipeline([
            ('preprocessor', preprocessor), 
            ('logreg', clf)  
        ])


        pipeline.fit(full_diffs_train_sepconf, full_y_df_train.values.ravel())
        pairwise_probs_2025 = pipeline.predict_proba(full_diffs_test_sepconf)

    elif feature_set_to_include == 'barebones_diffs_sepconf':

        for param in params_to_categorize:
            barebones_diffs_test_sepconf[param] = barebones_diffs_test_sepconf[param].astype('category')

        scale_cols = [col for col in barebones_diffs_test_sepconf.columns if col not in params_to_categorize]

        preprocessor = ColumnTransformer([
            ('num', opt_scaler, scale_cols), 
            ('cat', OneHotEncoder(handle_unknown='ignore'), params_to_categorize)
        ], remainder='passthrough')  
            
        pipeline = Pipeline([
            ('preprocessor', preprocessor), 
            ('logreg', clf)  
        ])

        pipeline.fit(barebones_diffs_train_sepconf, full_y_df_train.values.ravel())
        pairwise_probs_2025 = pipeline.predict_proba(barebones_diffs_test_sepconf)

    elif feature_set_to_include == 'full_diffs':

        for param in params_to_categorize:
            full_diffs_df_test[param] = full_diffs_df_test[param].astype('category')

        scale_cols = [col for col in full_diffs_df_test.columns if col not in params_to_categorize]

        preprocessor = ColumnTransformer([
            ('num', opt_scaler, scale_cols), 
            ('cat', OneHotEncoder(handle_unknown='ignore'), params_to_categorize)
        ], remainder='passthrough')  
            
        pipeline = Pipeline([
            ('preprocessor', preprocessor), 
            ('logreg', clf)  
        ])
        
        pipeline.fit(full_diffs_df_train, full_y_df_train.values.ravel())
        pairwise_probs_2025 = pipeline.predict_proba(full_diffs_df_test)

    pairwise_winprob_2025_df = probs_to_df(savename = f'{savename_stem}_{m_or_w}_pairwise_probs_2025.csv', pairwise_probs = pairwise_probs_2025, reg_szn_data = reg_szn_diffs_2025_data)

    bracket_res_df = prob_df_to_bracket_results(savename = f'{savename_stem}_{m_or_w}_bracket_res_df.csv',
                                                pairwise_winprob_df = pairwise_winprob_2025_df,
                                                tourney_seeds = tourney_seeds,
                                                tourney_slots = tourney_slots)
    
    return pairwise_winprob_2025_df, bracket_res_df


def grid_search(conf_game_k_vals,
                ooc_game_k_vals,
                fixed_game_k_vals,
                run_name_stem,
                this_include_conf_res = True,
                this_include_secondary_res = False,
                these_scalers_to_include = [StandardScaler()],
                these_feature_sets_to_include = ['full_sepconf_diffs'],
                these_mods_to_include = ['logreg'],
                this_reg_szn_data_type = 'DIFFS'):
    
    '''
    if include_conf_res, we will NOT adjust ELO from conference tournament results
    '''

    # grid searching for optimal params

    dict_combos = list(itertools.product(conf_game_k_vals, ooc_game_k_vals, fixed_game_k_vals))
    keys = ['conf', 'ooc', 'fixed']
    list_of_k_schemes = [dict(zip(keys, dict_combo)) for dict_combo in dict_combos] 

    for dict_num, k_dict in enumerate(list_of_k_schemes):
        k_dict['name'] = f'k_{run_name_stem}_{dict_num}'

    k_scheme_df = pd.DataFrame(list_of_k_schemes)

    k_scheme_dir_path = os.path.join(path_to_data, 'custom_datasets', 'k_schemes')

    if not os.path.exists(k_scheme_dir_path):
        os.makedirs(k_scheme_dir_path)
    k_scheme_df.to_csv(os.path.join(k_scheme_dir_path, f'gridsearch_{run_name_stem}.csv'))

    for dict_num, this_k_dict in enumerate(list_of_k_schemes):
    
        # full_workflow(this_k_dict, data_type)
        full_workflow(k_dict = this_k_dict, 
                    reg_szn_data_type = this_reg_szn_data_type, 
                    include_conf_res = this_include_conf_res, 
                    include_secondary_res = this_include_secondary_res,
                    feature_sets_to_include = these_feature_sets_to_include,
                    mods_to_include = these_mods_to_include,
                    scalers_to_include = these_scalers_to_include,
                    run_name = run_name_stem)
    
    print(f'finished dict {dict_num} / {len(list_of_k_schemes)}')



            


In [16]:
################## one submission will be full_sepconf for both men's and women's:

In [17]:
# ### FULL_SEPCONF (SUBMISSION 1), MEN'S

# # optimal params from men's grid search ...

# this_opt_k_dict = {'ooc':15,
#               'conf': 40}
# these_opt_clf_params = {'solver': 'liblinear',
#                         'penalty': 'l2',
#                         'max_iter': 100, 
#                         'C': 1,
#                         'random_state': 42}

# mens_opt_full_pairwise_winprob_2025_df, mens_opt_full_bracket_res_df = train_optimal_logreg_mod(savename_stem = 'Mopt_full_sepconf_standard',
#                                                                                                     opt_k_dict = this_opt_k_dict,
#                                                                                                     clf_param_dict = these_opt_clf_params,
#                                                                                                     feature_set_to_include = 'full_sepconf_diffs',
#                                                                                                     opt_scaler=StandardScaler(),
#                                                                                                     m_or_w = m_or_w)

In [18]:
#### FULL_SEPCONF (SUBMISSION 1), WOMEN'S

# optimal params from women's grid search ...

this_opt_k_dict = {'ooc':20,
                   'conf': 10}
these_opt_clf_params = {'solver': 'liblinear',
                        'penalty': 'l2',
                        'max_iter': 100, 
                        'C': 10,
                        'random_state': 42}

womens_opt_full_pairwise_winprob_2025_df, womens_opt_full_bracket_res_df = train_optimal_logreg_mod(savename_stem = 'Wopt_full_sepconf_diffs_minmax',
                                                                                            opt_k_dict = this_opt_k_dict,
                                                                                            clf_param_dict = these_opt_clf_params,
                                                                                            feature_set_to_include = 'full_sepconf_diffs',
                                                                                            opt_scaler=MinMaxScaler(),
                                                                                            m_or_w = m_or_w)

5140it [00:02, 2265.17it/s]
5138it [00:02, 2211.51it/s]
5184it [00:02, 2276.32it/s]
5252it [00:02, 2192.31it/s]
5214it [00:02, 2328.74it/s]
5210it [00:02, 2175.77it/s]
5209it [00:02, 2237.43it/s]
3556it [00:01, 2214.15it/s]
5060it [00:02, 2284.63it/s]
5414it [00:02, 2290.95it/s]
5114it [00:02, 2291.53it/s]
5209it [00:02, 2293.19it/s]
5240it [00:02, 2263.39it/s]
5374it [00:02, 2319.11it/s]
5444it [00:02, 2301.39it/s]
100%|██████████| 65341/65341 [00:24<00:00, 2715.95it/s]
67it [00:29,  2.24it/s]


In [19]:
################## the other submission will be barebones_sepconf for both men's and women's:

In [20]:
# #### BAREBONES_SEPCONF (SUBMISSION 2), MEN'S

# # optimal params from men's grid search ...

# this_opt_k_dict = {'ooc':20,
#                    'conf': 40}
# these_opt_clf_params = {'solver': 'liblinear',
#                         'penalty': 'l2',
#                         'max_iter': 100, 
#                         'C': 1,
#                         'random_state': 42}

# mens_opt_bb_pairwise_winprob_2025_df, mens_opt_bb_bracket_res_df = train_optimal_logreg_mod(savename_stem = 'Mopt_barebones_sepconf_diffs_standard',
#                                                                                             opt_k_dict = this_opt_k_dict,
#                                                                                             clf_param_dict = these_opt_clf_params,
#                                                                                             feature_set_to_include = 'barebones_diffs_sepconf',
#                                                                                             opt_scaler=StandardScaler(),
#                                                                                             m_or_w = m_or_w)

In [21]:
#### BAREBONES_SEPCONF (SUBMISSION 2), WOMEN'S

# optimal params from women's grid search ...

this_opt_k_dict = {'ooc':15,
                   'conf': 10}
these_opt_clf_params = {'solver': 'liblinear',
                        'penalty': 'l2',
                        'max_iter': 100, 
                        'C': 1,
                        'random_state': 42}

womens_opt_bb_pairwise_winprob_2025_df, womens_opt_bb_bracket_res_df = train_optimal_logreg_mod(savename_stem = 'Wopt_barebones_sepconf_diffs_minmax',
                                                                                            opt_k_dict = this_opt_k_dict,
                                                                                            clf_param_dict = these_opt_clf_params,
                                                                                            feature_set_to_include = 'barebones_diffs_sepconf',
                                                                                            opt_scaler=MinMaxScaler(),
                                                                                            m_or_w = m_or_w)


5140it [00:02, 2299.02it/s]
5138it [00:02, 2316.85it/s]
5184it [00:02, 2288.79it/s]
5252it [00:02, 2306.27it/s]
5214it [00:02, 2301.00it/s]
5210it [00:02, 2307.46it/s]
5209it [00:02, 2312.33it/s]
3556it [00:01, 2274.15it/s]
5060it [00:02, 2106.32it/s]
5414it [00:02, 2270.21it/s]
5114it [00:02, 2265.58it/s]
5209it [00:02, 2212.94it/s]
5240it [00:02, 2274.70it/s]
5374it [00:02, 2288.63it/s]
5444it [00:02, 2241.47it/s]
100%|██████████| 65341/65341 [00:25<00:00, 2590.61it/s]
67it [00:32,  2.08it/s]


In [22]:
def make_submission_format(pairwise_winprob_df):

    # growing_winprob_dict will be a dict of dicts
    growing_winprob_dict = {}
    sorted_team1 = pairwise_winprob_df.sort_values(by=['team1', 'team2'], ascending=[True, True])
    for rownum, rowvals in tqdm(sorted_team1.iterrows()):

        team1 = int(rowvals['team1'])
        team2 = int(rowvals['team2'])

        smaller_team = min([team1, team2])
        larger_team = max([team1, team2])

        # "You must predict the probability that the team with the lower TeamId beats the team with the higher TeamId"
        if smaller_team == team1:
            win_prob = rowvals['winprob1']
        elif smaller_team == team2:
            win_prob = rowvals['winprob2']
        
        if smaller_team in growing_winprob_dict.keys():
            growing_winprob_dict[smaller_team][larger_team] = win_prob
        elif smaller_team not in growing_winprob_dict.keys():
            growing_winprob_dict[smaller_team] = {larger_team:win_prob}

    # convert dict of dicts into a dataframe
    rows = [(smaller_team_id, larger_team_id, smaller_team_winprob) 
            for smaller_team_id, inner_dict in growing_winprob_dict.items() 
            for larger_team_id, smaller_team_winprob in inner_dict.items()]

    unsorted_df = pd.DataFrame(rows, columns = ['smaller_id', 'larger_id', 'Pred'])
    sorted_df = unsorted_df.sort_values(['smaller_id', 'larger_id'])

    id_col_vals = sorted_df.apply(lambda row: f'2025_{int(row['smaller_id'])}_{int(row['larger_id'])}', axis = 1)

    sorted_df.drop(columns = ['smaller_id', 'larger_id'], inplace=True)
    sorted_df.insert(loc = 0, column = 'ID', value = id_col_vals)

    return sorted_df
        

In [23]:
# prepare submission file:
# assumes both the men's and women's data have been generated already:

FULL_mens_opt_pairwise_winprob_2025_df = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'Mopt_full_sepconf_standard_MEN_pairwise_probs_2025.csv'), index_col=False)
FULL_womens_opt_pairwise_winprob_2025_df = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'Wopt_full_sepconf_diffs_minmax_WOMEN_pairwise_probs_2025.csv'), index_col=False)

BAREBONES_mens_opt_pairwise_winprob_2025_df = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'Mopt_barebones_sepconf_diffs_standard_MEN_pairwise_probs_2025.csv'), index_col=False)
BAREBONES_womens_opt_pairwise_winprob_2025_df = pd.read_csv(os.path.join(path_to_data, 'custom_datasets', 'Wopt_barebones_sepconf_diffs_minmax_WOMEN_pairwise_probs_2025.csv'), index_col=False)


mens_full_optim = make_submission_format(FULL_mens_opt_pairwise_winprob_2025_df)
womens_full_optim = make_submission_format(FULL_womens_opt_pairwise_winprob_2025_df)

mens_barebones_optim = make_submission_format(BAREBONES_mens_opt_pairwise_winprob_2025_df)
womens_barebones_optim = make_submission_format(BAREBONES_womens_opt_pairwise_winprob_2025_df)


66066it [00:01, 39377.53it/s]
65341it [00:01, 40042.09it/s]
66066it [00:01, 41531.35it/s]
65341it [00:01, 40004.73it/s]


In [46]:
# create the submission csvs

full_submission1_df = pd.concat([mens_full_optim, womens_full_optim], axis = 0).reset_index(drop = True)
full_submission1_df.to_csv(os.path.join(path_to_data, 'custom_datasets', 'full_submission1.csv'), index = False, header=True)

bb_submission2_df = pd.concat([mens_barebones_optim, womens_barebones_optim], axis = 0).reset_index(drop = True)
bb_submission2_df.to_csv(os.path.join(path_to_data, 'custom_datasets', 'bb_submission2.csv'), index = False, header=True)

In [53]:
# verfiy that the IDs are the same between the example stage2 submission csv and the csvs generated above
sample_submission = pd.read_csv(os.path.join(path_to_data, 'SampleSubmissionStage2.csv'))

full_num_differences = (full_submission1_df['ID'] != sample_submission['ID']).sum()
print(full_num_differences)

bb_num_differences = (bb_submission2_df['ID'] != sample_submission['ID']).sum()
print(bb_num_differences)

print(full_submission1_df.shape)
print(bb_submission2_df.shape)
print(sample_submission.shape)

0
0
(131407, 2)
(131407, 2)
(131407, 2)


In [None]:
# should be the end

In [None]:
# fun functions that aren't used in the final analyses but helped with EDA

def find_slot_path(seed, tourney_slots):
    # championship slot ends in CH
    championship_slot = tourney_slots['Slot'][tourney_slots['Slot'].str.endswith('CH')].values[0]
    next_slot = tourney_slots['Slot'].loc[(tourney_slots['StrongSeed'] == seed) | (tourney_slots['WeakSeed'] == seed)].values[0]
    poss_slots = [next_slot]
    while next_slot != championship_slot:
        next_slot = tourney_slots['Slot'].loc[(tourney_slots['StrongSeed'] == next_slot) | (tourney_slots['WeakSeed'] == next_slot)].values[0]
        poss_slots.append(next_slot)
    return poss_slots

def games_before_meeting(seed1, seed2, tourney_slots):
    slot_path1 = find_slot_path(seed1, tourney_slots)
    slot_path2 = find_slot_path(seed2, tourney_slots)

    for ind1, slot1 in enumerate(slot_path1):
        for ind2, slot2 in enumerate(slot_path2):
            if slot1 == slot2:
                return ind1, ind2
            
def create_tournament_matchup_dict(season):
    '''
    Was used for intermediate analyses; not included in final workflow
    '''
    season_seeds = tourney_seeds.loc[tourney_seeds['Season'] == season]
    seed_team_dict = dict(zip(season_seeds['Seed'], season_seeds['TeamID']))

    season_tourney_slots = tourney_slots.loc[tourney_slots['Season'] == season]
    season_tourney_results = raw_mm_tourn.loc[raw_mm_tourn['Season'] == season]

    round_matchup_dict = {}
    for rownum, rowvals in season_tourney_slots.iterrows():
        game_slot = rowvals['Slot']
        strong_seed, weak_seed = season_tourney_slots[['StrongSeed', 'WeakSeed']].loc[season_tourney_slots['Slot'] == game_slot].values.tolist()[0]

        try:
            team1 = seed_team_dict[strong_seed]
            team2 = seed_team_dict[weak_seed]

        except KeyError as e: 

            # account for the fact that there might be play-in games ... 
            # assign the non-a/b listed seed as the winner of the a vs b game

            if strong_seed not in seed_team_dict.keys():
                problematic_seed = strong_seed
            elif weak_seed not in seed_team_dict.keys():
                problematic_seed = weak_seed
        
            # strip away the a and b, get both combos, search for winner of that game ... 
            play_in_team_a = seed_team_dict[f'{problematic_seed}a']
            play_in_team_b = seed_team_dict[f'{problematic_seed}b']

            play_in_winner = int(season_tourney_results.loc[season_tourney_results.apply(lambda row: frozenset([row['WTeamID'], row['LTeamID']]) == frozenset([play_in_team_a, play_in_team_b]), axis = 1), 'WTeamID'].values[0])

            # add this play_in_winner as the official seed in the dict
            seed_team_dict[problematic_seed] = play_in_winner

            team1 = seed_team_dict[strong_seed]
            team2 = seed_team_dict[weak_seed]

        winner = int(season_tourney_results.loc[season_tourney_results.apply(lambda row: frozenset([row['WTeamID'], row['LTeamID']]) == frozenset([team1, team2]), axis = 1), 'WTeamID'].values[0])

        # add the winner as a new entry with the slot name as key
        seed_team_dict[game_slot] = winner

        round_num = int(seed_slots['GameRound'].loc[seed_slots['GameSlot'] == game_slot].values[0])
        if round_num in round_matchup_dict.keys():
            round_matchup_dict[round_num].append([team1, team2, winner])
        else:
            round_matchup_dict[round_num] = [[team1, team2, winner]]

    return round_matchup_dict



def find_brier_score(tourney_results, pairwise_matchup_probs):
    '''
    get the brier score for a given set of matchup probs and tournament results
    '''
    running_error = 0
    running_matchup_counter = 0

    for round_num,matchup_list in tourney_results.items():
        for matchup in matchup_list:
            winner = matchup[2]
            teams = set(matchup)
            loser = list(teams.difference(set([winner])))[0]
            outcome_prob = pairwise_matchup_probs[winner][loser]

            squared_loss = (1-outcome_prob)**2
            running_error += squared_loss
            running_matchup_counter += 1

    return running_error / running_matchup_counter


def prelim_analysis(year, k_dict, home_court_advantage = 0):


    team_elo_dict, final_elo_df = make_final_elo_df(year = year, score_result_df=raw_reg_szn, home_team_adjustment = home_court_advantage, 
                                                    team_elo_dict=None, k_scheme=k_dict, fix_k = False)

    postcon_team_elo_dict, postcon_final_elo_df = make_final_elo_df(year = year, score_result_df=raw_conf_tourn, home_team_adjustment = 0, 
                                                                    team_elo_dict=team_elo_dict, k_scheme=k_dict, fix_k= True)

    final_postcon_team_elo_dict = {team:elo[-1] for team, elo in postcon_team_elo_dict.items()}

    round_matchup_dict_2024 = create_tournament_matchup_dict(2024)

    pairwise_winprob_dict = {}

    for team1, elo1 in final_postcon_team_elo_dict.items():
        pairwise_winprob_dict[team1] = {}
        for team2, elo2 in final_postcon_team_elo_dict.items():
            # no home court boost in march madness
            pairwise_winprob_dict[team1][team2] = get_elo_win_prob(elo1, elo2, home_court_advantage_boost_1 = 0, home_court_advantage_boost_2 = 0)




    brier_score = find_brier_score(tourney_results = round_matchup_dict_2024, 
                                pairwise_matchup_probs = pairwise_winprob_dict)

    return(brier_score)




In [None]:
# example women's grid search ... 

# grid_search(conf_game_k_vals = [20, 30, 40, 50],
#             ooc_game_k_vals = [10, 20, 30, 40],
#             fixed_game_k_vals = [1],
#             this_include_conf_res = True,
#             this_include_secondary_res = False,
#             these_scalers_to_include = [StandardScaler()],
#             these_feature_sets_to_include = ['full_sepconf_diffs', 'barebones_sepconf_diffs', 'full_diffs'],
#             these_mods_to_include=['logreg', 'xgb'],
#             run_name_stem = 'womens_long_gridsearch',
#             )

# # DO update elo in conference tourney games
# # DONT include secondary tourney data to train

# grid_search(conf_game_k_vals = [10, 20, 30, 40],
#             ooc_game_k_vals = [10, 20],
#             fixed_game_k_vals = [10, 20, 30],
#             this_include_conf_res = False,
#             this_include_secondary_res = False,
#             these_scalers_to_include = [StandardScaler()],
#             these_feature_sets_to_include = ['full_sepconf_diffs', 'barebones_sepconf_diffs'],
#             these_mods_to_include=['logreg'],
#             run_name_stem = 'womens_long_gridsearch_long_UPDATECONF_NOSEC',
#             )

# # low K values all around
# # DONT update elo in conference tourney games
# # DONT include secondary tourney data to train

# grid_search(conf_game_k_vals = [5, 10, 15, 20, 25],
#             ooc_game_k_vals = [5, 10, 15, 20],
#             fixed_game_k_vals = [1],
#             this_include_conf_res = True,
#             this_include_secondary_res = False,
#             these_scalers_to_include = [StandardScaler()],
#             these_feature_sets_to_include = ['full_sepconf_diffs', 'barebones_sepconf_diffs'],
#             these_mods_to_include=['logreg'],
#             run_name_stem = 'womens_lowKs_DONTUPDATECONF_NOSEC',
#             )


# # low K values all around
# # minmax scale ... 
# # DONT update elo in conference tourney games
# # DONT include secondary tourney data to train

# grid_search(conf_game_k_vals = [5, 10, 15, 20, 25],
#             ooc_game_k_vals = [5, 10, 15, 20],
#             fixed_game_k_vals = [1],
#             this_include_conf_res = True,
#             this_include_secondary_res = False,
#             these_scalers_to_include = [MinMaxScaler()],
#             these_feature_sets_to_include = ['full_sepconf_diffs', 'barebones_sepconf_diffs'],
#             these_mods_to_include=['logreg'],
#             run_name_stem = 'womens_lowKs_DONTUPDATECONF_NOSEC_minmax',
#             )

# # HIGH CONF
# # DO update elo in conference tourney games
# # DONT include secondary tourney data to train

# grid_search(conf_game_k_vals = [60, 70, 80, 90],
#             ooc_game_k_vals = [10, 20, 60],
#             fixed_game_k_vals = [30, 40, 50],
#             this_include_conf_res = False,
#             this_include_secondary_res = False,
#             these_scalers_to_include = [StandardScaler()],
#             these_feature_sets_to_include = ['full_sepconf_diffs'],
#             these_mods_to_include=['logreg'],
#             run_name_stem = 'womens_long_gridsearch_long_UPDATECONF_NOSEC_HIGHCONF',
#             )

# # HIGH CONF
# # DONT update elo in conference tourney games
# # DONT include secondary tourney data to train

# grid_search(conf_game_k_vals = [60, 70, 80, 90, 100],
#             ooc_game_k_vals = [10, 20],
#             fixed_game_k_vals = [1],
#             this_include_conf_res = True,
#             this_include_secondary_res = False,
#             these_scalers_to_include = [StandardScaler()],
#             these_feature_sets_to_include = ['full_sepconf_diffs'],
#             these_mods_to_include=['logreg'],
#             run_name_stem = 'womens_long_gridsearch_long_DONTUPDATECONF_NOSEC_HIGHCONF',
#             )

# # HIGH CONF but with barebones
# # DO update elo in conference tourney games
# # DONT include secondary tourney data to train

# grid_search(conf_game_k_vals = [60, 70, 80, 90],
#             ooc_game_k_vals = [10, 20, 60],
#             fixed_game_k_vals = [30, 40, 50],
#             this_include_conf_res = False,
#             this_include_secondary_res = False,
#             these_scalers_to_include = [StandardScaler()],
#             these_feature_sets_to_include = ['barebones_sepconf_nondiffs', 'barebones_sepconf_diffs'],
#             these_mods_to_include=['logreg'],
#             run_name_stem = 'womens_long_gridsearch_long_UPDATECONF_NOSEC_HIGHCONF_BAREBONES',
#             )




# # HIGH CONF but with barebones
# # DONT update elo in conference tourney games
# # DONT include secondary tourney data to train

# grid_search(conf_game_k_vals = [60, 70, 80, 90],
#             ooc_game_k_vals = [10, 20, 60],
#             fixed_game_k_vals = [1],
#             this_include_conf_res = True,
#             this_include_secondary_res = False,
#             these_scalers_to_include = [StandardScaler()],
#             these_feature_sets_to_include = ['barebones_sepconf_nondiffs', 'barebones_sepconf_diffs'],
#             these_mods_to_include=['logreg'],
#             run_name_stem = 'womens_long_gridsearch_long_DONTUPDATECONF_NOSEC_HIGHCONF_BAREBONES',
#             )


# # HIGH CONF but with barebones
# # DONT update elo in conference tourney games
# # DO include secondary tourney data to train

# grid_search(conf_game_k_vals = [60, 70, 80, 90],
#             ooc_game_k_vals = [10, 20, 60],
#             fixed_game_k_vals = [1],
#             this_include_conf_res = True,
#             this_include_secondary_res = True,
#             these_scalers_to_include = [StandardScaler()],
#             these_feature_sets_to_include = ['barebones_sepconf_nondiffs', 'barebones_sepconf_diffs'],
#             these_mods_to_include=['logreg'],
#             run_name_stem = 'womens_long_gridsearch_long_DONTUPDATECONF_YESSEC_HIGHCONF_BAREBONES',
#             )

# # DO update elo in conference tourney games
# # DO include secondary tourney data to train

# grid_search(conf_game_k_vals = [10, 20, 30, 40],
#             ooc_game_k_vals = [10, 20],
#             fixed_game_k_vals = [10, 20, 30],
#             this_include_conf_res = False,
#             this_include_secondary_res = True,
#             these_scalers_to_include = [StandardScaler()],
#             these_feature_sets_to_include = ['full_sepconf_diffs'],
#             these_mods_to_include=['logreg'],
#             run_name_stem = 'womens_long_gridsearch_long_UPDATECONF_INCSEC',
#             )

# # DONT update elo in conference tourney games
# # DO include secondary tourney data to train

# grid_search(conf_game_k_vals = [10, 20, 30, 40],
#             ooc_game_k_vals = [10, 20],
#             fixed_game_k_vals = [10, 20, 30],
#             this_include_conf_res = True,
#             this_include_secondary_res = True,
#             these_scalers_to_include = [StandardScaler()],
#             these_feature_sets_to_include = ['full_sepconf_diffs'],
#             these_mods_to_include=['logreg'],
#             run_name_stem = 'womens_long_gridsearch_long_DONTUPDATECONF_INCSEC',
#             )

In [None]:
# # men's (among others)
# # validate men's params
# grid_search(conf_game_k_vals = [40],
#             ooc_game_k_vals = [15],
#             fixed_game_k_vals = [1],
#             this_include_conf_res = True,
#             this_include_secondary_res = False,
#             these_scalers_to_include = [StandardScaler()],
#             these_feature_sets_to_include = ['full_sepconf_diffs'],
#             these_mods_to_include=['logreg'],
#             run_name_stem = 'mens_validate_final_params',
#             )