In [1]:
import pandas as pd
import sys
import os
from sklearn import preprocessing

In [2]:
sys.path.append('{}/mmml'.format(os.path.dirname(os.getcwd())))
from mmml.config import data_folder
from mmml.game_results import *
from mmml.utils import *

In [3]:
x_features_dev = pd.read_pickle("{}/Data/Processed/x_features_dev.pkl".format(os.path.dirname(os.getcwd())))
x_features_oot = pd.read_pickle("{}/Data/Processed/x_features_oot.pkl".format(os.path.dirname(os.getcwd())))

In [4]:
x_features_dev.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ast,Blk,DR,FGA,FGA3,FGM,FGM3,FTA,FTM,NLoc,...,wins,possessions,o_eff,d_eff,net_eff,elo,last_elo,MOR,POM,SAG
TeamID,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1171,2012,278,70,667,1457,447,571,148,526,373,2,...,4,1839.825,90.389032,100.661748,-10.272716,"[1500.0, 1491.8493246116866, 1480.0, 1488.4805...",1351.987274,286.0,294.0,300.0
1402,2006,287,93,515,1311,406,542,125,422,289,0,...,2,1630.65,91.865207,110.385429,-18.520222,"[1500.0, 1491.8493246116866, 1483.36879746356,...",1351.248141,316.0,324.0,327.0
1328,2013,384,87,770,1789,472,780,154,646,491,5,...,20,2090.2375,105.490405,98.170662,7.319742,"[1500.0, 1508.1506753883134, 1520.102609012492...",1560.789937,49.0,51.0,43.0


### `fnScaleFeatures` Work
- Apply Min-Max Scaler to Massey Rank columns so we can avg them

In [5]:
feature_list = pd.read_csv('{}/mmml/mmml/feature_list2.csv'.format(os.path.dirname(os.getcwd())))
columns_key = getFeatureDict(feature_list)
print(columns_key)

{'target': ['HScore_diff'], 'features': ['possessions_diff', 'o_eff_diff', 'd_eff_diff', 'net_eff_diff', 'last_elo_diff', 'Avg_Rank_diff'], 'ids': ['HTeamID', 'ATeamID', 'Season', 'GameRound', 'Seed_H', 'Seed_A', 'GameSlot'], 'diff_cols': ['wins_diff', 'possessions_diff', 'o_eff_diff', 'd_eff_diff', 'net_eff_diff', 'last_elo_diff', 'MOR_diff', 'POM_diff', 'SAG_diff', 'Avg_Rank_diff'], 'scale_cols': ['MOR_H', 'POM_H', 'SAG_H', 'MOR_A', 'POM_A', 'SAG_A']}


In [6]:
# Create list of Cols to Scale
scale_cols = columns_key['scale_cols']
scale_cols = list(set([x[:-2] for x in scale_cols])) # Remove _H / _A
print(scale_cols)

['MOR', 'SAG', 'POM']


In [7]:
## Fit Scaler on Training Set
min_max_scaler = preprocessing.MinMaxScaler()
fitted_scaler = min_max_scaler.fit(pd.DataFrame(x_features_dev[scale_cols]))

In [8]:
## Apply Scaler to Test Set
scaled_df = pd.DataFrame(fitted_scaler.transform(x_features_oot[scale_cols]),
    columns=[x+"_scaled" for x in scale_cols], index=x_features_oot.index)

In [9]:
# Calculate Average rank of Scaled Columns
avg_rank = pd.DataFrame(scaled_df[[x+"_scaled" for x in scale_cols]].mean(axis=1), columns=['Avg_Rank'])
avg_rank.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Avg_Rank
TeamID,Season,Unnamed: 2_level_1
1427,2019,0.522857
1287,2019,0.728571
1354,2019,0.958095


In [10]:
## Append Avg Rank back on to X-Features
scaled_x_features = x_features_oot.merge(avg_rank, left_index=True, right_index=True)
scaled_x_features.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ast,Blk,DR,FGA,FGA3,FGM,FGM3,FTA,FTM,NLoc,...,possessions,o_eff,d_eff,net_eff,elo,last_elo,MOR,POM,SAG,Avg_Rank
TeamID,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1427,2019,362,70,756,1801,752,762,258,472,348,5,...,2070.025,102.897308,102.027753,0.869555,"[1500.0, 1488.4820238179843, 1480.415603408158...",1511.939254,248.0,151.0,153.0,0.522857
1287,2019,400,131,753,1844,714,772,240,625,442,4,...,2202.9,101.048618,109.446639,-8.398021,"[1500.0, 1491.8493246116866, 1484.147568908605...",1452.968527,262.0,255.0,251.0,0.728571
1354,2019,359,53,699,1776,552,748,179,705,524,2,...,2230.925,98.568979,110.85088,-12.281901,"[1500.0, 1492.0748768264345, 1483.911603995309...",1368.327533,330.0,338.0,341.0,0.958095
