In [2]:
# package import
import pandas as pd
import numpy as np
from copy import deepcopy
from datetime import datetime

from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error as MSE

from umap import UMAP

from xgboost import XGBClassifier, XGBRFClassifier, XGBRegressor, XGBRanker

In [2]:
# functions to retrieve data
def rolling_mean_n_performance(df, window=5, performance_col='goals_scored'):
    dg = df.sort_values(by=['leg'])[['season', 'team', performance_col]].groupby(
        by=['season', 'team'])[performance_col].rolling(window=window, min_periods=1).mean().reset_index()
    
    new_col_name = f'rolling_{window}_games_avg_{performance_col}'

    df[new_col_name] = dg.set_index('level_2')[performance_col]
    return df

def get_past_feature(df, feat_col, team=True):
    merge_col = 'team' if team else 'opponent'    
    tmp_df = deepcopy(df[['season', 'leg', merge_col, feat_col]])
    tmp_df.loc[:,'next_leg'] = tmp_df['leg'] +1

    tmp_df.rename(columns={'leg': 'previous_leg', 
                           'next_leg':'leg', 
                           feat_col:f'previous_{merge_col}_{feat_col}'},
                  inplace=True)

    df = df.merge(tmp_df, how='left', on=['leg', 'season', merge_col])
    df.drop(columns=['previous_leg'], inplace=True)
    # print(f"length df : {len(df)}")
    return df

def prepare_data(csv_path, rolling=5):
    df = pd.read_csv(csv_path).drop(columns='Unnamed: 0')
    df['goal_diff'] = df['goals_scored'] - df['goals_conceded']
    # cumulative
    df['cum_pts'] = df[['season', 'team', 'nb_points']].groupby(
    by=['season', 'team']).cumsum()
    
    df['cum_goal_diff'] = df[['season', 'team', 'goal_diff']].groupby(
    by=['season', 'team']).cumsum()
    
    df['cum_goals_scored'] = df[['season', 'team', 'goals_scored']].groupby(
    by=['season', 'team']).cumsum()
    
    df['cum_goals_conceded'] = df['cum_goals_scored']-df['cum_goal_diff']
    df['rank'] = df[['season', 'leg', 'cum_pts', 'cum_goal_diff', 'cum_goals_scored']].sort_values(
        by=['cum_pts', 'cum_goal_diff', 'cum_goals_scored'], ascending=False).groupby(
        by=['season', 'leg']).cumcount() + 1
    
    df['avg_goals_scored_since_season_start'] = df['cum_goals_scored'].div(df['leg'])
    df['avg_goals_conceded_since_season_start'] = df['cum_goals_conceded'].div(df['leg'])
    df['avg_cum_pts_since_season_start'] = df['cum_pts'].div(df['leg'])
    
    # removed unwanted useless seasons
    data = deepcopy(df[df.season > '2003-2004'])
    data.reset_index(drop=True, inplace=True)
    
    leg_max = data.leg.max()
    
    end_season = data[data.leg==leg_max].rename(columns={'rank':'final_rank', 'cum_pts': 'final_cum_pts'})
    data = data.merge(end_season[['season', 'team', 'final_rank', 'final_cum_pts']], on=['season', 'team'])
    
    # rolling mean
    cols = ['goals_conceded', 'goals_scored', 'nb_points']
    for c in cols:
        data = rolling_mean_n_performance(df=data, window=rolling, performance_col=c)
    # past features
    past_features = {'rank': [True, False], 
                     'rolling_5_games_avg_goals_scored' : [True],
                     'rolling_5_games_avg_goals_conceded': [False],
                     'avg_goals_scored_since_season_start': [True],
                     'avg_goals_conceded_since_season_start': [False],
                     'goals_scored': [True],
                     'goals_conceded': [False],
                     'rolling_5_games_avg_nb_points': [True, False],
                     'nb_points': [True, False]
                    }
    # print(f'length {len(data)}')
    for col, is_team_ll in past_features.items():
        for is_team in is_team_ll:
            # print(f"is_team ={is_team}, col = {col}")
            data = get_past_feature(df=data, feat_col=col, team=is_team)
    
    return data


def get_lr_parameters(data: pd.DataFrame):
    tmp_x_data = data.leg.values
    tmp_y_data = data.cum_pts.values
    
    x_data = np.array([0] + list(tmp_x_data)).reshape(-1, 1)
    y_data = np.array([0] + list(tmp_y_data)).reshape(-1, 1)
    
    reg = LinearRegression(fit_intercept=False).fit(X=x_data, y=y_data)
    
    return reg.coef_[0][0] , reg.score(X=x_data, y=y_data)

def get_season_team_data(break_leg: int, data: pd.DataFrame):
    """
    break_leg : split the season in two parts at this leg
    data: contains the season pts evolution wrt to legs for 1 team and 1 season
    
    returns: the Linear Regression parameters for both parts
    """
    data = data.reset_index(drop=True)
    
    breaking_cum_goal_diff = data.loc[break_leg-1, 'cum_goal_diff']
    breaking_cum_goal_scored = data.loc[break_leg-1, 'cum_goals_scored']
    roll_trend = data.loc[break_leg-1, 'rolling_5_games_avg_nb_points']
    trend = data.loc[break_leg-1,'avg_cum_pts_since_season_start']
    
    train_data = deepcopy(data[data.leg <= break_leg])
    eval_data = deepcopy(data[data.leg > break_leg])
    
    pts_at_break = data.loc[break_leg-1, 'cum_pts']
    eval_data.cum_pts -= pts_at_break
    eval_data.leg -= break_leg
    
    coef_feat, r_score_feat = get_lr_parameters(data=train_data)
    coef_predict, r_score_predict = get_lr_parameters(data=eval_data) if len(eval_data) > 0 else (-10, -10)
    
    final_nb_pts = data.loc[len(data)-1, 'cum_pts']
    
    nb_games_at_home = len(eval_data[eval_data.play=='Home'])
    
    return [nb_games_at_home, coef_feat, coef_predict, pts_at_break, final_nb_pts, 
            r_score_feat, r_score_predict, breaking_cum_goal_diff, breaking_cum_goal_scored,
            roll_trend, trend]

def build_data(historical_data: pd.DataFrame, break_leg: int):
    season_team_all = historical_data[['season', 'team']].values
    data_for_model = []
    ids_data = []
    # season_value = {s : i for i, s in enumerate(np.sort(df.season.unique()), start=1)}
    for season_team in np.unique(['###'.join(ll) for ll in season_team_all]):
        season, team = season_team.split("###")
        
        evol_feat = get_season_team_data(
            break_leg=break_leg, 
            data=deepcopy(historical_data[(historical_data.season==season) & (historical_data.team==team)]
                         )
        )
        data_for_model.append(evol_feat) # [season_value[season]]+evol_feat)
        ids_data.append([season, team])
    
    data_df = pd.DataFrame(columns=['nb_games_to_play_at_home',
                                    'lr_feat_coeff', 'lr_predict_coeff',
                                    'nb_pts_at_break', 'final_nb_pts', 
                                    'r_score_feat', 'r_score_predict', 
                                    'cumulative_goal_diff_at_break', 
                                    'cumulative_goal_scored_at_break', 
                                    'rolling_5_avg_pts_at_break', 
                                    'season_trend_at_break'
                                    ], 
                             data=np.array(data_for_model)
                            )
    
    ids_df = pd.DataFrame(columns=['season', 'team'], data=np.array(ids_data))

    return pd.concat([ids_df, data_df], axis=1)
    
    
    
def get_pivoted(data: pd.DataFrame, break_leg: int, value_col: str = 'cum_pts'):
    
    df = deepcopy(data[data.leg <= break_leg])
    df.rolling_5_games_avg_nb_points = [y if x!=x else x for x, y in 
           zip(df.rolling_5_games_avg_nb_points, df.avg_cum_pts_since_season_start)]
    
    df_pivot = df.pivot_table(index=['season', 'team'], 
                              columns='leg', 
                              values=[value_col]).reset_index()
    
    df_pivot.columns = [f'leg_{l}' if l!='' else n for n, l in df_pivot.columns]
    
    final = df[['season', 'team', 'final_rank', 'final_cum_pts']].drop_duplicates()
    df_last_leg = df[df.leg==break_leg][['season', 
                                         'team', 
                                         'rank', 
                                         'rolling_5_games_avg_nb_points', 
                                         'avg_cum_pts_since_season_start', 
                                         'cum_pts', 
                                         'cum_goal_diff', 
                                         'cum_goals_scored']].reset_index(drop=True)
    
    df_pivot = df_pivot.merge(df_last_leg, on=['season', 'team'])
    
    return df_pivot.merge(final, on=['season', 'team']).rename(columns={'final_cum_pts': 'final_nb_pts'})

In [3]:
championship_csv = {'ligue-1': 'ligue-1_data_2002_2019',
                   'ligue-2': 'ligue-2_data_2002_2019',
                   'serie-A': 'serie-a_data_2004_2019',
                   'bundesliga': 'bundesliga_data_2004_2019',
                   'premier-league': 'premier-league_data_2004_2019',
                   'liga':'liga_data_2004_2019'}

# The Rank Predicter


### Regression approach

In [4]:
def fit_general_model(
    data: pd.DataFrame, 
    feature_cols, 
    target_col, 
    test_frac = .2,
    model_type = 'lin_reg'):
    
    assert model_type in ['lin_reg', 'xgboost', 'random_forest']
    
    data.reset_index(drop=True, inplace=True)
    
    
    features = data[feature_cols].values
    target = data[target_col].values
    
    # split data
    x_train, x_test, y_train, y_test = train_test_split(features, 
                                                        target,
                                                        test_size=test_frac,
                                                        random_state=42)
    
    # get model
    if model_type=='random_forest':
        model = RandomForestRegressor().fit(X=x_train, y=y_train)
    elif model_type=='xgboost':
        model = XGBRegressor().fit(X=x_train, y=y_train)
    else:
        model = LinearRegression().fit(X=x_train, y=y_train)
        
    pred = model.predict(x_test) 

    # R2_score :
    r2_score_train = model.score(X=x_train, y=y_train)
    r2_score_test = model.score(X=x_test, y=y_test)

     # RMSE Computation 
    rmse = np.sqrt(MSE(y_test, pred))
    
    return model, {'r2_score_train': r2_score_train, 'r2_score_test': r2_score_test, 'rmse_test': rmse} 


def predict(scikit_model, feature: np.array):

    feature = feature.reshape(1, -1)
    prediction = scikit_model.predict(feature)
    
    return prediction[0]


def compute_expected_final_nb_points(lin_coeff, nb_pts_at_break, breaking_leg, final_leg, basic=True):
    # print(f'inputs are : lin_coeff={lin_coeff}, nb_pts_at_break={nb_pts_at_break}, breaking_leg={breaking_leg} and final_leg={final_leg}')
    # print(f"-- basic is {basic}")
    if basic:
      #  print(f'---- output is {lin_coeff*final_leg}')
        return lin_coeff*final_leg

    else:
        # print(f'---- output is {nb_pts_at_break + lin_coeff*(final_leg - breaking_leg)}')
        return nb_pts_at_break + lin_coeff*(final_leg - breaking_leg)

def basic(data_training, data_evaluation, feature_cols, breaking_leg=27, final_leg=38):
    cols = ['lr_feat_coeff', 'nb_pts_at_break']

    data_evaluation['predicted_final_nb_pts'] = data_evaluation[cols].apply(
        lambda r: compute_expected_final_nb_points(lin_coeff=r[0],
                                                   nb_pts_at_break=r[1],
                                                   breaking_leg=breaking_leg,
                                                   final_leg=final_leg, 
                                                   basic=True), 
        axis=1)

    return data_evaluation

def ranker(data_training, data_evaluation, feature_cols, model_type,
              breaking_leg=27, final_leg=38):

    # fit the model
    model, metadata = fit_general_model(
        data=data_training, 
        feature_cols=feature_cols, 
        target_col='lr_predict_coeff',
        model_type=model_type)
    # get the predicted number of points
    data_evaluation[f'predicted_{model_type}_predict_coeff'] = data_evaluation[feature_cols].apply(
        lambda x: predict(scikit_model=model, 
                          feature=np.array(x)),
        axis=1)

    cols = [f'predicted_{model_type}_predict_coeff', 'nb_pts_at_break']
        
    data_evaluation['predicted_final_nb_pts'] = data_evaluation[cols].apply(
        lambda r: compute_expected_final_nb_points(lin_coeff=r[0],
                                                   nb_pts_at_break=r[1],
                                                   breaking_leg=breaking_leg,
                                                   final_leg=final_leg, 
                                                   basic=False), 
        axis=1)
    
    rmse = np.sqrt(MSE(data_evaluation['final_nb_pts'].values,
                       data_evaluation[f'predicted_final_nb_pts'].values)
                  )
    metadata['rmse_eval'] = rmse
    
    return data_evaluation, metadata

def points_predicter(data_training, data_evaluation, feature_cols, target_col, model_type):

    model, metadata = fit_general_model(data=data_training, 
                                        feature_cols=feature_cols, 
                                        target_col=target_col,
                                        model_type=model_type)
    # get the predicted number of points
    data_evaluation[f'predicted_{target_col}'] = data_evaluation[feature_cols].apply(
        lambda x: predict(scikit_model=model, 
                          feature=np.array(x)),
        axis=1)

    # RMSE Computation 
    rmse = np.sqrt(MSE(data_evaluation[target_col].values,
                       data_evaluation[f'predicted_{target_col}'].values)
                  )
    metadata['rmse_eval'] = rmse
    return data_evaluation, metadata

def points_to_rank(season_data: pd.DataFrame, pts_col_name: str, rank_name: str):
    rank_df = season_data.sort_values(by=pts_col_name, ascending=False).reset_index(drop=True)
    rank_df[rank_name] = rank_df.index +1
    return rank_df


In [24]:
def get_regression_rankers(validation: pd.DataFrame, training: pd.DataFrame, breaking_leg):
    final_leg=training.leg.max()
    # data formating
    training_data_for_model = build_data(historical_data=training, break_leg=breaking_leg)
    validation_data_for_model = build_data(historical_data=validation, break_leg=breaking_leg)
    
    training_pivoted = get_pivoted(data=training, break_leg=breaking_leg)
    validation_pivoted = get_pivoted(data=validation, break_leg=breaking_leg)
    
    training_all_features = training_pivoted.merge(
        training_data_for_model[['season', 'team', 'nb_games_to_play_at_home', 'lr_feat_coeff']],
        on=['season', 'team'])
    validation_all_features = validation_pivoted.merge(
        validation_data_for_model[['season', 'team', 'nb_games_to_play_at_home', 'lr_feat_coeff']], 
        on=['season', 'team'])
    
    # features columns
    init_feat_cols= ['lr_feat_coeff', 'nb_pts_at_break']
    extra_cols = ['cumulative_goal_diff_at_break', 'rolling_5_avg_pts_at_break']
    feature_cols = init_feat_cols + extra_cols + ['nb_games_to_play_at_home']
    # feat_cols_basic = init_feat_cols + extra_cols
    feat_cols_pivoted = [c for c in training_pivoted.columns if c.startswith('leg')]
    feat_cols_pivoted += ['rolling_5_games_avg_nb_points', 'avg_cum_pts_since_season_start', 
                          'cum_goal_diff', 'cum_goals_scored']
    feat_cols_all = feat_cols_pivoted + ['nb_games_to_play_at_home', 'lr_feat_coeff']
    
    regression_metadata_records = []
    # basic
    naive = basic(data_training=deepcopy(training_data_for_model),
                   data_evaluation=deepcopy(validation_data_for_model),
                   feature_cols=feature_cols,
                   breaking_leg=breaking_leg,
                   final_leg=final_leg)
    # 1. Between nb_points at breaking leg + nb_games_to_play_at_home vs final nb_points
    lr_1, meta = points_predicter(data_training=deepcopy(training_data_for_model),
                                  data_evaluation=deepcopy(validation_data_for_model),
                                  feature_cols=['nb_games_to_play_at_home','nb_pts_at_break'],
                                  target_col='final_nb_pts',
                                  model_type='lin_reg') # final_cum_pts
    meta['model'] = 'lr_1'
    regression_metadata_records.append(meta)
    rf_1, meta = points_predicter(data_training=deepcopy(training_data_for_model),
                                  data_evaluation=deepcopy(validation_data_for_model),
                                  feature_cols=['nb_games_to_play_at_home','nb_pts_at_break'],
                                  target_col='final_nb_pts',
                                  model_type='random_forest')
    meta['model'] = 'rf_1'
    regression_metadata_records.append(meta)
    xgb_1, meta = points_predicter(data_training=deepcopy(training_data_for_model),
                                  data_evaluation=deepcopy(validation_data_for_model),
                                  feature_cols=['nb_games_to_play_at_home','nb_pts_at_break'],
                                  target_col='final_nb_pts',
                                  model_type='xgboost')
    meta['model'] = 'xgb_1'
    regression_metadata_records.append(meta)
    
    # 2. Linear Regression on lr_feat_coeff vs final nb_points
    lr_2, meta = points_predicter(data_training=deepcopy(training_data_for_model),
                                  data_evaluation=deepcopy(validation_data_for_model),
                                  feature_cols=['lr_feat_coeff'],
                                  target_col='final_nb_pts',
                                  model_type='lin_reg')
    meta['model'] = 'lr_2'
    regression_metadata_records.append(meta)
    
    # 3. Between nb_points at break leg + lr_feat_coeff vs final nb_points
    lr_3, meta = points_predicter(data_training=deepcopy(training_data_for_model),
                                  data_evaluation=deepcopy(validation_data_for_model),
                                  feature_cols=feature_cols,
                                  target_col='final_nb_pts',
                                  model_type='lin_reg')
    meta['model'] = 'lr_3'
    regression_metadata_records.append(meta)
    rf_2, meta = points_predicter(data_training=deepcopy(training_data_for_model),
                                  data_evaluation=deepcopy(validation_data_for_model),
                                  feature_cols=feature_cols,
                                  target_col='final_nb_pts',
                                  model_type='random_forest')
    meta['model'] = 'rf_2'
    regression_metadata_records.append(meta)
    xgb_2, meta = points_predicter(data_training=deepcopy(training_data_for_model),
                                  data_evaluation=deepcopy(validation_data_for_model),
                                  feature_cols=feature_cols,
                                  target_col='final_nb_pts',
                                  model_type='xgboost')
    meta['model'] = 'xgb_2'
    regression_metadata_records.append(meta)

    # 4. Linear Regression on pivoted data...
    lr_4, meta = points_predicter(data_training=deepcopy(training_pivoted),
                                  data_evaluation=deepcopy(validation_pivoted),
                                  feature_cols=feat_cols_pivoted,
                                  target_col='final_nb_pts',
                                  model_type='lin_reg')
    meta['model'] = 'lr_4'
    regression_metadata_records.append(meta)
    
    # 5. On pivoted data + lr_trends.
    rf_3, meta = points_predicter(data_training=deepcopy(training_all_features),
                                  data_evaluation=deepcopy(validation_all_features),
                                  feature_cols=feat_cols_all,
                                  target_col='final_nb_pts',
                                  model_type='random_forest')
    meta['model'] = 'rf_3'
    regression_metadata_records.append(meta)
    xgb_3, meta = points_predicter(data_training=deepcopy(training_all_features),
                                  data_evaluation=deepcopy(validation_all_features),
                                  feature_cols=feat_cols_all,
                                  target_col='final_nb_pts',
                                  model_type='xgboost')
    meta['model'] = 'xgb_3'
    regression_metadata_records.append(meta)
    lr_5, meta = points_predicter(data_training=deepcopy(training_all_features),
                                  data_evaluation=deepcopy(validation_all_features),
                                  feature_cols=feat_cols_all,
                                  target_col='final_nb_pts',
                                  model_type='lin_reg')
    meta['model'] = 'lr_5'
    regression_metadata_records.append(meta)
    
    # ranker directly
    rf_4, meta = ranker(data_training=deepcopy(training_data_for_model),
                        data_evaluation=deepcopy(validation_data_for_model),
                        feature_cols=feature_cols,
                        model_type='random_forest', 
                        breaking_leg=breaking_leg,
                        final_leg=final_leg)
    meta['model'] = 'rf_4'
    regression_metadata_records.append(meta)
    xgb_4, meta = ranker(data_training=deepcopy(training_data_for_model),
                         data_evaluation=deepcopy(validation_data_for_model),
                         feature_cols=feature_cols,
                         model_type='xgboost', 
                         breaking_leg=breaking_leg,
                         final_leg=final_leg)
    meta['model'] = 'xgb_4'
    regression_metadata_records.append(meta)
    lr_6, meta = ranker(data_training=deepcopy(training_data_for_model),
                        data_evaluation=deepcopy(validation_data_for_model),
                        feature_cols=feature_cols,
                        model_type='lin_reg',
                        breaking_leg=breaking_leg,
                        final_leg=final_leg)
    meta['model'] = 'lr_6'
    regression_metadata_records.append(meta)


    # random forest : Between nb_points at break leg + lr_feat_coeff vs final nb_points
    #rf_4 = deepcopy(overall_rf_3[overall_rf_3.season==val_season])

    regression_rankers = {
    'lr_1' : lr_1,
    'lr_2' : lr_2,
    'lr_3' : lr_3,
    'lr_4' : lr_4,
    'lr_5' : lr_5,
    'lr_6' : lr_6,
    'naive' : naive,
    'rf_1' : rf_1,
    'rf_2' : rf_2,
    'rf_3' : rf_3,
    'rf_4' : rf_4,
    'xgb_1' : xgb_1,
    'xgb_2' : xgb_2,
    'xgb_3' : xgb_3,
    'xgb_4' : xgb_4
    }
    
    vote_casting_regression = {idx : list(getattr(points_to_rank(season_data=data, 
                                                             pts_col_name= 'predicted_final_nb_pts', 
                                                             rank_name='predicted_rank'
                                                            ), 'team')) 
                               for idx, data in regression_rankers.items()
                              }
    print('---- Regression rankers DONE')
    return vote_casting_regression, regression_metadata_records


### Classification/Clustering/General Ranking Approach

In [6]:
def score_to_rank(season_df: pd.DataFrame, scores: np.array, col_name: str):
    tmp = pd.DataFrame(data=scores, columns=[col_name])

    output_df = pd.concat([season_df[['season', 'team', 'final_rank']], tmp], axis=1)

    output_df['predicted_rank'] = output_df[col_name].rank()
    return output_df

def compute_umap_score(position_weight : np.array, 
                   historical_umap_pos: np.array,  
                   team_season_umap_pos: np.array):
    """
    historical_umap_pos : shape (N,3) where N = nb_teams x nb_seasons
    team_season_map : shape (2,)
    """
    score = 0
    dists = []
    final_ranks = historical_umap_pos.transpose()[0]
    for r in historical_umap_pos:
        dist = np.linalg.norm(r[1:]-team_season_umap_pos)
        dists.append(dist)

    dg = pd.DataFrame(data={'dists': dists, 'final_rank': final_ranks}).sort_values(
        by='dists').reset_index(drop=True).reset_index()
    dg['index'] += 1
    dg['weigth'] = 1/dg['index']
    return dg.final_rank.dot(dg.weigth) # score

def get_umaped_data(data, feat_cols): # data=data_training[feat_cols]
    reducer = UMAP()
    embedding_umap = reducer.fit_transform(data[feat_cols].values)
    
    tmp_df = deepcopy(data[['season', 'team', 'final_rank']])
    embed_columns = ["x_col_umap_embed", "y_col_umap_embed"]
    embedding_df = pd.DataFrame(data=embedding_umap, columns=embed_columns)
    embedding_df = pd.concat([tmp_df, embedding_df], axis=1)
    return embedding_df, embed_columns

def get_umap_ranker(umaped_data_training, data_evaluation, embed_columns):
    nb_teams = data_evaluation.team.nunique()
    position_weights = np.array([r+np.exp(np.log(100)*r/nb_teams) for r in range(1, nb_teams + 1)])
    
    scores = [compute_umap_score(position_weight=position_weights, 
                    historical_umap_pos=umaped_data_training[['final_rank']+embed_columns].values,
                    team_season_umap_pos=team_pos
                   ) for team_pos in data_evaluation[embed_columns].values]
    
    return score_to_rank(season_df=data_evaluation, scores=scores, col_name='fuzzy_umap')

def get_xgboost_rank_ranker(training_data_df, validation_df, feature_cols):
    ranker = XGBRanker()
    nb_teams = validation_df.team.nunique()
    nb_training_seasons = training_data_df.season.nunique()
    group = np.array([nb_teams]*nb_training_seasons)
    
    training_data_df_sorted = training_data_df.sort_values(by='season').reset_index(drop=True)
    ranker.fit(X=training_data_df_sorted[feature_cols].values, 
           y=training_data_df_sorted['final_rank'].values,
           group=group)
    
    # ranker.predict(np.array([one_season_umap_valid[0]]))
    # the lower the better
    ranker_vals = ranker.predict(validation_df[feature_cols].values)
    tmp = pd.DataFrame(data=ranker_vals, columns=['xgb_ranker'])
    output_df = pd.concat([validation_df[['season', 'team', 'final_rank']], tmp], axis=1)
    output_df['predicted_rank'] = output_df['xgb_ranker'].rank()
    
    return output_df

def get_gradient_boosting_classifier_ranker(training_data_df, 
                                            validation_df,
                                            feature_cols,
                                            model_type='simple_classifier'
                                           ):
    # training_data_df = train_pivoted_df[feat_cols] 
    assert model_type in ['simple_classifier', 'rf_classifier']
    
    nb_teams = validation_df.team.nunique()
    
    if model_type == 'rf_classifier':
        classifier = XGBRFClassifier()
        core = 'xgb'
    else:
        classifier = XGBClassifier()
        core = 'xgbrf'
    
    classifier.fit(X=training_data_df[feature_cols].values, y=training_data_df['final_rank'].values, 
                 eval_metric='mlogloss')
    
    # compute the probabilities to belong to the different classes
    probs = classifier.predict_proba(validation_df[feature_cols].values)
    
    weights = np.array([r+np.exp(np.log(100)*r/nb_teams) for r in range(1, nb_teams + 1)])
    
    evaluation = np.array([np.dot(probs[i], weights) for i in range(len(probs))])
    
    return score_to_rank(season_df=validation_df, 
                          scores=evaluation, 
                          col_name=f'{core}_classifier_umap')

In [7]:
def get_ccdr_rankers(validation: pd.DataFrame, training: pd.DataFrame, break_leg: int):
    final_leg=training.leg.max()
    ccdr_voters = dict()
    # data formating
    train_pivoted_df = get_pivoted(data=training, break_leg=break_leg)
    valid_pivoted_df = get_pivoted(data=validation, break_leg=break_leg)
    
    # features columns
    feat_cols = [c for c in train_pivoted_df.columns if c.startswith('leg')]
    feat_cols += ['rolling_5_games_avg_nb_points', 'avg_cum_pts_since_season_start', 
                  'cum_goal_diff', 'cum_goals_scored']
    
    umaped_train_df, embed_columns = get_umaped_data(data=train_pivoted_df, feat_cols=feat_cols)
    umaped_valid_df, _ = get_umaped_data(data=valid_pivoted_df, feat_cols=feat_cols)
    # fuzzy_umap
    ccdr_voters['fuzzy_umap'] = get_umap_ranker(umaped_data_training=deepcopy(umaped_train_df),
                                                data_evaluation=deepcopy(umaped_valid_df),
                                                embed_columns=embed_columns)
    # gradient boosting classifiers on umaped data
    ccdr_voters['xgb_class_umap'] = get_gradient_boosting_classifier_ranker(
        training_data_df=deepcopy(umaped_train_df),
        validation_df=deepcopy(umaped_valid_df),
        feature_cols=embed_columns,
        model_type='simple_classifier')
    
    ccdr_voters['xgbrf_class_umap'] = get_gradient_boosting_classifier_ranker(
        training_data_df=deepcopy(umaped_train_df),
        validation_df=deepcopy(umaped_valid_df),
        feature_cols=embed_columns,
        model_type='rf_classifier')
    # gradient boosting classifiers on raw data
    ccdr_voters['xgb_class_raw'] = get_gradient_boosting_classifier_ranker(
        training_data_df=deepcopy(train_pivoted_df),
        validation_df=deepcopy(valid_pivoted_df),
        feature_cols=feat_cols,
        model_type='simple_classifier')
    ccdr_voters['xgbrf_class_raw'] = get_gradient_boosting_classifier_ranker(
        training_data_df=deepcopy(train_pivoted_df),
        validation_df=deepcopy(valid_pivoted_df),
        feature_cols=feat_cols,
        model_type='rf_classifier')
    # gradient boosting ranker on umaped data
    ccdr_voters['xgb_ranker_umap'] = get_xgboost_rank_ranker(
        training_data_df=deepcopy(umaped_train_df),
        validation_df=deepcopy(umaped_valid_df),
        feature_cols=embed_columns)
    # gradient boosting ranker on raw data
    ccdr_voters['xgb_ranker_raw'] = get_xgboost_rank_ranker(
        training_data_df=deepcopy(train_pivoted_df),
        validation_df=deepcopy(valid_pivoted_df),
        feature_cols=feat_cols)
    
    sort_results = {model : df.sort_values(by='predicted_rank').reset_index(drop=True) 
                    for model, df in ccdr_voters.items()
                   }
    print('---- Ccdr rankers DONE')
    return {model : list(df.team) for model, df in sort_results.items()}

### Markov Voter

In [8]:
# We build a Markov Chain based on all rankers : every rankers is voting

def rank_to_transition_mat(ranking, matrix, team_to_idx):
    for j in range(len(ranking)-1):
        fro = team_to_idx[ranking[j+1]]
        to = team_to_idx[ranking[j]]
        matrix[to][fro] += 1
    return matrix


def get_self_loop(matrix):
    for i in range(matrix.shape[0]):
        matrix[i][i] = matrix[i].sum()

    return matrix


def normalize(matrix: np.array):
    matrix_t = matrix.transpose()
    for i in range(matrix.shape[0]):
        matrix_t[i] = matrix_t[i] / (matrix_t[i].sum())

    return matrix_t.transpose()


def get_transition_matrix(team_to_idx, rankers=None):
    if rankers is None:
        raise ValueError("I Need voters: no voter, not transition matrix")
    
    nb_teams = len(team_to_idx)
    mat = np.zeros(nb_teams**2).reshape(nb_teams, nb_teams)
    for _, ranking in rankers.items():
        # print(f'-{_}')
        mat = rank_to_transition_mat(ranking=ranking,
                                     matrix=mat,
                                     team_to_idx=team_to_idx
                                     )
    mat = get_self_loop(matrix=mat)
    mat = normalize(matrix=mat)
    return mat


def get_invariant_probability(matrix, num_iterations: int = 100, d: float = .85):
    """PageRank: The trillion dollar algorithm.

    Parameters
    ----------
    matrix : numpy array
        adjacency matrix where M_i,j represents the link from 'j' to 'i',
        such that for all 'j' sum(i, M_i,j) = 1. The matrix is irreducible
    num_iterations : int, optional
        number of iterations, by default 100
    d : float, optional
        damping factor, by default 0.85

    Returns
    -------
    numpy array
        the vector of invariant probability
        v sums to 1

    """
    n = matrix.shape[1]
    # v = np.random.rand(N, 1)
    inv_prb = np.ones((n, 1))
    inv_prb = inv_prb / np.linalg.norm(inv_prb, 1)
    matrix_hat = (d * matrix + (1 - d) / n)
    for i in range(num_iterations):
        inv_prb = matrix_hat @ inv_prb
    return inv_prb


def rankers_vote(rankers: dict, teams: list):
    # replace below ['A', 'B', 'C', 'D', 'E'] by teams
    team_to_idx = {p_id: i for i, p_id in enumerate(teams)} 
    idx_to_team = {i: p_id for i, p_id in enumerate(teams)}

    trans_mat = get_transition_matrix(team_to_idx=team_to_idx,
                                      rankers=rankers)

    v = get_invariant_probability(matrix=trans_mat, num_iterations=10, d=.95)
    
    voting_results = {idx_to_team[i] : v[i][0] for i in range(len(teams))}
    
    df = pd.DataFrame(data={"team": list(voting_results.keys()), "score": list(voting_results.values())})
    df.sort_values(by='score', ascending=False, inplace=True)
    df.reset_index(drop=True, inplace=True)
    df['markov_voters_predicted_rank'] = df.index + 1
    print("---- Markov Rankers vote is DONE")
    return df

# Evaluation criterion

In [9]:
def get_bonus_position(nb_teams):
    bonus_position = {1: 300, 2: 280, 3: 270, 4: 260, 5: 220}
    
    bonus_position[nb_teams] = 350
    bonus_position[nb_teams - 1] = 340
    bonus_position[nb_teams - 2] = 300
    bonus_position[nb_teams - 3] = 280
    
    for rk in range(6, nb_teams-3):
        upper = ((nb_teams/2)-3)**2
        lower = ((nb_teams/2)-5)**2
        if 2*rk<nb_teams:
            bonus_position[rk] = int(220*(rk-nb_teams/2)**2/lower) 
        else:
            bonus_position[rk] = int(280*((rk-nb_teams/2)**2)/upper)
        
    return bonus_position

In [10]:
def get_bonus(nb_teams):
    max_bonus = 250*nb_teams
    return {1+ i//2 if i%2==0 else nb_teams - i//2 : max_bonus - i*250  for i in range(nb_teams)}

In [11]:
get_bonus(20)

{1: 5000,
 20: 4750,
 2: 4500,
 19: 4250,
 3: 4000,
 18: 3750,
 4: 3500,
 17: 3250,
 5: 3000,
 16: 2750,
 6: 2500,
 15: 2250,
 7: 2000,
 14: 1750,
 8: 1500,
 13: 1250,
 9: 1000,
 12: 750,
 10: 500,
 11: 250}

In [31]:
def get_ranking_performance_summary(df, season, break_leg):
    validation = deepcopy(df[df.season==season]).reset_index(drop=True)
    training = deepcopy(df[df.season!=season]).reset_index(drop=True)
    nb_teams = validation.team.nunique()
    base_val = deepcopy(validation)[
        ['season', 'team', 'final_rank', 'final_cum_pts']].drop_duplicates().reset_index(drop=True)
    
    bonus_position = get_bonus_position(nb_teams = validation.team.nunique())
    # bonus_position = get_bonus(nb_teams = validation.team.nunique())
    
    base_val['base_gain'] = base_val[['final_rank', 'final_cum_pts']].apply(
        lambda r: bonus_position[r[0]]+r[1], axis=1)
    deepcopy(base_val).sort_values(by='base_gain', ascending=False).reset_index(drop=True)
    
    base_val.sort_values(by='base_gain', ascending=False, inplace=True)
    base_val.reset_index(drop=True, inplace=True)
    base_val['inv_discount'] = base_val.index + 1
    
    base_val['gain'] = base_val[['base_gain', 'inv_discount']].apply(lambda r: r[0]/np.log2(1+r[1]), axis=1)
    
    rank_to_inv_discount = {rk : inv_disc for rk, inv_disc in zip(base_val.final_rank, base_val.inv_discount)}
    
    # get the differents rankers
    # -- regression like
    vote_casting_regression, metadata_records = get_regression_rankers(
        validation=validation, 
        training=training,
        breaking_leg=break_leg)
    metadata_records_df = pd.DataFrame(data=metadata_records)
    # -- classification/clustering/general ranking
    vote_casting_ccdr = get_ccdr_rankers(validation=validation,
                                         training=training,
                                         break_leg=break_leg)
    
    # -- markov
    global voters
    voters = {**vote_casting_regression, **vote_casting_ccdr}
    markov_voters = rankers_vote(rankers=voters, teams=list(validation.team.unique()))
    # Process with evaluation
    all_voters_dict = {**{"markov_voters": markov_voters[['team', 'markov_voters_predicted_rank']]},
                   **{model: pd.DataFrame(
                       data={'team': data, f'{model}_predicted_rank': list(range(1,nb_teams+1))}
                   )
                     for model, data in voters.items()}
                  }
    
    
    all_voters_dict_2_b = {
        model : dg.merge(base_val[['team', 'base_gain']], on='team').rename(
            columns={'base_gain': f'base__{model}_gain'}) 
        for model, dg in all_voters_dict.items()
    }
    
    all_voters_dict_3 = dict()
    
    for model, dg in all_voters_dict_2_b.items():
        dg[f'{model}_gain_2'] = dg[[f'base__{model}_gain', f'{model}_predicted_rank']].apply(
            lambda r: r[0]/np.log2(1+rank_to_inv_discount[r[1]]), axis=1)
        all_voters_dict_3[model] = dg[['team', f'{model}_predicted_rank', f'{model}_gain_2']]
    
    quality_dcg_score = {model : getattr(dg, f'{model}_gain_2').sum()
                         for model, dg in all_voters_dict_3.items()}
    
    base_quality_score = base_val.gain.sum()
    
    return ({model : round(100*score/base_quality_score, 2) for model, score in quality_dcg_score.items()}, 
            metadata_records_df)

# Run the experiment

In [13]:
championship_dict_df = {champ: prepare_data(csv_path=path) for champ, path in championship_csv.items()}

In [32]:
%%time
start_season = '2004-2005'
end_season = '2018-2019'
break_leg=27

start = int(start_season.split('-')[0])
end = int(end_season.split('-')[1])
record_list = []
metadata_df_list = []

for start_season_year in range(start, end):
    season = f'{start_season_year}-{start_season_year+1}'
    for championship, data in championship_dict_df.items():
        print(f'Run {championship} on season {season}')
        s = datetime.now()
        record = {'championship': championship, 'season': season}
        ranking_perf_summary_record, metadata_df = get_ranking_performance_summary(
            df=data, 
            season=season, 
            break_leg=break_leg)
        metadata_df['season'] = season
        metadata_df['championship'] = championship
        record = {**record, **ranking_perf_summary_record}
        record_list.append(record)
        metadata_df_list.append(metadata_df)
        e = datetime.now()
        print(f'This loop took {(e-s).total_seconds()} seconds')

recap_df = pd.DataFrame(data=record_list)

Run ligue-1 on season 2004-2005
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop took 10.090701 seconds
Run ligue-2 on season 2004-2005
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop took 9.786718 seconds
Run serie-A on season 2004-2005
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop took 10.11142 seconds
Run bundesliga on season 2004-2005
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop took 9.346173 seconds
Run premier-league on season 2004-2005
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop took 10.517627 seconds
Run liga on season 2004-2005
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop took 9.596294 seconds
Run ligue-1 on season 2005-2006
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov R

---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop took 9.383933 seconds
Run serie-A on season 2013-2014
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop took 9.221163 seconds
Run bundesliga on season 2013-2014
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop took 9.891928 seconds
Run premier-league on season 2013-2014
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop took 9.312519 seconds
Run liga on season 2013-2014
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop took 9.431071 seconds
Run ligue-1 on season 2014-2015
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop took 10.185348 seconds
Run ligue-2 on season 2014-2015
---- Regression rankers DONE
---- Ccdr rankers DONE
---- Markov Rankers vote is DONE
This loop too

In [None]:
e = datetime.now()

In [None]:
(e-s).total_seconds()

In [None]:
# voters.keys()

In [33]:
recap_df.to_csv("performance_recap_rank_predicter_2_get_bonus_position")

In [26]:
pd.concat(metadata_df_list, axis=0, ignore_index=True)

Unnamed: 0,r2_score_train,r2_score_test,rmse_test,rmse_eval,model,season,championship
0,0.901441,0.851694,4.507791,5.219810,lr_1,2004-2005,ligue-1
1,0.932930,0.821168,4.950023,5.323954,rf_1,2004-2005,ligue-1
2,0.935640,0.780312,5.486395,5.946314,xgb_1,2004-2005,ligue-1
3,0.868679,0.756511,5.775958,5.240258,lr_2,2004-2005,ligue-1
4,0.904048,0.846822,4.581228,5.113019,lr_3,2004-2005,ligue-1
...,...,...,...,...,...,...,...
1255,1.000000,0.878982,5.384841,5.007267,xgb_3,2018-2019,liga
1256,0.940768,0.905635,4.755022,4.028007,lr_5,2018-2019,liga
1257,0.910559,0.250493,0.471133,4.391987,rf_4,2018-2019,liga
1258,0.999864,0.078435,0.522419,4.232541,xgb_4,2018-2019,liga


In [28]:
# pd.concat(metadata_df_list, axis=0, ignore_index=True).to_csv("metadata_perf_recap_regressor_rank_predicter")

In [34]:
recap_df

Unnamed: 0,championship,season,markov_voters,lr_1,lr_2,lr_3,lr_4,lr_5,lr_6,naive,...,xgb_2,xgb_3,xgb_4,fuzzy_umap,xgb_class_umap,xgbrf_class_umap,xgb_class_raw,xgbrf_class_raw,xgb_ranker_umap,xgb_ranker_raw
0,ligue-1,2004-2005,97.25,97.29,96.31,96.69,96.97,96.94,96.53,96.31,...,96.37,97.38,95.09,95.69,88.58,95.29,95.99,96.31,90.36,96.37
1,ligue-2,2004-2005,98.17,98.16,97.25,96.49,92.38,92.46,98.24,97.25,...,97.48,97.95,98.05,97.40,85.96,82.81,96.57,96.35,84.17,91.52
2,serie-A,2004-2005,96.35,94.83,95.86,95.78,95.47,96.11,95.94,95.86,...,95.73,95.83,97.70,94.13,88.39,91.23,97.20,98.74,94.25,96.42
3,bundesliga,2004-2005,97.12,96.64,96.84,97.00,98.00,98.33,96.96,96.84,...,94.72,97.40,96.73,98.33,89.45,92.33,96.50,95.69,94.96,95.10
4,premier-league,2004-2005,99.18,99.08,99.53,99.16,99.69,99.69,99.16,99.53,...,98.86,99.35,99.18,95.01,86.86,93.25,99.25,99.55,97.34,98.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,ligue-2,2018-2019,96.17,97.83,94.62,97.91,98.02,98.02,97.91,94.62,...,94.89,94.45,95.51,77.07,89.32,90.91,93.94,95.63,86.19,95.74
86,serie-A,2018-2019,94.99,98.14,97.88,98.35,98.04,98.14,98.35,97.88,...,98.12,98.06,97.64,82.80,90.16,88.74,98.36,97.97,93.26,95.60
87,bundesliga,2018-2019,97.63,98.27,98.14,97.83,97.03,97.05,97.83,98.14,...,97.33,98.60,98.51,98.83,88.20,88.20,98.09,98.32,88.20,97.11
88,premier-league,2018-2019,98.61,98.63,98.90,99.06,98.71,98.52,99.01,98.90,...,98.44,98.32,98.75,92.89,87.09,87.09,98.84,98.91,87.76,98.38


## Analysis On Recap 

In [14]:
import pandas as pd
from copy import deepcopy

In [15]:
recap_1_df = pd.read_csv("performance_recap_rank_predicter").drop(columns='Unnamed: 0')

In [16]:
recap_1_df

Unnamed: 0,championship,season,markov_voters,lr_1,lr_2,lr_3,lr_4,lr_5,lr_6,naive,...,xgb_2,xgb_3,xgb_4,fuzzy_umap,xgb_class_umap,xgbrf_class_umap,xgb_class_raw,xgbrf_class_raw,xgb_ranker_umap,xgb_ranker_raw
0,ligue-1,2004-2005,97.74,96.90,96.78,96.61,97.11,97.10,96.38,96.78,...,96.40,97.30,95.82,96.23,96.21,88.90,95.72,96.12,93.04,95.37
1,ligue-2,2004-2005,94.05,98.39,97.43,97.82,94.13,94.19,98.39,97.43,...,97.75,97.42,97.54,86.99,84.34,87.03,95.12,97.38,81.64,93.66
2,serie-A,2004-2005,98.23,96.09,97.33,97.45,97.42,97.72,97.55,97.33,...,96.30,96.51,98.37,87.03,87.89,89.58,97.96,99.09,89.58,97.60
3,bundesliga,2004-2005,95.81,96.79,96.71,97.10,98.32,98.49,97.02,96.71,...,96.22,97.05,97.15,81.14,78.24,81.37,96.29,95.99,75.02,96.44
4,premier-league,2004-2005,98.58,98.84,99.13,98.93,99.38,99.38,98.93,99.13,...,98.72,98.63,99.02,78.67,73.44,82.17,98.88,98.88,77.53,98.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,ligue-2,2018-2019,98.27,98.16,94.70,98.19,98.25,98.25,98.19,94.70,...,95.40,94.81,95.96,91.45,90.23,85.85,94.61,96.97,85.85,95.95
86,serie-A,2018-2019,95.37,98.21,98.22,98.57,98.13,98.30,98.57,98.22,...,97.79,98.29,97.36,82.13,82.36,85.98,98.01,98.37,90.04,95.00
87,bundesliga,2018-2019,96.33,97.57,97.13,97.28,96.52,96.55,97.28,97.13,...,97.50,99.02,98.49,75.06,80.98,76.43,97.38,97.38,84.49,96.62
88,premier-league,2018-2019,97.46,97.84,98.12,98.20,97.97,97.81,98.13,98.12,...,97.17,97.06,97.97,88.91,88.40,90.01,97.99,98.31,90.19,97.16


In [17]:
recap_2_df = pd.read_csv("performance_recap_rank_predicter_2_get_bonus_position").drop(columns='Unnamed: 0')

##### Overall average (+ std)

In [18]:
overall_1_df_avg = recap_1_df.mean().to_frame()
overall_1_df_avg.rename(columns={0: 'average_bonus_1'}, inplace=True)

  overall_1_df_avg = recap_1_df.mean().to_frame()


In [19]:
overall_1_df_std = recap_1_df.std().to_frame()
overall_1_df_std.rename(columns={0: 'Std_bonus_1'}, inplace=True)

  overall_1_df_std = recap_1_df.std().to_frame()


In [20]:
overall_2_df_avg = recap_2_df.mean().to_frame()
overall_2_df_avg.rename(columns={0: 'average_bonus_2'}, inplace=True)

  overall_2_df_avg = recap_2_df.mean().to_frame()


In [21]:
overall_2_df_std = recap_2_df.std().to_frame()
overall_2_df_std.rename(columns={0: 'Std_bonus_2'}, inplace=True)

  overall_2_df_std = recap_2_df.std().to_frame()


In [22]:
overall_df_tmp = pd.concat([overall_1_df_avg, overall_2_df_avg, overall_1_df_std, overall_2_df_std], axis=1)
overall_df_tmp

Unnamed: 0,average_bonus_1,average_bonus_2,Std_bonus_1,Std_bonus_2
markov_voters,95.663556,96.060222,2.558001,2.74633
lr_1,97.239333,97.277,2.127974,2.293373
lr_2,96.576778,96.576778,2.278281,2.481117
lr_3,97.356667,97.257222,1.981426,2.476067
lr_4,97.283556,97.223444,1.90305,2.436507
lr_5,97.274444,97.179444,1.858409,2.393249
lr_6,97.436556,97.396333,1.95008,2.271032
naive,96.576778,96.576778,2.278281,2.481117
rf_1,96.783,96.806778,2.433148,2.766219
rf_2,97.011333,96.846333,2.449127,2.633613


In [23]:
idx = [i for i in overall_df_tmp.index if i!='markov_voters']
overall_df = deepcopy(overall_df_tmp.loc[idx,:])

In [24]:
overall_df.sort_values(by='average_bonus_1', ascending=False)

Unnamed: 0,average_bonus_1,average_bonus_2,Std_bonus_1,Std_bonus_2
lr_6,97.436556,97.396333,1.95008,2.271032
lr_3,97.356667,97.257222,1.981426,2.476067
lr_4,97.283556,97.223444,1.90305,2.436507
lr_5,97.274444,97.179444,1.858409,2.393249
lr_1,97.239333,97.277,2.127974,2.293373
rf_2,97.011333,96.846333,2.449127,2.633613
rf_4,96.956778,97.073556,2.83615,2.42799
rf_3,96.904556,96.926889,2.357847,2.483996
rf_1,96.783,96.806778,2.433148,2.766219
xgb_class_raw,96.753444,96.776111,2.031602,2.654644


In [25]:
overall_df.sort_values(by='average_bonus_2', ascending=False)

Unnamed: 0,average_bonus_1,average_bonus_2,Std_bonus_1,Std_bonus_2
lr_6,97.436556,97.396333,1.95008,2.271032
lr_1,97.239333,97.277,2.127974,2.293373
lr_3,97.356667,97.257222,1.981426,2.476067
lr_4,97.283556,97.223444,1.90305,2.436507
lr_5,97.274444,97.179444,1.858409,2.393249
rf_4,96.956778,97.073556,2.83615,2.42799
rf_3,96.904556,96.926889,2.357847,2.483996
rf_2,97.011333,96.846333,2.449127,2.633613
rf_1,96.783,96.806778,2.433148,2.766219
xgb_class_raw,96.753444,96.776111,2.031602,2.654644


#### average per championship (+ std)

In [29]:
cols1 = [c for c in recap_1_df.columns if c!='markov_voters']
piecewise_1_df_avg = recap_1_df.loc[:,cols1].groupby(by=['championship']).mean().transpose()

In [30]:
cols2 = [c for c in recap_2_df.columns if c!='markov_voters']
piecewise_2_df_avg = recap_2_df.loc[:,cols2].groupby(by=['championship']).mean().transpose()

In [31]:
piecewise_1_df_avg.sort_values(by='ligue-1', ascending=False)

championship,bundesliga,liga,ligue-1,ligue-2,premier-league,serie-A
lr_6,98.213333,97.145333,97.069333,96.360667,97.817333,98.013333
lr_4,97.918,97.364667,97.068,95.775333,97.766667,97.808667
lr_3,97.978667,97.086667,97.061333,96.216667,97.778,98.018667
lr_1,98.017333,97.248667,96.983333,95.684,97.655333,97.847333
lr_5,97.824667,97.309333,96.902,95.964,97.792,97.854667
xgb_class_raw,97.286,96.356667,96.739333,95.332667,97.936667,96.869333
rf_4,97.642,97.276,96.72,94.656,97.674,97.772667
xgb_ranker_raw,96.978,97.204667,96.674,94.794,97.04,96.674
rf_2,97.398667,97.29,96.666,95.126667,97.988,97.598667
xgb_2,97.327333,97.046667,96.287333,94.02,97.404667,97.072


In [32]:
piecewise_2_df_avg.sort_values(by='ligue-1', ascending=False)

championship,bundesliga,liga,ligue-1,ligue-2,premier-league,serie-A
lr_3,97.823333,96.87,97.196,95.859333,97.716667,98.078
lr_6,98.218,96.896,97.185333,96.257333,97.767333,98.054
lr_4,97.786667,97.305333,97.128667,95.616,97.642,97.862
lr_1,97.854,97.102667,97.049333,96.024667,97.684,97.947333
lr_5,97.672667,97.208,96.978667,95.696667,97.588667,97.932
xgb_class_raw,97.456,96.552,96.914,94.900667,97.881333,96.952667
xgb_ranker_raw,96.728,97.47,96.706667,94.417333,97.229333,97.044
rf_4,97.497333,97.378,96.503333,95.535333,97.672667,97.854667
rf_2,97.288,97.030667,96.484667,95.444,97.168,97.662667
rf_1,97.562,97.042667,96.409333,94.752,97.288,97.786667


In [None]:
## keep lr_6, xgb_class_raw, xgb_ranker_raw