# Objective

The purpose of this notebook is to build a model able to predict the number of goals a team is going to score
at the next game.

Features are Team plays at home/away, number of goals it scored the last 5 games, during all season so far, the leg, its position, the number of points it has as well as the opponent's rank, number of points, goals conceded last 5 games and number of goals conceded so far.

Then use the probabilities obtained to simulate the last N games of a championship.

In [1]:
import pandas as pd
import numpy as np
import math
import plotly.express as px
from copy import deepcopy

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
def rolling_mean_n_performance(df, window=5, performance_col='goals_scored'):
    dg = df.sort_values(by=['leg'])[['season', 'team', performance_col]].groupby(
        by=['season', 'team'])[performance_col].rolling(window=window, min_periods=1).mean().reset_index()
    
    new_col_name = f'rolling_{window}_games_avg_{performance_col}'
    df[new_col_name] = dg.set_index('level_2')[performance_col]
    return df

def get_past_feature(df, feat_col, team=True):

    merge_col = 'team' if team else 'opponent'    
    tmp_df = deepcopy(df[['season', 'leg', merge_col, feat_col]])
    tmp_df.loc[:,'next_leg'] = tmp_df['leg'] +1

    tmp_df.rename(columns={'leg': 'previous_leg', 
                           'next_leg':'leg', 
                           feat_col:f'previous_{merge_col}_{feat_col}'},
                  inplace=True)

    df = df.merge(tmp_df, how='left', on=['leg', 'season', merge_col])
    df.drop(columns=['previous_leg'], inplace=True)
    return df

In [4]:
def prepare_data(csv_path, championship, rolling=5):
    df = pd.read_csv(csv_path).drop(columns='Unnamed: 0')
    df['championship'] = championship
    df['goal_diff'] = df['goals_scored'] - df['goals_conceded']
    # cumulative
    df['cum_pts'] = df[['season', 'team', 'nb_points']].groupby(
    by=['season', 'team']).cumsum()
    
    df['cum_goal_diff'] = df[['season', 'team', 'goal_diff']].groupby(
    by=['season', 'team']).cumsum()
    
    df['cum_goals_scored'] = df[['season', 'team', 'goals_scored']].groupby(
    by=['season', 'team']).cumsum()
    
    df['cum_goals_conceded'] = df['cum_goals_scored']-df['cum_goal_diff']
    df['rank'] = df[['season', 'leg', 'cum_pts', 'cum_goal_diff', 'cum_goals_scored']].sort_values(
        by=['cum_pts', 'cum_goal_diff', 'cum_goals_scored'], ascending=False).groupby(
        by=['season', 'leg']).cumcount() + 1
    
    df['avg_goals_scored_since_season_start'] = df['cum_goals_scored'].div(df['leg'])
    df['avg_goals_conceded_since_season_start'] = df['cum_goals_conceded'].div(df['leg'])
    df['avg_cum_pts_since_season_start'] = df['cum_pts'].div(df['leg'])
    
    # removed unwanted useless seasons
    data = deepcopy(df[df.season > '2003-2004'])
    data.reset_index(drop=True, inplace=True)
    
    leg_max = data.leg.max()
    
    end_season = data[data.leg==leg_max].rename(columns={'rank':'final_rank', 'cum_pts': 'final_cum_pts'})
    data = data.merge(end_season[['season', 'team', 'final_rank', 'final_cum_pts']], on=['season', 'team'])
    
    # rolling mean
    cols = ['goals_conceded', 'goals_scored', 'nb_points']
    for c in cols:
        data = rolling_mean_n_performance(df=data, window=rolling, performance_col=c)
    # past features
    past_features = {'rank': [True, False], 
                     'rolling_5_games_avg_goals_scored' : [True],
                     'rolling_5_games_avg_goals_conceded': [False],
                     'avg_goals_scored_since_season_start': [True],
                     'avg_goals_conceded_since_season_start': [False],
                     'goals_scored': [True],
                     'goals_conceded': [False],
                     'rolling_5_games_avg_nb_points': [True, False],
                     'nb_points': [True, False]
                    }
    # print(f'length {len(data)}')
    for col, is_team_ll in past_features.items():
        for is_team in is_team_ll:
            # print(f"is_team ={is_team}, col = {col}")
            data = get_past_feature(df=data, feat_col=col, team=is_team)
    
    return data

In [5]:
def get_pivoted(data: pd.DataFrame, break_leg: int, value_col: str = 'goals_scored'):
    
    df = deepcopy(data[data.leg <= break_leg])
    df.rolling_5_games_avg_nb_points = [y if x!=x else x for x, y in 
           zip(df.rolling_5_games_avg_nb_points, df.avg_cum_pts_since_season_start)]
    
    df_pivot = df.pivot_table(index=['season', 'team'], 
                              columns='leg', 
                              values=[value_col]).reset_index()
    
    df_pivot.columns = [f'leg_{l}' if l!='' else n for n, l in df_pivot.columns]
    
    final = df[['season', 'team', 'final_rank', 'final_cum_pts']].drop_duplicates()
    df_last_leg = df[df.leg==break_leg][['season', 
                                          'team', 
                                          'rank', 
                                          'rolling_5_games_avg_nb_points', 
                                          'avg_cum_pts_since_season_start', 
                                          'cum_pts']].reset_index(drop=True)
    
    df_pivot = df_pivot.merge(df_last_leg, on=['season', 'team'])
    
    return df_pivot.merge(final, on=['season', 'team'])

In [6]:
# hist on goal scored

In [7]:
# Preprocessing

In [8]:
championship_csv = {'ligue-1': 'ligue-1_data_2002_2019',
                   'ligue-2': 'ligue-2_data_2002_2019',
                   'serie-A': 'serie-a_data_2004_2019',
                   'bundesliga': 'bundesliga_data_2004_2019',
                   'premier-league': 'premier-league_data_2004_2019',
                   'liga':'liga_data_2004_2019'}

In [9]:
d# ata_df = prepare_data(championship_csv['ligue-1'], championship='ligue-1')

In [10]:
all_data_dfs = [prepare_data(csv_path=path, championship=champ) for champ, path in championship_csv.items()]

In [11]:
all_data_df = pd.concat(all_data_dfs)

In [12]:
data_exploitable_df = deepcopy(all_data_df[all_data_df.leg > 1]).reset_index()

In [13]:
#data_df.head()

In [14]:
data_exploitable_df.head()

Unnamed: 0,index,country,season,leg,team,play,goals_scored,opponent,goals_conceded,nb_points,...,previous_team_rolling_5_games_avg_goals_scored,previous_opponent_rolling_5_games_avg_goals_conceded,previous_team_avg_goals_scored_since_season_start,previous_opponent_avg_goals_conceded_since_season_start,previous_team_goals_scored,previous_opponent_goals_conceded,previous_team_rolling_5_games_avg_nb_points,previous_opponent_rolling_5_games_avg_nb_points,previous_team_nb_points,previous_opponent_nb_points
0,1,France,2004-2005,2,Lyon,Home,1,Sochaux,1,1,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,3.0,1.0,3.0
1,2,France,2004-2005,3,Lyon,Away,1,Metz,1,1,...,0.5,1.5,0.5,1.5,1.0,2.0,1.0,1.5,1.0,0.0
2,3,France,2004-2005,4,Lyon,Home,1,Lille,0,3,...,0.666667,1.333333,0.666667,1.333333,1.0,0.0,1.0,1.666667,1.0,1.0
3,4,France,2004-2005,5,Lyon,Away,2,Rennes,1,3,...,0.75,1.25,0.75,1.25,1.0,1.0,1.5,1.0,3.0,1.0
4,5,France,2004-2005,6,Lyon,Home,0,Bastia,0,1,...,1.0,1.2,1.0,1.2,2.0,1.0,1.8,1.8,3.0,0.0


# Xgboost Regressor 

Despite we want to predict a number in a small set of possibilities, there is an order between them

In [15]:
from xgboost import XGBRFRegressor, XGBRegressor

In [16]:
from collections import Counter

In [17]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [18]:
from sklearn.tree import DecisionTreeRegressor

In [19]:
from sklearn.linear_model import Ridge, SGDRegressor, PoissonRegressor

In [20]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [23]:
from sklearn.preprocessing import MinMaxScaler

#### data prep

In [24]:
no_use_cols = ['index', 'country', 'season', 'team', 'opponent', 
               'previous_team_rolling_5_games_avg_goals_scored_binned']
usable_cols = [c for c in data_exploitable_df.columns if c not in no_use_cols]
usable_cols = ['leg', 'play'] + [c for c in usable_cols if c.startswith('previous')] + ['goals_scored']

In [25]:
regressor_data_df = deepcopy(data_exploitable_df.loc[:,usable_cols])

In [26]:
regressor_data_df = deepcopy(regressor_data_df[regressor_data_df.goals_scored < 5])

In [27]:
regressor_data_df

Unnamed: 0,leg,play,previous_team_rank,previous_opponent_rank,previous_team_rolling_5_games_avg_goals_scored,previous_opponent_rolling_5_games_avg_goals_conceded,previous_team_avg_goals_scored_since_season_start,previous_opponent_avg_goals_conceded_since_season_start,previous_team_goals_scored,previous_opponent_goals_conceded,previous_team_rolling_5_games_avg_nb_points,previous_opponent_rolling_5_games_avg_nb_points,previous_team_nb_points,previous_opponent_nb_points,goals_scored
0,2,Home,12.0,2.0,0.000000,1.000000,0.000000,1.000000,0.0,1.0,1.0,3.000000,1.0,3.0,1
1,3,Away,14.0,8.0,0.500000,1.500000,0.500000,1.500000,1.0,2.0,1.0,1.500000,1.0,0.0,1
2,4,Home,15.0,4.0,0.666667,1.333333,0.666667,1.333333,1.0,0.0,1.0,1.666667,1.0,1.0,1
3,5,Away,6.0,13.0,0.750000,1.250000,0.750000,1.250000,1.0,1.0,1.5,1.000000,3.0,1.0,2
4,6,Home,4.0,5.0,1.000000,1.200000,1.000000,1.200000,2.0,1.0,1.8,1.800000,3.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64405,34,Away,1.0,18.0,2.000000,1.600000,2.484848,1.484848,2.0,2.0,2.2,0.600000,3.0,1.0,2
64406,35,Home,1.0,8.0,2.000000,2.400000,2.470588,1.352941,2.0,4.0,2.2,0.600000,3.0,0.0,1
64407,36,Away,1.0,12.0,1.400000,1.000000,2.428571,1.114286,1.0,0.0,2.6,0.600000,3.0,1.0,0
64408,37,Home,1.0,18.0,1.000000,1.200000,2.361111,1.416667,0.0,2.0,2.0,0.600000,0.0,0.0,2


In [28]:
# encode play variable
regressor_data_df['play'] = [1 if p=='Home' else 0 for p in regressor_data_df.play]

In [29]:
# split train-test-valid : 0.8x.95-0.2x.95-0.05
features_data = regressor_data_df.drop('goals_scored', axis=1).values
target_data = regressor_data_df['goals_scored'].values

In [30]:
mm_scaler = MinMaxScaler(feature_range=(0,100))

In [31]:
scale = False

In [32]:
if scale:
    feature_data = mm_scaler.fit_transform(features_data)

In [33]:
X_model, X_valid, y_model, y_valid = train_test_split(features_data, 
                                                      target_data, 
                                                      test_size=0.05, 
                                                      random_state=42,
                                                      stratify=target_data
                                                     )

X_train, X_test, y_train, y_test = train_test_split(X_model,
                                                    y_model,
                                                    test_size=.2,
                                                    random_state=61,
                                                    stratify=y_model
                                                   )

In [None]:
oversample = False

In [34]:
oversampler = SMOTE(random_state=42) #RandomOverSampler()

In [35]:
if oversample:
    X_train_over, y_train_over = oversampler.fit_resample(X_train, y_train)
else:
    X_train_over, y_train_over = X_train, y_train

In [36]:
len(y_train_over)

83785

In [37]:
len(y_train)

48198

In [38]:
Counter(y_test)

Counter({2: 2670, 0: 3542, 1: 4189, 3: 1206, 4: 443})

#### Models 

In [39]:
def get_model(model_name:str, X, y):
    """
    model_name: str: name of the model to be used. MUST be in 
    ['xgboost', 'random_forest', 'decision_tree', 'extra_tree']
    X: array-like : feature
    y: array-like : target
    
    returns: fitted model 
    """

In [40]:
xgb_reg = XGBRegressor()

In [41]:
rf_reg = RandomForestRegressor(random_state=42)

In [42]:
extra_tree_reg = ExtraTreesRegressor(random_state=42)

In [43]:
decision_tree_reg = DecisionTreeRegressor(random_state=42)

In [48]:
#help(PoissonRegressor)

In [47]:
ridge_reg = Ridge(random_state=42)
sgd_reg = SGDRegressor(random_state=42)
poisson_reg = PoissonRegressor()

In [45]:
knn_reg = KNeighborsRegressor()
mlp_reg = MLPRegressor(random_state=42)

In [49]:
xgb_reg.fit(X=X_train_over, y=y_train_over)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [50]:
rf_reg.fit(X=X_train_over, y=y_train_over)

RandomForestRegressor(random_state=42)

In [51]:
extra_tree_reg.fit(X=X_train_over, y=y_train_over)

ExtraTreesRegressor(random_state=42)

In [52]:
decision_tree_reg.fit(X=X_train_over, y=y_train_over)

DecisionTreeRegressor(random_state=42)

In [53]:
sgd_reg.fit(X=X_train_over, y=y_train_over)
ridge_reg.fit(X=X_train_over, y=y_train_over)
poisson_reg.fit(X=X_train_over, y=y_train_over)

PoissonRegressor()

In [54]:
knn_reg.fit(X=X_train_over, y=y_train_over)
mlp_reg.fit(X=X_train_over, y=y_train_over)

MLPRegressor(random_state=42)

In [55]:
xgb_predictions = xgb_reg.predict(X_test)

In [56]:
rf_predictions = rf_reg.predict(X_test)

In [57]:
extra_tree_predictions = extra_tree_reg.predict(X_test)

In [58]:
decision_tree_predictions = decision_tree_reg.predict(X_test)

In [59]:
sgd_predictions = sgd_reg.predict(X_test)
ridge_predictions = ridge_reg.predict(X_test)
poisson_predictions = poisson_reg.predict(X_test)

In [60]:
knn_predictions = knn_reg.predict(X_test)
mlp_predictions = mlp_reg.predict(X_test)

In [61]:
def to_closer_integer_A(x):
    return math.ceil(int(2*max(x,0))/2)

def to_closer_integer_B(x):
    return math.floor(max(x,0))

In [62]:
def predictions_to_scores_A(X):
    return np.array([to_closer_integer_A(x) for x in X])

def predictions_to_scores_B(X):
    return np.array([to_closer_integer_B(x) for x in X])

In [63]:
# Counter(xgb_predictions)

In [64]:
# Counter(y_test)

In [65]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [66]:
naive_predictions = np.array([1]*len(y_test))

In [67]:
confusion_matrix(y_true=y_test, y_pred=naive_predictions)

array([[   0, 3542,    0,    0,    0],
       [   0, 4189,    0,    0,    0],
       [   0, 2670,    0,    0,    0],
       [   0, 1206,    0,    0,    0],
       [   0,  443,    0,    0,    0]])

In [68]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_A(xgb_predictions))

array([[  44, 2903,  589,    6,    0],
       [  40, 3197,  943,    9,    0],
       [  22, 1901,  736,   11,    0],
       [   2,  777,  415,   12,    0],
       [   4,  240,  190,    9,    0]])

In [69]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_B(xgb_predictions))

array([[1177, 2304,   61,    0,    0],
       [1152, 2900,  135,    2,    0],
       [ 563, 1969,  135,    3,    0],
       [ 190,  925,   89,    2,    0],
       [  57,  340,   44,    2,    0]])

In [70]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_A(rf_predictions))

array([[  17, 2736,  778,   11,    0],
       [  19, 2986, 1160,   24,    0],
       [  12, 1732,  901,   25,    0],
       [   4,  680,  495,   27,    0],
       [   2,  207,  227,    7,    0]])

In [71]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_B(rf_predictions))

array([[ 939, 2508,   94,    1,    0],
       [ 855, 3135,  198,    1,    0],
       [ 460, 2012,  197,    1,    0],
       [ 140,  943,  121,    2,    0],
       [  39,  334,   70,    0,    0]])

In [72]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_A(extra_tree_predictions))

array([[  38, 2726,  762,   16,    0],
       [  34, 3004, 1121,   30,    0],
       [  27, 1771,  833,   38,    1],
       [   6,  682,  494,   23,    1],
       [   3,  215,  211,   13,    1]])

In [73]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_B(extra_tree_predictions))

array([[ 970, 2448,  120,    4,    0],
       [ 919, 3045,  222,    3,    0],
       [ 477, 2012,  177,    4,    0],
       [ 155,  921,  125,    5,    0],
       [  48,  323,   70,    1,    1]])

In [74]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_A(decision_tree_predictions))

array([[1165, 1183,  680,  380,  134],
       [1255, 1362,  877,  486,  209],
       [ 751,  837,  611,  318,  153],
       [ 316,  359,  282,  160,   89],
       [ 112,  122,  115,   58,   36]])

In [75]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_B(decision_tree_predictions))

array([[1169, 1184,  677,  378,  134],
       [1259, 1363,  873,  485,  209],
       [ 752,  839,  609,  317,  153],
       [ 317,  359,  282,  159,   89],
       [ 112,  122,  115,   58,   36]])

In [76]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_A(sgd_predictions))

array([[   5, 1872, 1631,   34,    0],
       [   1, 1801, 2309,   78,    0],
       [   1,  926, 1669,   73,    1],
       [   0,  337,  815,   54,    0],
       [   0,  101,  311,   31,    0]])

In [77]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_B(sgd_predictions))

array([[ 436, 2641,  462,    3,    0],
       [ 381, 3054,  749,    5,    0],
       [ 178, 1877,  612,    3,    0],
       [  47,  806,  348,    5,    0],
       [  14,  251,  174,    4,    0]])

In [78]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_A(ridge_predictions))

array([[   0, 1049, 2288,  204,    1],
       [   0,  904, 2881,  402,    2],
       [   0,  430, 1866,  373,    1],
       [   0,  134,  840,  231,    1],
       [   0,   31,  287,  125,    0]])

In [79]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_B(ridge_predictions))

array([[  68, 2336, 1127,   11,    0],
       [  60, 2397, 1702,   30,    0],
       [  27, 1305, 1304,   33,    1],
       [   6,  479,  687,   34,    0],
       [   1,  148,  271,   23,    0]])

In [80]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_A(knn_predictions))

array([[ 212, 1435, 1201,  544,  150],
       [ 279, 1608, 1449,  674,  179],
       [ 166,  919,  968,  475,  142],
       [  50,  400,  423,  253,   80],
       [  21,  135,  164,   91,   32]])

In [81]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_B(knn_predictions))

array([[ 711, 1509,  898,  336,   88],
       [ 787, 1778, 1122,  374,  128],
       [ 459, 1087,  748,  287,   89],
       [ 164,  471,  375,  147,   49],
       [  55,  168,  147,   50,   23]])

In [82]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_A(mlp_predictions))

array([[  42, 1696, 1674,  130,    0],
       [  32, 1773, 2118,  260,    6],
       [  14,  948, 1471,  231,    6],
       [   9,  324,  708,  158,    7],
       [   2,   93,  264,   80,    4]])

In [83]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_B(mlp_predictions))

array([[ 512, 2373,  640,   17,    0],
       [ 454, 2712,  981,   42,    0],
       [ 219, 1671,  724,   55,    1],
       [  72,  685,  416,   33,    0],
       [  24,  210,  184,   25,    0]])

In [84]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_A(poisson_predictions))

array([[   0,  265, 3205,   72,    0],
       [   0,  222, 3806,  161,    0],
       [   0,   91, 2429,  150,    0],
       [   0,   33, 1056,  117,    0],
       [   0,    7,  370,   66,    0]])

In [85]:
confusion_matrix(y_true=y_test, y_pred=predictions_to_scores_B(poisson_predictions))

array([[   0, 2395, 1147,    0,    0],
       [   0, 2506, 1683,    0,    0],
       [   0, 1434, 1235,    1,    0],
       [   0,  556,  650,    0,    0],
       [   0,  158,  285,    0,    0]])

In [86]:
accuracy_score(y_true=y_test, y_pred=naive_predictions)

0.34763485477178424

In [87]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_A(xgb_predictions))

0.33103734439834026

In [88]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_B(xgb_predictions))

0.34970954356846473

In [89]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_A(rf_predictions))

0.3262240663900415

In [90]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_B(rf_predictions))

0.3546058091286307

In [91]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_A(extra_tree_predictions))

0.32356846473029044

In [92]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_B(extra_tree_predictions))

0.3483817427385892

In [93]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_A(decision_tree_predictions))

0.2766804979253112

In [94]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_B(decision_tree_predictions))

0.27684647302904564

In [95]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_A(ridge_predictions))

0.24904564315352698

In [96]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_B(ridge_predictions))

0.3156016597510373

In [97]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_A(sgd_predictions))

0.29286307053941907

In [98]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_B(sgd_predictions))

0.3408298755186722

In [99]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_A(knn_predictions))

0.25502074688796683

In [100]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_B(knn_predictions))

0.28273858921161826

In [101]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_A(mlp_predictions))

0.2861410788381743

In [102]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_B(mlp_predictions))

0.3303734439834025

In [103]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_A(poisson_predictions))

0.22970954356846474

In [104]:
accuracy_score(y_true=y_test, y_pred=predictions_to_scores_B(poisson_predictions))

0.3104564315352697

In [None]:
#for c, fi in zip(usable_cols, xgb_reg.feature_importances_):
#    print(f"importance of {c} : {fi}")

In [None]:
#for c, fi in zip(usable_cols,rf_reg.feature_importances_):
#    print(f"importance of {c} : {fi}")

In [None]:
#for c, fi in zip(usable_cols,extra_tree_reg.feature_importances_):
#    print(f"importance of {c} : {fi}")