In [153]:
from modelHandler import ModelHandler
import pandas as pd  
import numpy as np 
import sklearn as sk
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

In [154]:
df = pd.read_csv('../../data/Correlation05.csv', index_col = 0)

## Group by Position

Partiendo de la hipotesis que goles y asistencias tienen más peso para delanteros y mediocentros que para defensas y obviamente que para porteros, la división de estos grupos es fundamental.

Creemos que es interesante realizar un modelo para cada posición del campo de futbol. Elegimos realizar una primera aproximación con modelos para:
- Portero
- Defensa i mediocentro defensivo
- Mediocentros y bandas
- Delanteros y extremos

Otra configuración podria ser juntar mediocentros y delanteros. 

In [156]:
### group positon
# Positions grouoped 
tre = { 'Goalkeeper' : 0,
    'DefensiveMidfield': 1,'Defender' : 1,
 'LeftMidfield':2 , 'CentralMidfield':2, 'Midfielder':2, 'AttackingMidfield':2,'RightMidfield' : 2,
   'SecondStriker':3, 'Forward':3,'LeftWinger':3,'RightWinger' : 3}

#Columns to drop for each class, futher explained in presentation
cols_to_drop = {0: ['score_goal_cup', 'goal_continent', 'assist_continent', 
       'score_goal_champ', 'score_goals_selection', 'score_assist_cup',
       'score_assist_champ','age','position', 'yellow_card_champ', 'yellow_card_cup', 'yellow_card_continent', 'league', 'own_goal_champ'], 1: ['score_goal_cup', 'goal_continent', 'assist_continent', 
       'score_goal_champ', 'score_goals_selection', 'score_assist_cup',
       'score_assist_champ','age','position', 'league', 'own_goal_champ'], 2: None, 3: ['position', 'league', 'red_card_continent', 'own_goal_champ', 'yellow_card_champ',
       'second_yellow_card_champ', 'red_card_champ', 'yellow_card_cup','own_goal_continent',
       'yellow_card_continent', 'second_yellow_card_continent', ]}

df['position'] = df['position'].apply(lambda x: tre[x])

In [157]:
df.describe()

Unnamed: 0,red_card_continent,price,own_goal_champ,yellow_card_champ,second_yellow_card_champ,red_card_champ,yellow_card_cup,goal_continent,assist_continent,own_goal_continent,...,second_yellow_card_continent,score_goal_cup,score_goal_champ,score_goals_selection,score_assist_cup,score_assist_champ,score_selections_nation,age,position,code_league
count,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,...,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0
mean,0.246103,1612051.0,0.098938,14.754009,0.424215,0.407499,0.857353,0.479105,0.459002,0.008019,...,0.973119,1.131522,15.196973,0.580416,0.923537,12.026372,5.707364,26.045968,1.684098,1.54145
std,1.843374,5881306.0,0.36125,17.824661,0.918985,0.81993,1.992781,2.19087,1.823347,0.095316,...,6.366863,3.131815,27.757299,2.522602,2.612913,22.397119,14.846162,4.564612,1.040544,0.744291
min,0.0,25000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0
25%,0.0,100000.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,1.0,1.0
50%,0.0,300000.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,3.0,0.0,26.0,2.0,2.0
75%,0.0,800000.0,0.0,21.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,19.0,0.0,0.0,14.0,4.0,29.0,2.0,2.0
max,59.0,100000000.0,6.0,142.0,9.0,8.0,26.0,121.0,44.0,2.0,...,195.0,88.0,846.0,85.0,44.0,348.0,176.0,46.0,3.0,2.0


## values

## Model handler

### For each position

Model to grid search in xgboost model. Performed in google colab, stored in this notebook just top hyperparmeters.

In [12]:
def model_for_position(df):
    pos_mod = {}
    pos = list(df.position.unique())
    target = 'price'
    for p in pos:
        tmp = df[df.position == p]
        pos_mod[p] = ModelHandler(X = tmp.drop(target, axis = 1).to_numpy(), Y = tmp[target].to_numpy(), model = 'XGB',scale = True)
    return pos_mod

In [13]:
models = model_for_position(df)

In [14]:
models

{0: <modelHandler.ModelHandler at 0x7f6d4016a790>,
 3: <modelHandler.ModelHandler at 0x7f6d4016aa30>,
 2: <modelHandler.ModelHandler at 0x7f6d4016a160>,
 1: <modelHandler.ModelHandler at 0x7f6d4016ac40>}

In [None]:
for model in models.values():
    model.fit(with_score = False) #Perform grid search

In [30]:
models[0].grid.best_params_

{'colsample_bytree': 0.5,
 'learning_rate': 0.001,
 'max_depth': 24,
 'n_estimators': 1000,
 'subsample': 1.0}

## Manage Models

The data set is grouped by playing postions : goal keeper, defense, med and front and a model is trained for each of these.

In the exposition we will explain futher this division. The code below allows us to train different models for each class automatically.

In [16]:
def model_for_position(df, model):
    pos_mod = {}
    pos_data_train = {}
    pos_data_test = {}
    pos = list(df.position.unique())
    target = 'price'
    for p in pos:
        tmp = df[df.position == p]
        pos_mod[p] = model 
        X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(tmp.drop(target, axis = 1).to_numpy(), tmp[target].to_numpy(), test_size=0.20, random_state=42)
        pos_data_train[p] = (X_train, y_train)
        pos_data_test[p] = (X_test, y_test)

    return pos_mod, pos_data_train, pos_data_test

In [18]:
model = XGBRegressor(**{'colsample_bytree': 0.5, 'learning_rate': 0.001, 'max_depth': 24,  'n_estimators': 1000, 'subsample': 1.0}) #parameters from grid_search
pos_mod, pos_data_train, pos_data_test = model_for_position(df, model)

In [58]:
def train(pos_mod, pos_data_train, pos_data_test):
    for k in pos_mod:
        pos_mod[k].fit(pos_data_train[k][0], pos_data_train[k][1])
        print(f"INFO class {k} r2_score on train: {r2_score(pos_data_train[k][1], pos_mod[k].predict(pos_data_train[k][0]))}")
        print(f"INFO class {k} r2_score on test: {r2_score(pos_data_test[k][1], pos_mod[k].predict(pos_data_test[k][0]))}")

In [59]:
train(pos_mod, pos_data_train, pos_data_test)

INFO class 0 mse on train: 0.6400827423972291
INFO class 0 mse on test: 0.06123617287855354
INFO class 3 mse on train: 0.7240723783322792
INFO class 3 mse on test: 0.14084215687552815
INFO class 2 mse on train: 0.73250647465481
INFO class 2 mse on test: 0.2439396927447519
INFO class 1 mse on train: 0.7081271512177367
INFO class 1 mse on test: 0.08763043865793296


## Model DT regressor for each class

In [61]:
from sklearn.tree import DecisionTreeRegressor

In [62]:
model = DecisionTreeRegressor(criterion='squared_error', splitter='best', max_depth=64, min_samples_split=2, min_samples_leaf=1)

In [63]:
pos_mod, pos_data_train, pos_data_test = model_for_position(df, model)

In [64]:
train(pos_mod, pos_data_train, pos_data_test)

INFO class 0 mse on train: 0.999644998557037
INFO class 0 mse on test: -3.529385895708625
INFO class 3 mse on train: 0.9999976443159838
INFO class 3 mse on test: 0.43067712817053794
INFO class 2 mse on train: 0.9999986335145186
INFO class 2 mse on test: 0.1397921555403735
INFO class 1 mse on train: 0.9999997733950506
INFO class 1 mse on test: -0.8644040147024907


## Test with Polynomial features and SS

In [65]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [66]:
def model_for_position_poly(df, model, n_poly = 2):
    pos_mod = {}
    pos_data_train = {}
    pos_data_test = {}
    pos = list(df.position.unique())
    target = 'price'
    for p in pos:
        tmp = df[df.position == p]
        poly = PolynomialFeatures(n_poly)
        st = StandardScaler()
        pos_mod[p] = model #**models[p].grid.best_params_)
        X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(tmp.drop(target, axis = 1).to_numpy(), tmp[target].to_numpy(), test_size=0.20, random_state=42)
        poly.fit(X_train)
        pos_data_train[p] = (st.fit_transform(poly.transform(X_train)), y_train)
        pos_data_test[p] = (st.transform(poly.transform(X_test)), y_test)

    return pos_mod, pos_data_train, pos_data_test

In [None]:
model = XGBRegressor(**{'colsample_bytree': 0.5, 'learning_rate': 0.001, 'max_depth': 24,  'n_estimators': 1000, 'subsample': 1.0})
pos_mod, pos_data_train, pos_data_test = model_for_position_poly(df, model, n_poly = 2)
train(pos_mod, pos_data_train, pos_data_test)

## CatBooster Regressor Model for each class

In [158]:
from catboost import CatBoostRegressor

In [159]:
def model_for_position(df, model):
    pos_mod = {}
    pos_data_train = {}
    pos_data_test = {}
    pos = list(df.position.unique())
    target = 'price'
    for p in pos:
        tmp = df[df.position == p]
        pos_mod[p] = model 
        if p != 2:
            tmp = tmp.drop(cols_to_drop[p],axis = 1) # try
        X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(tmp.drop(target, axis = 1).to_numpy(), tmp[target].to_numpy(), test_size=0.20, random_state=42)
        pos_data_train[p] = (X_train, y_train)
        pos_data_test[p] = (X_test, y_test)

    return pos_mod, pos_data_train, pos_data_test

def train(pos_mod, pos_data_train, pos_data_test):
    for k in pos_mod:
        pos_mod[k].fit(pos_data_train[k][0], pos_data_train[k][1])
        print(f"INFO class {k} r2_score on train: {r2_score(pos_data_train[k][1], pos_mod[k].predict(pos_data_train[k][0]))}")
        print(f"INFO class {k} r2_score on test: {r2_score(pos_data_test[k][1], pos_mod[k].predict(pos_data_test[k][0]))}")

In [160]:
model = CatBoostRegressor(iterations=10000,
                          learning_rate=1e-3,
                          depth=12, nan_mode = 'Min', verbose = 0, l2_leaf_reg = 3, border_count = 254,  random_strength = 1, max_ctr_complexity = 2) # loss_function = 'rmse',

In [161]:
pos_mod, pos_data_train, pos_data_test = model_for_position(df, model)
train(pos_mod, pos_data_train, pos_data_test)
#error on print, error is r2_score

INFO class 0 mse on train: 0.951875042281766
INFO class 0 mse on test: 0.44174851994972053
INFO class 3 mse on train: 0.9698938412490564
INFO class 3 mse on test: 0.3844712015481895


CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=17]="AzadeganLeague": Cannot convert 'b'AzadeganLeague'' to float