In [41]:
from modelHandler import ModelHandler
import pandas as pd  
import numpy as np 
import sklearn as sk
from sklearn.metrics import mean_squared_error

In [42]:
df = pd.read_csv('../../data/Correlation01.csv', index_col = 0)

In [43]:
df = df.drop('age', axis = 1)

## Group by Position

Partiendo de la hipotesis que goles y asistencias tienen más peso para delanteros y mediocentros que para defensas y obviamente que para porteros, la división de estos grupos es fundamental.

Creemos que es interesante realizar un modelo para cada posición del campo de futbol. Elegimos realizar una primera aproximación con modelos para:
- Portero
- Defensa i mediocentro defensivo
- Mediocentros y bandas
- Delanteros y extremos

Otra configuración podria ser juntar mediocentros y delanteros. 

In [44]:
### group positon
tre = { 'Goalkeeper' : 0,
    'DefensiveMidfield': 1,'Defender' : 1,
 'LeftMidfield':2 , 'CentralMidfield':2, 'Midfielder':2, 'AttackingMidfield':2,'RightMidfield' : 2,
   'SecondStriker':3, 'Forward':3,'LeftWinger':3,'RightWinger' : 3}

df['position'] = df['position'].apply(lambda x: tre[x])

In [45]:
df.describe()

Unnamed: 0,price,yellow_card_champ,yellow_card_cup,goal_continent,assist_continent,yellow_card_continent,score_goal_cup,score_goal_champ,score_goals_selection,score_assist_cup,score_assist_champ,score_selections_nation,league,position
count,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0,8854.0
mean,1612051.0,14.754009,0.857353,0.479105,0.459002,0.602778,0.863564,11.556472,0.580416,0.703185,9.018071,5.707364,37.212333,1.684098
std,5881306.0,17.824661,1.992781,2.19087,1.823347,1.887254,2.187195,18.585831,2.522602,1.857391,14.890667,14.846162,18.112519,1.040544
min,25000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,100000.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,1.0
50%,300000.0,8.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,0.0,38.0,2.0
75%,800000.0,21.0,1.0,0.0,0.0,0.0,1.0,15.0,0.0,0.0,12.0,4.0,50.0,2.0
max,100000000.0,142.0,26.0,121.0,44.0,28.0,44.0,423.0,85.0,23.0,174.0,176.0,70.0,3.0


## values

## Model handler

# For each position

In [12]:
def model_for_position(df):
    pos_mod = {}
    pos = list(df.position.unique())
    target = 'price'
    for p in pos:
        tmp = df[df.position == p]
        pos_mod[p] = ModelHandler(X = tmp.drop(target, axis = 1).to_numpy(), Y = tmp[target].to_numpy(), model = 'XGB',scale = True)
    return pos_mod

In [13]:
models = model_for_position(df)

In [14]:
models

{0: <modelHandler.ModelHandler at 0x7f6d4016a790>,
 3: <modelHandler.ModelHandler at 0x7f6d4016aa30>,
 2: <modelHandler.ModelHandler at 0x7f6d4016a160>,
 1: <modelHandler.ModelHandler at 0x7f6d4016ac40>}

In [None]:
for model in models.values():
    model.fit(with_score = False) #Perform grid search

In [30]:
models[0].grid.best_params_

{'colsample_bytree': 0.5,
 'learning_rate': 0.001,
 'max_depth': 24,
 'n_estimators': 1000,
 'subsample': 1.0}

### Manage Models

In [16]:
from xgboost import XGBRegressor
def model_for_position(df, model):
    pos_mod = {}
    pos_data_train = {}
    pos_data_test = {}
    pos = list(df.position.unique())
    target = 'price'
    for p in pos:
        tmp = df[df.position == p]
        # If grid seach not loaded
        # models[p].grid.best_params_['n_estimators'] = 1000
        # models[p].grid.best_params_['max_depth'] = 64
        pos_mod[p] = model #**models[p].grid.best_params_)
        X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(tmp.drop(target, axis = 1).to_numpy(), tmp[target].to_numpy(), test_size=0.20, random_state=42)
        pos_data_train[p] = (X_train, y_train)
        pos_data_test[p] = (X_test, y_test)

    return pos_mod, pos_data_train, pos_data_test

In [18]:
model = XGBRegressor(**{'colsample_bytree': 0.5, 'learning_rate': 0.001, 'max_depth': 24,  'n_estimators': 1000, 'subsample': 1.0})
pos_mod, pos_data_train, pos_data_test = model_for_position(df, model)

In [12]:
def train(pos_mod, pos_data_train, pos_data_test):
    for k in pos_mod:
        pos_mod[k].fit(pos_data_train[k][0], pos_data_train[k][1])
        print(f"INFO class {k} mse on train: {mean_squared_error(pos_data_train[k][1], pos_mod[k].predict(pos_data_train[k][0]))}")
        print(f"INFO class {k} mse on test: {mean_squared_error(pos_data_test[k][1], pos_mod[k].predict(pos_data_test[k][0]))}")

In [13]:
train(pos_mod, pos_data_train, pos_data_test)

INFO class 0 mse on train: 9965957257941.889
INFO class 0 mse on test: 6754333251486.802
INFO class 3 mse on train: 12028063849628.305
INFO class 3 mse on test: 49681830846759.44
INFO class 2 mse on train: 9884622560645.37
INFO class 2 mse on test: 56099944494044.32
INFO class 1 mse on train: 8249697824338.068
INFO class 1 mse on test: 8106842116507.781


In [14]:
for t,p in zip(pos_data_test[0][1], pos_mod[0].predict(pos_data_test[0][0])):
    print(t,p, t-p)

50000.0 196756.34 -146756.34375
600000.0 354469.53 245530.46875
500000.0 133412.23 366587.765625
25000.0 170705.67 -145705.671875
300000.0 319298.5 -19298.5
250000.0 643993.94 -393993.9375
250000.0 662337.5 -412337.5
250000.0 315584.84 -65584.84375
300000.0 684464.8 -384464.8125
75000.0 162048.83 -87048.828125
700000.0 1906578.8 -1206578.75
300000.0 372779.0 -72779.0
350000.0 865861.3 -515861.3125
150000.0 204899.7 -54899.703125
50000.0 679557.4 -629557.375
75000.0 894564.1 -819564.125
50000.0 139824.4 -89824.40625
100000.0 142497.95 -42497.953125
175000.0 243333.0 -68333.0
1000000.0 670227.4 329772.625
150000.0 481006.75 -331006.75
700000.0 283440.47 416559.53125
400000.0 217128.31 182871.6875
600000.0 438914.0 161086.0
100000.0 126659.03 -26659.03125
500000.0 1467166.6 -967166.625
150000.0 103349.86 46650.140625
50000.0 417618.34 -367618.34375
3000000.0 3554537.8 -554537.75
50000.0 176012.55 -126012.546875
800000.0 965455.2 -165455.1875
300000.0 494241.62 -194241.625
1000000.0 105131

### Test with DT

In [15]:
from sklearn.tree import DecisionTreeRegressor

In [19]:
model = DecisionTreeRegressor(criterion='squared_error', splitter='best', max_depth=64, min_samples_split=2, min_samples_leaf=1)

In [20]:
pos_mod, pos_data_train, pos_data_test = model_for_position(df, model)

In [21]:
train(pos_mod, pos_data_train, pos_data_test)

INFO class 0 mse on train: 7130723021.140611
INFO class 0 mse on test: 43124108570772.06
INFO class 3 mse on train: 95587519.47040497
INFO class 3 mse on test: 30743319379543.094
INFO class 2 mse on train: 46201661.00997635
INFO class 2 mse on test: 54138425809352.516
INFO class 1 mse on train: 5899406.68824164
INFO class 1 mse on test: 15924212569669.184


### Test with

In [33]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [39]:
def model_for_position_poly(df, model, n_poly = 2):
    pos_mod = {}
    pos_data_train = {}
    pos_data_test = {}
    pos = list(df.position.unique())
    target = 'price'
    for p in pos:
        tmp = df[df.position == p]
        poly = PolynomialFeatures(n_poly)
        st = StandardScaler()
        pos_mod[p] = model #**models[p].grid.best_params_)
        X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(tmp.drop(target, axis = 1).to_numpy(), tmp[target].to_numpy(), test_size=0.20, random_state=42)
        poly.fit(X_train)
        pos_data_train[p] = (st.fit_transform(poly.transform(X_train)), y_train)
        pos_data_test[p] = (st.transform(poly.transform(X_test)), y_test)

    return pos_mod, pos_data_train, pos_data_test

In [40]:
model = XGBRegressor(**{'colsample_bytree': 0.5, 'learning_rate': 0.001, 'max_depth': 24,  'n_estimators': 1000, 'subsample': 1.0})
pos_mod, pos_data_train, pos_data_test = model_for_position_poly(df, model, n_poly = 2)
train(pos_mod, pos_data_train, pos_data_test)

INFO class 0 mse on train: 6491549806388.867
INFO class 0 mse on test: 7558064324654.384
INFO class 3 mse on train: 11041690005134.723
INFO class 3 mse on test: 37739925580903.84
INFO class 2 mse on train: 8698130856623.22
INFO class 2 mse on test: 52233263774268.42
INFO class 1 mse on train: 7280109291773.837
INFO class 1 mse on test: 7649572722679.017
