In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor

from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV

In [2]:
players = pd.read_csv("players.csv")
squads = pd.read_csv("squads.csv")

In [3]:
df = players.join(squads.set_index(["squad_name", "year"]), how="left", on=["squad_name", "year"], lsuffix="_player", rsuffix="_squad")

In [4]:
df.describe()

Unnamed: 0,age,value_player,year,games_played,goals,assists,minute_played,goals_conceded,clean_sheets,avg_age,squad_value,number_players
count,42118.0,42118.0,42118.0,42118.0,37417.0,37417.0,42118.0,4701.0,4701.0,42118.0,42118.0,42118.0
mean,24.390142,5631461.0,2015.430695,24.164015,2.973675,2.125157,1730.675151,22.058073,4.928951,25.343775,205265100.0,37.57771
std,4.834412,10908450.0,3.373213,12.595201,4.612372,2.923134,1063.986822,19.439744,5.220464,1.105396,206525200.0,7.883551
min,-1.0,-1.0,2010.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,18400000.0,21.0
25%,20.0,500000.0,2013.0,15.0,0.0,0.0,887.0,5.0,1.0,24.6,71500000.0,33.0
50%,24.0,2000000.0,2015.0,25.0,1.0,1.0,1682.0,18.0,3.0,25.3,123030000.0,36.0
75%,28.0,5500000.0,2018.0,34.0,4.0,3.0,2508.0,36.0,8.0,26.0,254330000.0,41.0
max,43.0,200000000.0,2021.0,75.0,73.0,35.0,6498.0,91.0,32.0,30.1,1200000000.0,88.0


In [5]:
any(df['championship_squad'] != df['championship_player'])

False

Since there are no values where player plays in different championship than squad, we drop the column:

In [6]:
df = players.join(squads.drop('championship', axis=1).set_index(["squad_name", "year"]), how="left", on=["squad_name", "year"])

Define the preprocessing transformations

For numeric:
- Impute values using the median (this may not be the best aproach)
- Scale the data with a StandardScaler

For categorical variables we perform a OneHotEncoding (ignoring unknowns in case there is a malformed call to the model)

Finally, we use CountVectorizer on text variables (player name)

In [7]:
numeric_transformer = Pipeline(
    steps=[
            ("imputer", SimpleImputer(strategy="median", missing_values=-1)),
            ("scaler", StandardScaler()),
          ]
)

# If a category has low frequency on training, label it as infrequent, when running
# the model a category is not found it will be asigned as infrequent or ignored.
categorical_transformer = OneHotEncoder(
    handle_unknown="infrequent_if_exist",
    min_frequency=0.001,
)

categorical_columns = ["squad_name", "role", "championship"]

column_trans = make_column_transformer(
    (CountVectorizer(), "name_player"),
    (categorical_transformer, categorical_columns),
    remainder=numeric_transformer
)

In [8]:
df.role = df.role.astype("object")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42118 entries, 0 to 42117
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name_player     42118 non-null  object 
 1   age             42118 non-null  int64  
 2   role            42118 non-null  object 
 3   value_player    42118 non-null  int64  
 4   squad_name      42118 non-null  object 
 5   year            42118 non-null  int64  
 6   games_played    42118 non-null  int64  
 7   goals           37417 non-null  float64
 8   assists         37417 non-null  float64
 9   minute_played   42118 non-null  int64  
 10  championship    42118 non-null  object 
 11  goals_conceded  4701 non-null   float64
 12  clean_sheets    4701 non-null   float64
 13  avg_age         42118 non-null  float64
 14  squad_value     42118 non-null  int64  
 15  number_players  42118 non-null  int64  
dtypes: float64(5), int64(7), object(4)
memory usage: 5.1+ MB


Simple XGBoost on R2 with a train test split of 8:2

In [9]:
clf = Pipeline(
    steps=[("preprocessor", column_trans), ("classifier", XGBRegressor(objective='reg:squarederror'))]
)
clf

Preliminary results

In [10]:
X = df.drop("value_player", axis=1)
y = df.value_player

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

ValueError: Input X contains NaN.
SimpleImputer does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
len(clf.steps[0][1].get_feature_names_out())

In [None]:
scores = cross_val_score(clf, X, y)
(-scores)

## Parameter tunning

We have the following parameters to adjust:

In [None]:
clf.named_steps["classifier"].get_params()

In [None]:
params = {
    #"booster" : ("gbtree", "gblinear"),
    "eta" : [0.01, 0.1],
    "gamma": [0, 0.1, 0.2],
    "max_depth": [5,6,7,8],
}

# Build new pipeline with cross validation (reusing all the preprocessing, since we won't be touching that
gcv = Pipeline(
    steps=[("preprocessor", clf.named_steps["preprocessor"]),
           ("classifier", GridSearchCV(clf.named_steps["classifier"], params))]
)

In [None]:
gcv.fit(X_train, y_train)

In [None]:
gcv.named_steps["classifier"].best_params_, gcv.named_steps["classifier"].best_score_

In [None]:
gcv.named_steps["classifier"].cv_results_

In [None]:
player_data = dict(X_test.iloc[0,:])

gcv.predict(
pd.DataFrame.from_dict({k: [v] for k, v in player_data.items()})
)[0]

In [None]:
dict(X_test.iloc[0,:])

In [None]:
y_test.iloc[0]

In [None]:
gcv.score(X_test, y_test)