In [10]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.base import clone
from pickle import dump

In [11]:
## Load the dataset
df = pd.read_csv('../data/dataset/dataset.csv')
df.head()

Unnamed: 0,1st_team,2nd_team,year,net_score,GK_height_cm_1st,GK_weight_kg_1st,GK_age_1st,GK_overall_1st,GK_potential_1st,GK_attacking_crossing_1st,...,RAM_mentality_vision_2nd,RAM_mentality_penalties_2nd,RAM_defending_marking_awareness_2nd,RAM_defending_standing_tackle_2nd,RAM_defending_sliding_tackle_2nd,RAM_goalkeeping_diving_2nd,RAM_goalkeeping_handling_2nd,RAM_goalkeeping_kicking_2nd,RAM_goalkeeping_positioning_2nd,RAM_goalkeeping_reflexes_2nd
0,Norway,Poland,2014,-3.0,192.0,78.0,23.0,72.0,75.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Australia,Ecuador,2014,-1.0,182.0,84.0,22.0,73.0,79.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Austria,Uruguay,2014,0.0,194.0,85.0,30.0,64.0,64.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Czech Republic,Norway,2014,0.0,196.0,90.0,32.0,85.0,85.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,England,Denmark,2014,1.0,196.0,91.0,27.0,82.0,83.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
df.tail()

Unnamed: 0,1st_team,2nd_team,year,net_score,GK_height_cm_1st,GK_weight_kg_1st,GK_age_1st,GK_overall_1st,GK_potential_1st,GK_attacking_crossing_1st,...,RAM_mentality_vision_2nd,RAM_mentality_penalties_2nd,RAM_defending_marking_awareness_2nd,RAM_defending_standing_tackle_2nd,RAM_defending_sliding_tackle_2nd,RAM_goalkeeping_diving_2nd,RAM_goalkeeping_handling_2nd,RAM_goalkeeping_kicking_2nd,RAM_goalkeeping_positioning_2nd,RAM_goalkeeping_reflexes_2nd
1116,Canada,Uruguay,2022,-2.0,195.0,84.0,34.0,75.0,75.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1117,Ecuador,Japan,2022,0.0,195.0,81.0,35.0,74.0,74.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1118,Iran,Senegal,2022,0.0,194.0,85.0,29.0,74.0,75.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1119,Saudi Arabia,United States,2022,0.0,185.0,79.0,30.0,71.0,71.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1120,Portugal,Spain,2022,-1.0,190.0,84.0,34.0,82.0,82.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
## Drop the columns that are not needed
df.drop("year", axis=1, inplace=True)

In [14]:
## Split the dataset into train and test
from sklearn.model_selection import train_test_split

X = df.drop('net_score', axis=1)
y = df['net_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [15]:
le = preprocessing.LabelEncoder()

countries_set = set(list(X["2nd_team"]) + list(X["1st_team"]))

countries_encoder = le.fit(list(countries_set))
dump(countries_encoder, open('../models/encoders/countries_encoder.pkl', 'wb'))
X_train["2nd_team"] = countries_encoder.transform(X_train["2nd_team"])
X_train["1st_team"] = countries_encoder.transform(X_train["1st_team"])

X_test["2nd_team"] = countries_encoder.transform(X_test["2nd_team"])
X_test["1st_team"] = countries_encoder.transform(X_test["1st_team"])

In [8]:
## Apply PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pca = PCA(n_components=0.9)
scaler = StandardScaler()



In [12]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

estimators_dict = {
RandomForestRegressor(): {"est__n_estimators": [100,300,500, 1000], "est__max_depth": [None, 3, 5, 7], "est__oob_score": [True], "est__min_samples_split":[0.05, 0.15, 0.3, 0.5] },
AdaBoostRegressor(): {"est__n_estimators": [100,300,500, 1000], "est__learning_rate": [0.01, 0.05, 0.1, 0.5, 1], "est__loss": ["linear", "square", "exponential"] },
GradientBoostingRegressor(): {"est__n_estimators": [100,300,500, 1000], "est__learning_rate": [0.01, 0.05, 0.1, 0.5, 1], "est__loss": ["squared_error", "absolute_error", "huber", "quantile"], "est__max_depth": [None, 3, 5, 7] },
Ridge(): {"est__alpha": [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100], "est__solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"] },
Lasso(): {"est__alpha": [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100], "est__max_iter": [1000, 2000, 5000, 10000]},
SVR(): {"est__kernel": ["linear", "poly", "rbf", "sigmoid"], "est__C": [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100], "est__gamma": ["scale", "auto"] },
KNeighborsRegressor(): {"est__n_neighbors": [3, 5, 7, 9, 11, 13, 15, 17, 19, 21], "est__weights": ["uniform", "distance"], "est__algorithm": ["auto", "ball_tree", "kd_tree", "brute"], "est__leaf_size": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}

}

In [13]:
from sklearn.model_selection import GridSearchCV

def get_best_estimator(estimator, params, cv):
    grid = GridSearchCV(estimator, params, cv=cv, scoring="neg_mean_squared_error", n_jobs=-1)
    grid.fit(X_train, y_train)
    return grid.best_estimator_


In [14]:
### Get tunned pipeline
from sklearn.model_selection import cross_validate, KFold

scorers = ["neg_mean_absolute_error", "r2", "neg_mean_absolute_percentage_error"]
train_mae = "train_neg_mean_absolute_error"
test_mae = "test_neg_mean_absolute_error"

train_r2 = "train_r2"
test_r2 = "test_r2"

train_mape = "train_neg_mean_absolute_percentage_error"
test_mape = "test_neg_mean_absolute_percentage_error"
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# valid_results = pd.DataFrame(columns=["estimator", "params", "train_MAE", "test_MAE", "train_r2", 
#                                       "test_r2", "train_MAPE", "test_MAPE"])    

                                      
for est, params in estimators_dict.items():
    print("Getting best estimator for {}".format(est))
    pipeline = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('est', est)])
    tunned_estimator = get_best_estimator(pipeline, params, cv)

    
    scores = cross_validate(tunned_estimator, X_train, y_train, cv=cv, return_train_score=True, scoring=scorers, n_jobs=-1)
    train_mae_score = np.mean(scores[train_mae]*-1)
    test_mae_score = np.mean(scores[test_mae]*-1)
    train_r2_score = np.mean(scores[train_r2])
    test_r2_score = np.mean(scores[test_r2])
    train_mape_score = np.mean(scores[train_mape]*-1)
    test_mape_score = np.mean(scores[test_mape]*-1)
    valid_results = valid_results.append({"estimator": tunned_estimator, "params": tunned_estimator.get_params()["est"],
                                            "train_MAE": train_mae_score, "test_MAE": test_mae_score,
                                            "train_r2": train_r2_score, "test_r2": test_r2_score,
                                            "train_MAPE": train_mape_score, "test_MAPE": test_mape_score}, ignore_index=True)

valid_results

Getting best estimator for Lasso()


  valid_results = valid_results.append({"estimator": tunned_estimator, "params": tunned_estimator.get_params()["est"],


Getting best estimator for SVR()


  valid_results = valid_results.append({"estimator": tunned_estimator, "params": tunned_estimator.get_params()["est"],


Getting best estimator for KNeighborsRegressor()


  valid_results = valid_results.append({"estimator": tunned_estimator, "params": tunned_estimator.get_params()["est"],


Unnamed: 0,estimator,params,train_MAE,test_MAE,train_r2,test_r2,train_MAPE,test_MAPE
0,"(StandardScaler(), PCA(n_components=0.9), (Dec...","(DecisionTreeRegressor(max_depth=5, max_featur...",1.147661,1.250366,0.28124,0.13087,603413400000000.0,678858000000000.0
1,"(StandardScaler(), PCA(n_components=0.9), (Dec...","(DecisionTreeRegressor(max_depth=3, random_sta...",1.168691,1.248225,0.269991,0.137606,594638200000000.0,650158100000000.0
2,"(StandardScaler(), PCA(n_components=0.9), ([De...",([DecisionTreeRegressor(criterion='friedman_ms...,1.051752,1.259501,0.381813,0.1219,514699700000000.0,659086300000000.0
3,"(StandardScaler(), PCA(n_components=0.9), Ridg...","Ridge(alpha=100, solver='saga')",1.195415,1.238859,0.220627,0.145267,730346200000000.0,756377500000000.0
4,"(StandardScaler(), PCA(n_components=0.9), Lass...",Lasso(alpha=0.1),1.194883,1.235084,0.218293,0.150655,696579700000000.0,722861600000000.0
5,"(StandardScaler(), PCA(n_components=0.9), SVR(...","SVR(C=100, kernel='linear')",1.188232,1.24488,0.207423,0.143653,638947000000000.0,707084700000000.0
6,"(StandardScaler(), PCA(n_components=0.9), KNei...","KNeighborsRegressor(algorithm='ball_tree', lea...",1.26329,1.334465,0.122025,0.01683,461356800000000.0,502869800000000.0


In [98]:
## Retrain Estimators and save
# X["1st_team"] = countries_encoder.transform(X["1st_team"])
# X["2nd_team"] = countries_encoder.transform(X["2nd_team"])

for model in valid_results["estimator"].values:
    # params = valid_results[valid_results["estimator"] == model]["params"][0]
    model = clone(model)
    print(model)
    model.fit(X, y)
    dump(model, open("../models/regression/{}.pkl".format(model["est"].__class__.__name__), "wb"))
    


Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=0.9)),
                ('est',
                 RandomForestRegressor(max_depth=5, min_samples_split=0.15,
                                       n_estimators=300, oob_score=True))])
Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=0.9)),
                ('est',
                 AdaBoostRegressor(learning_rate=0.01, loss='exponential',
                                   n_estimators=500))])
Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=0.9)),
                ('est',
                 GradientBoostingRegressor(learning_rate=0.01, loss='huber',
                                           n_estimators=300))])
Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=0.9)),
                ('est', Ridge(alpha=100, solver='saga'))])
Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=0.9)),
                ('est', Lasso(alpha=0.1))])
Pipeline(st

In [73]:
mask = (df["year"] == 2022) & (df["1st_team"] == "Argentina")
df[mask]


Unnamed: 0,1st_team,2nd_team,year,net_score,GK_height_cm_1st,GK_weight_kg_1st,GK_age_1st,GK_overall_1st,GK_potential_1st,GK_attacking_crossing_1st,...,RAM_mentality_vision_2nd,RAM_mentality_penalties_2nd,RAM_defending_marking_awareness_2nd,RAM_defending_standing_tackle_2nd,RAM_defending_sliding_tackle_2nd,RAM_goalkeeping_diving_2nd,RAM_goalkeeping_handling_2nd,RAM_goalkeeping_kicking_2nd,RAM_goalkeeping_positioning_2nd,RAM_goalkeeping_reflexes_2nd


In [45]:
test_row = pd.DataFrame(columns=X.columns)

test_row["2nd_team"] = ["Saudi Arabia"]
test_row["1st_team"] = "Argentina"

for col in df[mask].drop(columns=["1st_team", "2nd_team", "year", "net_score"]).columns:
    if col.find("_1st") != -1:
        test_row[col] = df[mask][col].values[0]

In [46]:
test_row

Unnamed: 0,1st_team,2nd_team,GK_height_cm_1st,GK_weight_kg_1st,GK_age_1st,GK_overall_1st,GK_potential_1st,GK_attacking_crossing_1st,GK_attacking_finishing_1st,GK_attacking_heading_accuracy_1st,...,RAM_mentality_vision_2nd,RAM_mentality_penalties_2nd,RAM_defending_marking_awareness_2nd,RAM_defending_standing_tackle_2nd,RAM_defending_sliding_tackle_2nd,RAM_goalkeeping_diving_2nd,RAM_goalkeeping_handling_2nd,RAM_goalkeeping_kicking_2nd,RAM_goalkeeping_positioning_2nd,RAM_goalkeeping_reflexes_2nd
0,Saudi Arabia,Argentina,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
mask_saudi = (df["year"] == 2022) & (df["1st_team"] == "Saudi Arabia") 


for col in df[mask_saudi].drop(columns=["1st_team", "2nd_team", "year", "net_score"]).columns:
    if col.find("_1st") != -1:
        test_row[col] = df[mask_saudi][col].values[0]

In [49]:
test_row["1st_team"] = countries_encoder.transform(test_row["1st_team"])
test_row["2nd_team"] = countries_encoder.transform(test_row["2nd_team"])

In [53]:
valid_results.sort_values("test_MAE")["estimator"][0].predict(test_row)

array([-0.70898323])

In [67]:
flags_dict = {
    'Argentina': '🇦🇷',
    'Portugal': '🇵🇹',
    'Ecuador': '🇪🇨',
    'Netherlands': '🇳🇱',
    'Brazil': '🇧🇷',
    'England': '🏴󠁧󠁢󠁥󠁮󠁧󠁿',
    'Iran': '🇮🇷',
    'United States': '🇺🇸',
    'Wales': '🏴󠁧󠁢󠁷󠁬󠁳󠁿',
    'Mexico': '🇲🇽',
    'Poland': '🇵🇱',
    'France': '🇫🇷',
    'Australia': '🇦🇺',
    'Denmark': '🇩🇰',
    'Tunisia': '🇹🇳',
    'Costa Rica': '🇨🇷',
    'Germany': '🇩🇪',
    'Japan': '🇯🇵',
    'Korea Republic': '🇰🇷',
    'Croatia': '🇭🇷',
    'Canada': '🇨🇦',
    'Morocco': '🇲🇦',
    'Serbia': '🇷🇸',
    'Switzerland': '🇨🇭',
    'Cameroon': '🇨🇲',
    'Ghana': '🇬🇭',
    'Uruguay': '🇺🇾',
    'Saudi Arabia': '🇸🇦',
    'Senegal': '🇸🇳',
    'Spain': '🇪🇸',
    'Qatar': '🇶🇦',
    'Belgium': '🇧🇪',
}

In [68]:
for team in flags_dict.keys():
    print(f"----------{team}----------")
    mask =  (df["year"] == 2022) & (df["1st_team"] == team)
    first = df[mask]
    mask =  (df["year"] == 2022) & (df["2nd_team"] == team)
    second = df[mask]
    print(first.shape)
    print(second.shape)


----------Argentina----------
(0, 2208)
(1, 2208)
----------Portugal----------
(2, 2208)
(2, 2208)
----------Ecuador----------
(3, 2208)
(2, 2208)
----------Netherlands----------
(5, 2208)
(3, 2208)
----------Brazil----------
(2, 2208)
(2, 2208)
----------England----------
(2, 2208)
(1, 2208)
----------Iran----------
(2, 2208)
(0, 2208)
----------United States----------
(2, 2208)
(5, 2208)
----------Wales----------
(3, 2208)
(3, 2208)
----------Mexico----------
(4, 2208)
(0, 2208)
----------Poland----------
(3, 2208)
(3, 2208)
----------France----------
(2, 2208)
(2, 2208)
----------Australia----------
(1, 2208)
(1, 2208)
----------Denmark----------
(3, 2208)
(3, 2208)
----------Tunisia----------
(0, 2208)
(2, 2208)
----------Costa Rica----------
(2, 2208)
(1, 2208)
----------Germany----------
(1, 2208)
(2, 2208)
----------Japan----------
(5, 2208)
(2, 2208)
----------Korea Republic----------
(0, 2208)
(0, 2208)
----------Croatia----------
(2, 2208)
(2, 2208)
----------Canada----------

In [64]:
players = pd.read_csv("../data/players_data/players_all_prepared.csv")

In [70]:
players[players["year"] == 2023]["nationality_name"].unique()

array(['Qatar', 'Ecuador', 'Senegal', 'Netherlands', 'England', 'Iran',
       'United States', 'Wales', 'Argentina', 'Saudi Arabia', 'Mexico',
       'Poland', 'France', 'Australia', 'Denmark', 'Tunisia',
       'Costa Rica', 'Germany', 'Spain', 'Japan', 'Canada', 'Belgium',
       'Morocco', 'Croatia', 'Brazil', 'Serbia', 'Cameroon',
       'Switzerland', 'Portugal', 'Ghana', 'Uruguay', 'Korea Republic'],
      dtype=object)