In [2]:
import sqlite3
import pandas as pd
import numpy as np
from pandas.api.types import infer_dtype
from datetime import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 200)

In [3]:
def get_data():
    keep_cols = [
        # NOT AVAILABLE AT COLLECTION
        "overall",
        # MANDATORY AT COLLECTION
        "dob", "nationality", "value_eur", "work_rate", "height_cm", "weight_kg", "club_name",
        # OPTIONAL AT COLLECTION - GENERATED THROUGH INGESTION
        "sofifa_id", "release_clause_eur", "loaned_from", "short_name", "preferred_foot", "weak_foot",
        "player_positions", "international_reputation", "skill_moves", "team_jersey_number"
    ]
    select_cols = ", ".join(keep_cols)
    with sqlite3.connect("../fifa_api_server/fifa.db") as con:
        cur = con.cursor()
        qry = cur.execute(
            f"select {select_cols} from players where year = '2021';")
        df = pd.DataFrame(qry.fetchall(), columns=[
                          i[0] for i in qry.description])
    return df


class Transformer:

    def __init__(self, clubs_leagues, positions):
        self.clubs_leagues = clubs_leagues
        self.positions = positions
        self.leading_cols = ["short_name", "overall"]
        self.drop_cols = ["work_rate", "season_start", "loaned_from",
                          "dob", "release_clause_eur", "player_positions"]

    # INTERNAL METHODS - prepare a

    def get_processed_data(self, raw_df):
        self.df = raw_df.copy()
        self._populate_missing()
        self._get_league_names()
        self._convert_floats()
        self._unpack_workrate()
        self._get_all_positions()
        self._get_parent_club_league()
        self._calculate_age_days()
        self._calculate_min_release()
        self._drop_unnecessary()
        self._set_index()
        return self.df

    def _populate_missing(self):
        pass

    def _convert_floats(self):
        meta = self.get_metadata(self.df)
        for col in meta[meta["dtype"] == "floating"]["colname"]:
            self.df[col] = self.df[col].astype(int)

    def _get_league_names(self):
        self.df["league_name"] = self.df["club_name"].apply(
            lambda x: self.clubs_leagues[x])

    def _unpack_workrate(self):
        self.df["workrate_def"] = self.df["work_rate"].apply(
            lambda x: x[:x.index("/")].strip())
        self.df["workrate_att"] = self.df["work_rate"].apply(
            lambda x: x[x.index("/")+1:].strip())

    def _get_all_positions(self):

        for position in self.positions:
            self.df[f"pos_{position}"] = self.df["player_positions"].apply(
                lambda x: 1 if position in x else 0)

    def _get_parent_club_league(self):

        self.df["club_name_parent"] = self.df.apply(
            lambda x: x["club_name"] if x["loaned_from"] == "NA" else x["loaned_from"], axis=1)
        self.df["league_name_parent"] = self.df.apply(
            lambda x: x["league_name"] if x["loaned_from"] == "NA" else self.clubs_leagues[x["loaned_from"]], axis=1)
        self.df = self.df.rename(columns={"club_name": "club_name_season",
                                          "league_name": "league_name_season"})

    def _calculate_age_days(self):
        self.df["season_start"] = dt(2020, 10, 1)
        self.df["age"] = self.df.apply(lambda x: (
            x["season_start"] - dt.strptime(x["dob"], "%Y-%m-%d")).days, axis=1)

    def _calculate_min_release(self):
        def get_min_release_value(x):
            if x["has_release_clause"] == 0:
                return int(x["value_eur"])
            return max(int(x["value_eur"]), int(x["release_clause_eur"]))
        self.df["has_release_clause"] = self.df["release_clause_eur"].apply(
            lambda x: 0 if (pd.isnull(x) or x == "nan") else 1)
        self.df["release_value_min"] = self.df.apply(
            lambda x: get_min_release_value(x), axis=1)

    def _drop_unnecessary(self):
        self.df = self.df.drop(self.drop_cols, axis=1)

    def _set_index(self):
        self.df = self.df.set_index("sofifa_id")

    def get_metadata(self, df):
        return pd.DataFrame({
            "colname": [i for i in df.columns],
            "count": [len(df[i]) for i in df.columns],
            "nulls": [df[i].isnull().sum() for i in df.columns],
            "dtype": [infer_dtype(df[i]) for i in df.columns],
            "unique": [len(df[i].unique()) for i in df.columns],
        })

    def get_transformations(self, df):
        meta = self.get_metadata(df)
        special_cols = {"short_name": "drop", "club_name_season": "drop",
                        "club_name_parent": "drop", "overall": "target"}
        mapping = {"integer": "scl", "string": "ohe"}
        meta["action"] = meta.apply(
            lambda x: special_cols[x["colname"]] if x["colname"] in
            list(special_cols.keys()) else mapping[x["dtype"]], axis=1)
        transformations = {
            "cols_drp": meta[meta["action"] == "drop"]["colname"].values,
            "cols_scl": meta[meta["action"] == "scl"]["colname"].values,
            "cols_ohe": meta[meta["action"] == "ohe"]["colname"].values,
            "target": meta[meta["action"] == "target"]["colname"].values
        }
        return transformations

    def apply_transformations(self, df, transformations, SCL, OHE):
        X_scl = pd.DataFrame(
            SCL.transform(df[transformations["cols_scl"]]),
            columns=[i.replace(" ", "_").strip() for i in SCL.get_feature_names_out()], index=df.index)
        X_ohe = pd.DataFrame(
            OHE.transform(df[transformations["cols_ohe"]]).toarray(),
            columns=[i.replace(" ", "_").strip() + "_scl" for i in OHE.get_feature_names_out()], index=df.index)
        X_out = pd.concat([X_scl, X_ohe], axis=1)
        return X_out

# get enrichment data for clubs and leagues (2020/21 season)

In [24]:
player_data = get_data()
player_data["value_eur"] = np.random.randint(0, 100, len(player_data))
player_data["release_clause_eur"] = np.random.randint(0, 100, len(player_data))
position_data = pd.read_csv("positions.csv")
club_data = pd.read_csv("club_data.csv")
clubs_leagues = {k: v for k, v in zip(
    club_data["club_name"], club_data["league_name"])}
tf_args = {"clubs_leagues": clubs_leagues,
           "positions": position_data["position"].values}
df1, df2 = train_test_split(player_data, test_size=0.001)

holdout_df = df2.drop(["overall"], axis=1)
answers_df = df2["overall"]

In [25]:
TF = Transformer(**tf_args)
Objs = {"transformations": "", "SCL": "", "OHE": ""}

In [26]:
df = TF.get_processed_data(df1)

Objs["transformations"] = TF.get_transformations(df)

X = df.drop(Objs["transformations"]["cols_drp"], axis=1)
y = df[Objs["transformations"]["target"]]
X_train, X_test, y_train, y_test = train_test_split(X, y)

Objs["SCL"] = StandardScaler().fit(
    X_train[Objs["transformations"]["cols_scl"]])
Objs["OHE"] = OneHotEncoder(handle_unknown="ignore").fit(
    X_train[Objs["transformations"]["cols_ohe"]])

X_train_use = TF.apply_transformations(X_train, **Objs)
X_test_use = TF.apply_transformations(X_test, **Objs)

Model = RandomForestRegressor(n_estimators=10).fit(X_train_use, y_train)

  Model = RandomForestRegressor(n_estimators=10).fit(X_train_use, y_train)


In [27]:
TF_New = Transformer(**tf_args)

df_processed = TF_New.get_processed_data(holdout_df).drop(
    Objs["transformations"]["cols_drp"], axis=1)
df_transformed = TF_New.apply_transformations(df_processed, **Objs)
holdout_score = Model.score(df_transformed, answers_df)

print(holdout_score)

0.22746794871794818


In [20]:
feat_imp = pd.DataFrame({
    "feature": Model.feature_names_in_,
    "importance": Model.feature_importances_*100
}).sort_values("importance", ascending=False)

feat_imp

Unnamed: 0,feature,importance
24,release_value_min,79.875138
22,age,14.998087
23,has_release_clause,1.507282
5,skill_moves,0.391262
6,team_jersey_number,0.297523
2,weight_kg,0.277028
0,value_eur,0.236092
1,height_cm,0.212142
4,international_reputation,0.168456
198,league_name_season_English_League_Two_scl,0.132603
