In [62]:
import sqlite3
import pandas as pd
import numpy as np
from pandas.api.types import infer_dtype
from datetime import datetime as dt

pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)

con = sqlite3.connect("../fifa_api_server/fifa.db")
cur = con.cursor()
qry = cur.execute("select * from players where year = '2021';")
df_raw = pd.DataFrame(qry.fetchall(), columns=[i[0] for i in qry.description])

keep_cols = ["sofifa_id", "age", "height_cm", "weight_kg", "nationality", "club_name", "league_name",
             "overall", "potential", "value_eur", "wage_eur", "player_positions", "preferred_foot",
             "international_reputation", "weak_foot", "skill_moves", "work_rate",
             "release_clause_eur", "team_position", "team_jersey_number", "loaned_from",
             "year", "short_name", "dob"]

df = df_raw.copy()[keep_cols]
df["unique_id"] = df.apply(lambda x: str(
    x["sofifa_id"]) + "_" + str(x["year"]), axis=1)
df = df.drop(["sofifa_id"], axis=1)

In [63]:
# FEATURE ENGINEERING

class DataPrep:

    def __init__(self, raw_df):
        self.df = raw_df.copy()
        self.is_processed = False
        self.base_cols = self.df.columns
        self.leading_cols = ["unique_id", "short_name", "overall"]
        self.drop_cols = ["work_rate", "season_start", "loaned_from",
                          "dob", "release_clause_eur", "year"]

    def process_data(self):
        self._convert_floats()
        self._unpack_workrate()
        self._get_all_positions()
        self._get_parent_club_league()
        self._calculate_age_days()
        self._calculate_min_release()
        self._drop_unnecessary()

    def get_metadata(self, df):
        return pd.DataFrame({
            "colname": [i for i in df.columns],
            "count": [len(df[i]) for i in df.columns],
            "nulls": [df[i].isnull().sum() for i in df.columns],
            "dtype": [infer_dtype(df[i]) if i != "unique_id" else "unique_id" for i in df.columns],
            "unique": [len(df[i].unique()) for i in df.columns],
            "source": ["base" if i in self.base_cols else "new" for i in df.columns]
        })

    def _convert_floats(self):
        meta = self.get_metadata(self.df)
        for col in meta[meta["dtype"] == "floating"]["colname"]:
            self.df[col] = self.df[col].astype(int)

    def _unpack_workrate(self):
        self.df["workrate_def"] = self.df["work_rate"].apply(
            lambda x: x[:x.index("/")].strip())
        self.df["workrate_att"] = self.df["work_rate"].apply(
            lambda x: x[x.index("/")+1:].strip())

    def _get_all_positions(self):
        positions = []
        unpackable = [i.replace(",", "").split(" ")
                      for i in self.df["player_positions"].unique()]
        for i in unpackable:
            positions.extend(i)
        for position in list(set(positions)):
            self.df[f"pos_{position}"] = self.df["player_positions"].apply(
                lambda x: 1 if position in x else 0)

    def _get_parent_club_league(self):
        clubs_leagues = self.df[["club_name", "league_name"]].drop_duplicates()
        clubs_leagues = {k: v for k, v in zip(
            clubs_leagues["club_name"], clubs_leagues["league_name"])}

        self.df["club_name_parent"] = self.df.apply(
            lambda x: x["club_name"] if x["loaned_from"] == "NA" else x["loaned_from"], axis=1)
        self.df["league_name_parent"] = self.df.apply(
            lambda x: x["league_name"] if x["loaned_from"] == "NA" else clubs_leagues[x["loaned_from"]], axis=1)
        self.df = self.df.rename(columns={"club_name": "club_name_season",
                                          "league_name": "league_name_season"})

    def _calculate_age_days(self):
        self.df["season_start"] = self.df["year"].apply(
            lambda x: dt(x-1, 10, 1))
        self.df["age"] = self.df.apply(lambda x: (
            x["season_start"] - dt.strptime(x["dob"], "%Y-%m-%d")).days, axis=1)

    def _calculate_min_release(self):
        def get_min_release_value(x):
            if x["has_release_clause"] == 0:
                return int(x["value_eur"])
            return max(int(x["value_eur"]), int(x["release_clause_eur"]))
        self.df["has_release_clause"] = self.df["release_clause_eur"].apply(
            lambda x: 0 if (pd.isnull(x) or x == "nan") else 1)
        self.df["release_value_min"] = self.df.apply(
            lambda x: get_min_release_value(x), axis=1)

    def _drop_unnecessary(self):
        self.df = self.df.drop(self.drop_cols, axis=1)

    def get_processed_data(self):
        if self.is_processed == False:
            self.process_data()
            self.is_processed = True
        return self.df[self.leading_cols + [i for i in self.df.columns if i not in self.leading_cols]]


# get enrichment data for clubs and leagues (2020/21 season)

In [64]:
Prepper = DataPrep(df)
df_use = Prepper.get_processed_data()
df_use.head()

Unnamed: 0,unique_id,short_name,overall,age,height_cm,weight_kg,nationality,club_name_season,league_name_season,potential,value_eur,wage_eur,player_positions,preferred_foot,international_reputation,weak_foot,skill_moves,team_position,team_jersey_number,loaned_from,workrate_def,workrate_att,pos_GK,pos_CB,pos_RWB,pos_RB,pos_CF,pos_CAM,pos_CDM,pos_LWB,pos_ST,pos_RW,pos_LB,pos_CM,pos_LW,pos_RM,pos_LM,club_name_parent,league_name_parent,has_release_clause,release_value_min
0,158023_2021,L. Messi,93,12153,170,72,Argentina,FC Barcelona,Spain Primera Division,93,67500000,560000,"RW, ST, CF",Left,5,4,4,CAM,10,,Medium,Low,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,FC Barcelona,Spain Primera Division,1,138400000
1,20801_2021,Cristiano Ronaldo,92,13022,187,83,Portugal,Juventus,Italian Serie A,92,46000000,220000,"ST, LW",Right,5,4,5,LS,7,,High,Low,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,Juventus,Italian Serie A,1,75900000
2,200389_2021,J. Oblak,91,10129,188,87,Slovenia,Atlético Madrid,Spain Primera Division,93,75000000,125000,GK,Right,3,3,1,GK,13,,Medium,Medium,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Atlético Madrid,Spain Primera Division,1,159400000
3,188545_2021,R. Lewandowski,91,11729,184,80,Poland,FC Bayern München,German 1. Bundesliga,91,80000000,240000,ST,Right,4,4,4,ST,9,,High,Medium,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,FC Bayern München,German 1. Bundesliga,1,132000000
4,190871_2021,Neymar Jr,91,10466,175,68,Brazil,Paris Saint-Germain,French Ligue 1,91,90000000,270000,"LW, CAM",Right,5,5,5,LW,10,,High,Medium,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,Paris Saint-Germain,French Ligue 1,1,166500000


In [65]:
meta = Prepper.get_metadata(df_use)
meta

Unnamed: 0,colname,count,nulls,dtype,unique,source
0,unique_id,18719,0,unique_id,18719,base
1,short_name,18719,0,string,17683,base
2,overall,18719,0,integer,47,base
3,age,18719,0,integer,6208,base
4,height_cm,18719,0,integer,50,base
5,weight_kg,18719,0,integer,56,base
6,nationality,18719,0,string,161,base
7,club_name_season,18719,0,string,681,new
8,league_name_season,18719,0,string,52,new
9,potential,18719,0,integer,48,base
