In [122]:
import sqlite3
import pandas as pd
import numpy as np
from pandas.api.types import infer_dtype
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)

con = sqlite3.connect("../fifa_api_server/fifa.db")
cur = con.cursor()
qry = cur.execute("select * from players;")
df_raw = pd.DataFrame(qry.fetchall(), columns=[i[0] for i in qry.description])

keep_cols = ["sofifa_id", "age", "height_cm", "weight_kg", "nationality", "club_name", "league_name", "league_rank", "overall", "potential", "value_eur", "wage_eur",
             "player_positions", "preferred_foot", "international_reputation", "weak_foot", "skill_moves", "work_rate", "body_type", "release_clause_eur",
             "team_position", "team_jersey_number", "loaned_from", "nation_position", "year",
             "short_name", "dob", "has_release_clause"]

df = df_raw[keep_cols]
df["unique_id"] = df.apply(lambda x: str(
    x["sofifa_id"]) + "_" + str(x["year"]), axis=1)
df = df.drop(["sofifa_id"], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["unique_id"] = df.apply(lambda x: str(


In [123]:
def get_metadata(df):
    return pd.DataFrame({
        "colname": [i for i in df.columns],
        "count": [len(df[i]) for i in df.columns],
        "nulls": [df[i].isnull().sum() for i in df.columns],
        "dtype": [infer_dtype(df[i]) if i != "unique_id" else "unique_id" for i in df.columns],
        "unique": [len(df[i].unique()) for i in df.columns]
    })


meta = get_metadata(df)

meta.sort_values(["dtype", "colname"])

Unnamed: 0,colname,count,nulls,dtype,unique
6,league_rank,121272,0,floating,5
20,team_jersey_number,121272,0,floating,99
0,age,121272,0,integer,33
1,height_cm,121272,0,integer,55
13,international_reputation,121272,0,integer,5
7,overall,121272,0,integer,55
8,potential,121272,0,integer,54
15,skill_moves,121272,0,integer,5
9,value_eur,121272,0,integer,272
10,wage_eur,121272,0,integer,155


In [153]:
# FEATURE ENGINEERING

from datetime import datetime as dt
df_use = df.copy()

for col in meta[meta["dtype"] == "floating"]["colname"]:
    df_use[col] = df_use[col].astype(int)

df_use["workrate_def"] = df_use["work_rate"].apply(
    lambda x: x[:x.index("/")].strip())
df_use["workrate_att"] = df_use["work_rate"].apply(
    lambda x: x[x.index("/")+1:].strip())


def get_all_positions(df, column, positions=[]):
    unpackable = [i.replace(",", "").split(" ") for i in df[column].unique()]
    for i in unpackable:
        positions.extend(i)
    for position in list(set(positions)):
        df[f"pos_{position}"] = df[column].apply(
            lambda x: 1 if position in x else 0)
    return df


df_use = get_all_positions(df=df_use, column="player_positions")

df_use["season_start"] = df_use["year"].apply(lambda x: dt(x-1, 10, 1))
df_use["age"] = df_use.apply(lambda x: (
    x["season_start"] - dt.strptime(x["dob"], "%Y-%m-%d")).days, axis=1)


def get_min_release_value(x):
    if (pd.isnull(x["release_clause_eur"]) or x["release_clause_eur"] == "nan"):
        return int(x["value_eur"])
    return max(int(x["value_eur"]), int(x["release_clause_eur"]))


df_use["release_value_min"] = df_use.apply(
    lambda x: get_min_release_value(x), axis=1)

# get club league position per year (json / csv stored externally and joined in)
# extend to loaned_from and standardise into "season_club" and "parent_club"
# same as above for "season_league" and "parent_league"

df_use = df_use.drop(["work_rate", "season_start", "dob",
                     "release_clause_eur"], axis=1)

In [154]:

df_use.head()

Unnamed: 0,age,height_cm,weight_kg,nationality,club_name,league_name,league_rank,overall,potential,value_eur,wage_eur,player_positions,preferred_foot,international_reputation,weak_foot,skill_moves,body_type,release_clause_eur,team_position,team_jersey_number,loaned_from,joined,nation_position,year,short_name,has_release_clause,unique_id,workrate_def,workrate_att,pos_CF,pos_LM,pos_RM,pos_CM,pos_RWB,pos_LWB,pos_CAM,pos_LW,pos_GK,pos_CDM,pos_LB,pos_RW,pos_CB,pos_RB,pos_ST,release_value_min
48741,11926,185,80,Portugal,Real Madrid,Spain Primera Division,1,94,94,95500000,575000,"LW, ST",Right,5,4,5,C. Ronaldo,195800000.0,LW,7,,2009-07-01,LS,2018,Cristiano Ronaldo,Yes,20801_2018,High,Low,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,195800000
48742,11057,170,72,Argentina,FC Barcelona,Spain Primera Division,1,93,93,105000000,575000,RW,Left,5,4,4,Messi,215300000.0,RW,10,,2004-07-01,RW,2018,L. Messi,Yes,158023_2018,Medium,Medium,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,215300000
48743,9370,175,68,Brazil,Paris Saint-Germain,French Ligue 1,1,92,94,123000000,275000,LW,Right,5,5,5,Neymar,236800000.0,LW,10,,2017-08-03,LW,2018,Neymar Jr,Yes,190871_2018,High,Medium,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,236800000
48744,11511,193,92,Germany,FC Bayern München,German 1. Bundesliga,1,92,92,61000000,225000,GK,Right,5,4,1,Normal,100700000.0,GK,1,,2011-07-01,GK,2018,M. Neuer,Yes,167495_2018,Medium,Medium,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,100700000
48745,11208,182,86,Uruguay,FC Barcelona,Spain Primera Division,1,92,92,97000000,500000,ST,Right,5,4,4,Normal,198900000.0,ST,9,,2014-07-11,LS,2018,L. Suárez,Yes,176580_2018,High,Medium,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,198900000
