In [157]:
import pandas as pd
import numpy as np
from pandas.api.types import infer_dtype
import sqlite3
from tqdm import tqdm
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", None)
pd.set_option('display.max_colwidth', None)

class FIFA_Processing:
    
    def __init__(self, database_name, table_name, process_from_scratch):

        self.database_name  = database_name
        self.table_name     = table_name
        self.from_scratch   = process_from_scratch

        self.sysdata   = ["sofifa_id",  'real_face', 'player_url', 'year']
        self.constants = ['short_name','long_name', 'dob']
        self.variables = ["value_eur", 'nation_position', 'club_name', 'potential', 'wage_eur', 'international_reputation',
                          'release_clause_eur', 'loaned_from', 'weight_kg', 'league_name', 'contract_valid_until', 'player_positions',
                          'height_cm', 'preferred_foot', 'team_position', 'work_rate', 'joined', 'age', 'body_type',
                          'team_jersey_number', 'league_rank', 'nation_jersey_number', 'nationality', 'player_tags', 'player_traits']
        self.abilities = ['overall', 'shooting', 'dribbling', 'mentality_positioning', 'goalkeeping_positioning', 'goalkeeping_diving',
                          'goalkeeping_kicking', 'gk_speed', 'goalkeeping_handling', 'skill_moves', 'skill_ball_control',
                          'defending', 'gk_positioning', 'gk_reflexes', 'attacking_volleys', 'weak_foot', 'power_long_shots',
                          'movement_reactions', 'movement_balance', 'defending_standing_tackle', 'gk_diving', 'movement_acceleration',
                          'attacking_finishing', 'movement_sprint_speed', 'skill_curve', 'mentality_composure', 'skill_dribbling',
                          'mentality_aggression', 'attacking_heading_accuracy', 'gk_handling', 'power_shot_power', 'mentality_vision',
                          'goalkeeping_reflexes', 'passing', 'mentality_interceptions', 'pace', 'skill_long_passing', 'defending_marking',
                          'power_strength', 'defending_sliding_tackle', 'attacking_short_passing', 'physic', 'mentality_penalties',
                          'gk_kicking', 'skill_fk_accuracy', 'power_stamina',  'movement_agility', 'power_jumping', 'attacking_crossing']
        self.positions = ['cf','lb','ls','lm','rs','cb','rf','st','rm','lf','cm','rw','rb','lw',
                          'lcb','cam','rdm','lcm','ldm','rcm','rcb','rwb','lwb','ram','lam','cdm']
        self.to_remove = ["player_traits", "player_tags"]
        self.semantics = {}
        for i in "sysdata", "constants", "variables", "positions", "abilities", :
            for k in eval(f"self.{i}"):
                self.semantics[k] = i
        
        if self.from_scratch == True:
            self.import_data()
            self.data_quality_fix()
            self.apply_tags()
            self.apply_traits()
            self.replace_nulls()
            self.upload_to_database()
        
    def get_metadata(self, df):
        meta = pd.DataFrame({
            "colname"       : [i for i in df.columns],
            "count"         : [len(df[i]) for i in df.columns],
            "null_count"    : [df[i].isnull().sum() for i in df.columns],
            "data_type"     : [infer_dtype(df[i]) for i in df.columns]
        })
        return meta
    
    def import_data(self):
        print("Importing data")
        self.data = {}
        for year in np.arange(15, 22):
            self.data[f"df_{str(year)}"] = {"df": pd.read_csv(f"players_{str(year)}.csv")}
            self.data[f"df_{str(year)}"]["df"]["year"] = int("20" + str(year))
        print("Successfully imported data\n")
    
    def data_quality_fix(self):
        print("Applying data quality fixes")
        for year in np.arange(15, 22):
            df = self.data[f"df_{str(year)}"]["df"]
            try:
                main_df = pd.concat([main_df, df[main_df.columns]], axis = 0).reset_index(drop=True)
            except:
                main_df = df.copy()
        group_cols  = ["long_name","short_name","player_url","dob","real_face"]
        checking_df = main_df[self.sysdata + self.constants]
        master      = checking_df.groupby(["sofifa_id"])[group_cols].agg("max")[group_cols].drop_duplicates().reset_index(drop=False)
        master_df   = pd.merge(left = main_df.drop(group_cols, axis = 1), right = master,
                               left_on = "sofifa_id", right_on = "sofifa_id", how = "left")
        self.master_df = master_df.copy()[~pd.isnull(master_df["club_name"])]
        for colname in ["short_name", "long_name", "club_name", "league_name", "loaned_from"]:
            self.master_df[colname] = self.master_df[colname].apply(lambda x: x.replace("'", ""))
        print("Successfully applied data quality fixes\n")

    def apply_tags(self):
        print("Applying transformation to 'player_tags'")
        all_tags = []
        logic = lambda x: [k.replace("#", "").strip().replace(" ", "_") for k in str(x).split(",")]
        for i in self.master_df["player_tags"]:
            if not pd.isnull(i):
                all_tags.extend(logic(i))
        all_tags = list(set(all_tags))
        self.master_df["player_tags"] = self.master_df["player_tags"].apply(lambda x: logic(x))
        for tagname in all_tags:
            self.master_df[f"tag_{tagname}"] = self.master_df["player_tags"].apply(lambda x: 1 if tagname in x else 0)
        self.master_df = self.master_df.drop(["player_tags"], axis = 1)
        print("Successfully applied transformation to 'player_tags'\n")

    def apply_traits(self):
        print("Applying transformation to 'player_traits'")
        all_traits = []
        logic = lambda x: [k.replace("(AI)", "").strip().replace(" ","_").replace("-","") for k in str(x).split(",")]
        for i in self.master_df["player_traits"]:
            if not pd.isnull(i):
                all_traits.extend(logic(i))
        all_traits = list(set(all_traits))
        self.master_df["player_traits"] = self.master_df["player_traits"].apply(lambda x: logic(x))
        for traitname in all_traits:
            self.master_df[f"trait_{traitname}"] = self.master_df["player_traits"].apply(lambda x: 1 if traitname in x else 0)
        self.master_df = self.master_df.drop(["player_traits"], axis = 1)
        print("Successfully applied transformation to 'player_traits'\n")

    def replace_nulls(self):
        print("Replacing null values")
        null_replacements = {
            "nation_position" : "NA",
            "nation_jersey_number" : "NA",
            "loaned_from" : "NA"
        }
        for k, v in list(null_replacements.items()):
            self.master_df[k] = self.master_df[k].apply(lambda x: v if pd.isnull(x) else x)
        self.master_df["has_release_clause"] = self.master_df["release_clause_eur"].apply(lambda x: "No" if pd.isnull(x) else "Yes")
        print("Successfully replaced null values\n")

    def create_sql_statement(self):
        statement = ""
        type_refs = {"integer" : "INTEGER", "floating": "FLOAT", "string": "TEXT", "mixed": "TEXT"}
        generate_sql_statement = lambda x: f"{x['colname']} {type_refs[x['data_type']]}"
        meta = self.get_metadata(self.master_df)
        meta["sql_statement"] = meta.apply(lambda x: generate_sql_statement(x), axis = 1)
        statement += f"CREATE TABLE IF NOT EXISTS {self.table_name} ("
        for eachrow in range(len(meta)):
            statement += f"{meta.loc[eachrow, 'sql_statement']}, "
        statement = statement.strip()[:-1].strip()
        statement += ");"
        return statement

    def insert_sql_statement(self, x):
        statement = f"INSERT INTO {self.table_name} VALUES ("
        for i in x:
            statement += f"'{i}', "
        statement = statement.strip()[:-1].strip() + ");"
        return statement
    
    def upload_to_database(self):
        self.failed_attempts = []
        connection = sqlite3.connect(self.database_name)
        cursor = connection.cursor()
        if self.from_scratch == True:
            print("Dropping table")
            cursor.execute(f"DROP TABLE IF EXISTS {self.table_name};")
            print("Successfully dropped table\n")
        print("Creating table")
        cursor.execute(self.create_sql_statement())
        print("Successfully created table\n")
        print("Inserting data into table")
        for i in tqdm(range(len(self.master_df))):
            try:
                cursor.execute(self.insert_sql_statement(self.master_df.iloc[i, :]))
                connection.commit()
            except:
                self.failed_attempts.append(self.master_df.iloc[i, :])
        print("Successfully inserted data into table\n")
        connection.commit()
        connection.close()

In [158]:
FIFA = FIFA_Processing(
    database_name = "fifa.db",
    table_name = "players",
    process_from_scratch = True
)

Importing data
Successfully imported data

Applying data quality fixes
Successfully applied data quality fixes

Applying transformation to 'player_tags'
Successfully applied transformation to 'player_tags'

Applying transformation to 'player_traits'
Successfully applied transformation to 'player_traits'

Replacing null values
Successfully replaced null values

Dropping table
Successfully dropped table

Creating table
Successfully created table

Inserting data into table


100%|██████████| 121272/121272 [00:48<00:00, 2484.73it/s]

Successfully inserted data into table






In [161]:
for i in FIFA.failed_attempts:
    print(i)


sofifa_id                                                                     199043
age                                                                               31
height_cm                                                                        188
weight_kg                                                                         87
nationality                                                                    Chile
club_name                                                       Universidad de Chile
league_name                                              Chilian Campeonato Nacional
league_rank                                                                      1.0
overall                                                                           69
potential                                                                         69
value_eur                                                                     850000
wage_eur                                                         