# Exploration


### Create `columns.json`

Create a JSON file that maps every CSV file in the dataset (keyed as `folder/file`) to the list of its column names.


In [28]:
import pandas as pd
import os
import json

# Collect the column names of every CSV file under the dataset root.
# The mapping is built in memory and written to columns.json once at the end,
# instead of re-reading and re-writing the JSON file for every single CSV.
data = {}

for folder in os.listdir("FIFA Dataset/data"):
    folder_path = os.path.join("FIFA Dataset/data", folder)
    # Stray files next to the dataset folders would make os.listdir() raise
    if not os.path.isdir(folder_path):
        continue
    for file in os.listdir(folder_path):
        if not file.endswith(".csv"):
            continue
        try:
            df = pd.read_csv(os.path.join(folder_path, file))
            data[f"{folder}/{file}"] = list(df.columns)
        except Exception as e:
            # Best effort: report files that fail to parse with the default
            # settings and carry on with the remaining files.
            print(f"error in {folder}/{file}")
            print(e)

# Mode "w" truncates the file, so the result always replaces any older version
with open("columns.json", "w") as f:
    json.dump(data, f, indent=4)


# Both tweet files are read with an explicit ";" delimiter and then stacked
# into a single frame.
df_tweets_01 = pd.read_csv(
    "FIFA Dataset/data/FIFA World Cup 2022 Twitter Dataset/tweets1.csv",
    delimiter=";",
)
df_tweets_02 = pd.read_csv(
    "FIFA Dataset/data/FIFA World Cup 2022 Twitter Dataset/tweets2.csv",
    delimiter=";",
)
df_tweets = pd.concat([df_tweets_01, df_tweets_02])

# Load the existing mapping, add one entry for the combined tweets frame,
# and write the whole mapping back.
with open("columns.json", "r") as f:
    data = json.load(f)

data.update(
    {"FIFA World Cup 2022 Twitter Dataset/tweets.csv": list(df_tweets.columns)}
)

with open("columns.json", "w") as f:
    json.dump(data, f, indent=4)

error in FIFA World Cup 2022 Twitter Dataset/tweets1.csv
Error tokenizing data. C error: Expected 1 fields in line 5, saw 2

error in FIFA World Cup 2022 Twitter Dataset/tweets2.csv
Error tokenizing data. C error: Expected 1 fields in line 12, saw 2



### Merge all field player data together into one dataframe

In [53]:

import pandas as pd
import os
import difflib 
# Folder with the per-category player statistics, and the table used as the
# starting point of the merge.
player_data_dir = "FIFA Dataset/data/FIFA World Cup 2022 Player Data"
player_base_file = "player_defense.csv"

# Start from the defense table; every other field-player table is merged onto it
df_final = pd.read_csv(os.path.join(player_data_dir, player_base_file))

# os.listdir() returns entries in arbitrary order; sorting keeps the merge
# order, and with it the resulting column order, the same on every run.
for file in sorted(os.listdir(player_data_dir)):
    # Only the raw "player_*.csv" tables are inputs. This also skips the
    # *_joined.csv files written by this notebook, which would otherwise be
    # merged back in when the cell is re-run.
    if not (file.startswith("player_") and file.endswith(".csv")):
        continue
    # Keeper tables are merged separately; the base table is already loaded,
    # so merging it again would only risk multiplying rows.
    if "keepers" in file or file == player_base_file:
        continue

    df = pd.read_csv(os.path.join(player_data_dir, file))
    # drop the duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]
    # Outer join on the player name. Columns that already exist in df_final
    # keep their name; the incoming copy gets the "_right" suffix.
    df_final = pd.merge(
        df_final, df, on="player", how="outer", suffixes=("", "_right")
    )

# Drop the "_right" copies so only the first occurrence of each column remains
df_final = df_final.loc[:, ~df_final.columns.str.endswith("_right")]

# save
df_final.to_csv(os.path.join(player_data_dir, "players_joined.csv"), index=False)



player_defense.csv
player_gca.csv
player_misc.csv
player_passing.csv
player_passing_types.csv
player_playingtime.csv
player_possession.csv
player_shooting.csv
player_stats.csv


x

In [None]:
def get_player_info(name):
    """Return the statistics of the player whose name best matches ``name``.

    The lookup is case-insensitive and fuzzy: ``difflib.get_close_matches``
    picks the most similar entry of the ``player`` column of the global
    ``df_final`` frame.

    Args:
        name: Full or approximate player name.

    Returns:
        A dict mapping column name to value for the first matching row,
        without the ``player`` column.

    Raises:
        IndexError: If no player name is similar enough to ``name``.
    """
    names_lower = df_final["player"].str.lower()
    # Missing names are not strings and cannot be compared by difflib
    candidates = names_lower.dropna().tolist()

    # find the closest match (best match comes first)
    matches = difflib.get_close_matches(name.lower(), candidates)
    if not matches:
        # Same exception type as indexing the empty result, with a clear message
        raise IndexError(f"no player found with a name close to {name!r}")
    closest_match = matches[0]

    player_info = df_final[names_lower == closest_match]
    # drop the player column
    player_info = player_info.drop("player", axis=1)
    # first matching row as a dictionary
    return player_info.to_dict(orient="records")[0]

### Merge all keeper data into one dataframe


In [54]:

# Folder with the per-category player statistics, and the keeper table used
# as the starting point of the merge.
keeper_data_dir = "FIFA Dataset/data/FIFA World Cup 2022 Player Data"
keeper_base_file = "player_keepers.csv"

# NOTE: this rebinds df_final, so from here on it holds keeper data only
df_final = pd.read_csv(os.path.join(keeper_data_dir, keeper_base_file))

# os.listdir() returns entries in arbitrary order; sorting keeps the merge
# order, and with it the resulting column order, the same on every run.
for file in sorted(os.listdir(keeper_data_dir)):
    # Only the raw "player_*keepers*.csv" tables are inputs. This also skips
    # keepers_joined.csv, which this cell writes and which would otherwise be
    # merged back in when the cell is re-run.
    if not (file.startswith("player_") and file.endswith(".csv")):
        continue
    # Field-player tables are merged separately; the base table is already
    # loaded, so merging it again would only risk multiplying rows.
    if "keepers" not in file or file == keeper_base_file:
        continue

    df = pd.read_csv(os.path.join(keeper_data_dir, file))
    # drop the duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]
    # Outer join on the player name. Columns that already exist in df_final
    # keep their name; the incoming copy gets the "_right" suffix.
    df_final = pd.merge(
        df_final, df, on="player", how="outer", suffixes=("", "_right")
    )

# Drop the "_right" copies so only the first occurrence of each column remains
df_final = df_final.loc[:, ~df_final.columns.str.endswith("_right")]

# save
df_final.to_csv(
    os.path.join(keeper_data_dir, "keepers_joined.csv"), index=False
)

player_keepers.csv
player_keepersadv.csv


### Helper function

In [55]:
def get_player_info(name, 
                    df_player = pd.read_csv("FIFA Dataset/data/FIFA World Cup 2022 Player Data/players_joined.csv"), 
                    df_keeper = pd.read_csv("FIFA Dataset/data/FIFA World Cup 2022 Player Data/keepers_joined.csv")):
    
    all_players = df_player["player"].str.lower().tolist()
    all_keepers = df_keeper["player"].str.lower().tolist()
    # players and keepers together
    all_players = all_players + all_keepers
    # find the closest match
    closest_match = difflib.get_close_matches(name.lower(), all_players)[0]

    player_info = df_final[df_final["player"].str.lower() == closest_match]
    # drop the player column
    player_info = player_info.drop("player", axis=1)
    # print as dictionary
    return player_info.to_dict(orient="records")[0]

In [56]:
get_player_info("Ederson")

{'position': 'GK',
 'team': 'Brazil',
 'age': '29-123',
 'club': 'Manchester City',
 'birth_year': 1993,
 'gk_games': 1,
 'gk_games_starts': 1,
 'gk_minutes': 90,
 'minutes_90s': 1.0,
 'gk_goals_against': 1,
 'gk_goals_against_per90': 1.0,
 'gk_shots_on_target_against': 3,
 'gk_saves': 2,
 'gk_save_pct': 66.7,
 'gk_wins': 0,
 'gk_ties': 0,
 'gk_losses': 1,
 'gk_clean_sheets': 0,
 'gk_clean_sheets_pct': 0.0,
 'gk_pens_att': 0,
 'gk_pens_allowed': 0,
 'gk_pens_saved': 0,
 'gk_pens_missed': 0,
 'gk_pens_save_pct': nan,
 'gk_free_kick_goals_against': 0,
 'gk_corner_kick_goals_against': 0,
 'gk_own_goals_against': 0,
 'gk_psxg': 1.2,
 'gk_psnpxg_per_shot_on_target_against': 0.39,
 'gk_psxg_net': 0.2,
 'gk_psxg_net_per90': 0.18,
 'gk_passes_completed_launched': 3,
 'gk_passes_launched': 4,
 'gk_passes_pct_launched': 75.0,
 'gk_passes': 25,
 'gk_passes_throws': 10,
 'gk_pct_passes_launched': 12.0,
 'gk_passes_length_avg': 25.8,
 'gk_goal_kicks': 6,
 'gk_pct_goal_kicks_launched': 16.7,
 'gk_go