In [1]:
import pandas as pd
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides pandas diagnostics
# (e.g. SettingWithCopyWarning) that usually point at real data-handling bugs;
# consider narrowing it to the specific noisy category instead.
warnings.filterwarnings("ignore") # who likes warnings right?

C:\Users\felip\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\felip\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.TXA6YQSD3GCQQC22GEQ54J2UDCXDXHWN.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
def clean_data(filename):
    """Aggregate a raw per-player CSV into one row per team per match.

    Args:
        filename: path to a raw CSV whose base name looks like
            ``raw_<year>.csv`` (e.g. ``raw_2018.csv``). Each row holds the
            stats of one player for one team in one match.

    Returns:
        DataFrame indexed by (MATCH, TEAM) with the renamed relevant features,
        plus ``goals_against``, ``WDL`` (1 win, 0 draw, -1 loss) and ``year``.

    Raises:
        KeyError: if an expected column is missing from the CSV.
        ValueError / IndexError: if the year cannot be parsed from the file name.
    """
    import os  # local import: only used to strip the directory from `filename`

    df = pd.read_csv(filename).fillna(0)  # replace "empty" cells with 0

    # Parse the year from the base name only, so underscores in directory
    # names (e.g. "my_data/raw_2018.csv") do not break the split.
    base_name = os.path.basename(str(filename))
    year = int(base_name.split("_")[1].split(".csv")[0])  # just "2018" or "2014"
    if year == 2018:
        # 2018 data divides intensity into 5 groups instead of 3 like 2014 and 2010
        df["A_H"] = df["A_Z4"] + df["A_Z5"]
        df["A_L"] = df["A_Z1"] + df["A_Z2"]

    relevant_features = ["MATCH", "TEAM", "PC ", "PA ", "PC/PA", "GS", "S", "SG", "FC", "FS", "DC", "D_IN_POSS",
                         "Offsides", "D_NOT_IN_POSS", "Yellow_cards", "Red_card", "A_H", "A_L", "SPRINT", "T_OPP_HALF",
                         "T_ATT_3RD"]
    # Explicit copy: the columns below are modified, and writing into a slice
    # of the original frame is not guaranteed to stick.
    df = df[relevant_features].copy()

    # Fixing some minor formatting: distances from m to km
    for distance_col in ("DC", "D_IN_POSS", "D_NOT_IN_POSS"):
        df[distance_col] = df[distance_col] / 1000

    # How to aggregate the features: mostly the sum over the team's players,
    # except percentage-like features, which take the mean over players.
    # NOTE(review): the mean of per-player PC/PA is not the same as
    # sum(PC) / sum(PA); kept as in the original analysis -- confirm intent.
    group_keys = ["MATCH", "TEAM"]
    mean_features = {
        "PC/PA",       # pass accuracy
        "A_H",         # time in high intensity (%)
        "A_L",         # time in low intensity (%)
        "T_OPP_HALF",  # time spent in opponent half (%)
        "T_ATT_3RD",   # time spent in opponent third (near box) (%)
    }
    value_features = [feat for feat in relevant_features if feat not in group_keys]
    agg_method = {feat: ("mean" if feat in mean_features else "sum") for feat in value_features}

    # Perform aggregation
    df = df.groupby(group_keys)[value_features].aggregate(agg_method)

    # Keep MATCH / TEAM available as regular columns as well as index levels
    # (same output as aggregating them with "first": they are constant per group).
    df.insert(0, "TEAM", df.index.get_level_values("TEAM").values)
    df.insert(0, "MATCH", df.index.get_level_values("MATCH").values)

    # Create new features. A team concedes what the other team in the same
    # match scored, i.e. the match's total goals minus its own. Computed
    # per match in one vectorised step: the previous chained assignment
    # (df.loc[i, :][col][k] = ...) wrote to a temporary copy, and the hard-coded
    # range(1, 65) assumed match ids were exactly 1..64.
    goals_for = df["GS"]
    match_goals = goals_for.groupby(level="MATCH").transform("sum")
    df["goals_against"] = match_goals - goals_for

    # 1 = win, 0 = draw, -1 = loss
    goal_diff = df["GS"] - df["goals_against"]
    df["WDL"] = (goal_diff > 0).astype(int) - (goal_diff < 0).astype(int)

    df = df.rename(columns={
        "MATCH":"match",
        "TEAM":"team",
        "PC ":"passes completed",
        "PA ":"total passes",
        "PC/PA":"passes acc",
        "GS":"goals for",
        "S":"total shots",
        "SG":"on-target",
        "FC":"fouls committed",
        "FS":"fouls suffered",
        "DC":"distance",
        "D_IN_POSS":"distance poss",
        "D_NOT_IN_POSS":"distance not poss",
        "Offsides":"offsides",
        "Yellow_cards":"yellow",
        "Red_card":"red",
        "A_H":"high intensity",
        "A_L":"low intensity",
        "SPRINT":"sprints",
        "T_OPP_HALF":"time opp half",
        "T_ATT_3RD":"time opp third"
    }) # missing corners and possession, need to get those manually!

    df["year"] = year

    return df

In [3]:
# Clean each tournament's raw file (2018, then 2014, then 2010) into a
# team-per-match table.
games_18, games_14, games_10 = map(
    clean_data, ("raw_2018.csv", "raw_2014.csv", "raw_2010.csv")
)