# 02 — Feature Engineering: Bruno & Casemiro Midfield Matches

This notebook loads the cleaned match-level dataset
`../data/midfield_matches_clean.csv` and creates engineered features:

- Per-90 metrics for Bruno and Casemiro
- Combined progression and defensive metrics
- Player-level average profiles (for radar plots later)

Outputs:

- `../data/midfield_matches_features.csv`  (match-level features)
- `../data/player_summary_per90.csv`       (2-row table: Bruno vs Casemiro)


In [13]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path("../data")

clean_path = DATA_DIR / "midfield_matches_clean.csv"
df = pd.read_csv(clean_path)

df.shape, df.head(3)


((9, 17289),
       Date            Comp        Round Venue Result           Squad Opponent  \
 0  8/17/25  Premier League  Matchweek 1  Home  L 0–1  Manchester Utd  Arsenal   
 1  8/24/25  Premier League  Matchweek 2  Away  D 1–1  Manchester Utd   Fulham   
 2  8/30/25  Premier League  Matchweek 3  Home  W 3–2  Manchester Utd  Burnley   
 
    Bruno_Min  Casemiro_Min  Bruno_CrdY  ...  Casemiro_Tkl.60  Casemiro_Tkl.61  \
 0         90          64.0           0  ...              3.0              3.0   
 1         90          52.0           0  ...              2.0              1.0   
 2         90          71.0           1  ...              1.0              0.0   
 
    Casemiro_Tkl.62  Casemiro_Tkl.63  Casemiro_Tkl%  Casemiro_Pass  \
 0              3.0              3.0           75.0            2.0   
 1              2.0              1.0          100.0            0.0   
 2              1.0              0.0            NaN            1.0   
 
    Casemiro_Tkl+Int  Casemiro_Clr  Casemiro_

In [14]:
# --- Keep only the columns we actually need ---

base_cols = [
    # Match metadata
    "Date", "Comp", "Round", "Venue", "Result", "Squad", "Opponent",
    
    # Minutes
    "Bruno_Min", "Casemiro_Min",

    # Bruno core stats (from Summary + other tables)
    "Gls_BrunoRow", "Ast_BrunoRow", "xG_BrunoRow", "xAG_BrunoRow",
    "Bruno_KP", "Bruno_PrgP", "Bruno_SCA", "Bruno_GCA",
    "Bruno_Tkl+Int", "Bruno_PrgC",

    # Casemiro core stats
    "Gls_CasemiroRow", "xG_CasemiroRow",
    "Casemiro_KP", "Casemiro_PrgP",
    "Casemiro_Tkl", "Casemiro_Int", "Casemiro_Tkl+Int",
    "Casemiro_PrgC",
]

# Only keep columns that actually exist (in case any name differs)
base_cols = [c for c in base_cols if c in df.columns]

df = df[base_cols].copy()
df.shape, df.columns.tolist()


((9, 21),
 ['Date',
  'Comp',
  'Round',
  'Venue',
  'Result',
  'Squad',
  'Opponent',
  'Bruno_Min',
  'Casemiro_Min',
  'Bruno_KP',
  'Bruno_PrgP',
  'Bruno_SCA',
  'Bruno_GCA',
  'Bruno_Tkl+Int',
  'Bruno_PrgC',
  'Casemiro_KP',
  'Casemiro_PrgP',
  'Casemiro_Tkl',
  'Casemiro_Int',
  'Casemiro_Tkl+Int',
  'Casemiro_PrgC'])

In [15]:
# See the meta + minute + a sample of Bruno/Casemiro columns
[col for col in df.columns if "Min" in col]
[col for col in df.columns if "Bruno_" in col][:30]
[col for col in df.columns if "Casemiro_" in col][:30]
[col for col in df.columns if "Gls" in col or "Ast" in col or "xG" in col][:30]


[]

In [16]:
BRUNO_MIN_COL = "Bruno_Min"
CASE_MIN_COL = "Casemiro_Min"

df[BRUNO_MIN_COL] = pd.to_numeric(df[BRUNO_MIN_COL], errors="coerce")
df[CASE_MIN_COL] = pd.to_numeric(df[CASE_MIN_COL], errors="coerce")

df[[BRUNO_MIN_COL, CASE_MIN_COL]].describe()


Unnamed: 0,Bruno_Min,Casemiro_Min
count,9.0,8.0
mean,88.888889,66.5
std,2.260777,15.491933
min,84.0,44.0
25%,90.0,56.5
50%,90.0,66.5
75%,90.0,74.25
max,90.0,90.0


In [17]:
def add_per90(
    frame: pd.DataFrame,
    stat_col: str,
    minutes_col: str,
    new_col: str,
) -> None:
    """
    Add a per-90 column to `frame` if `stat_col` exists.
    Assumes minutes are already numeric.
    """
    if stat_col not in frame.columns:
        # silently skip missing columns
        return
    
    # Avoid division by zero
    valid = frame[minutes_col] > 0
    frame.loc[valid, new_col] = frame.loc[valid, stat_col] * (90 / frame.loc[valid, minutes_col])


In [18]:
# Mapping: {stat_col_in_df: new_per90_col_name}
BRUNO_PER90_CONFIG = {
    # Attack output
    "Gls_BrunoRow":      "Bruno_gls_per90",
    "Ast_BrunoRow":      "Bruno_ast_per90",
    "xG_BrunoRow":       "Bruno_xg_per90",
    "xAG_BrunoRow":      "Bruno_xag_per90",

    # Creativity / progression
    "Bruno_KP":          "Bruno_kp_per90",
    "Bruno_PrgP":        "Bruno_prgp_per90",
    "Bruno_SCA":         "Bruno_sca_per90",
    "Bruno_GCA":         "Bruno_gca_per90",

    # Defensive + ball carrying
    "Bruno_Tkl+Int":     "Bruno_tklint_per90",
    "Bruno_PrgC":        "Bruno_prgc_per90",
}

for stat_col, new_col in BRUNO_PER90_CONFIG.items():
    add_per90(df, stat_col, BRUNO_MIN_COL, new_col)

sorted([c for c in df.columns if c.endswith("_per90") and c.startswith("Bruno_")])


['Bruno_gca_per90',
 'Bruno_kp_per90',
 'Bruno_prgc_per90',
 'Bruno_prgp_per90',
 'Bruno_sca_per90',
 'Bruno_tklint_per90']

In [19]:
CASE_PER90_CONFIG = {
    # Attack output
    "Gls_CasemiroRow":   "Casemiro_gls_per90",
    "xG_CasemiroRow":    "Casemiro_xg_per90",

    # Progression / passing
    "Casemiro_KP":       "Casemiro_kp_per90",
    "Casemiro_PrgP":     "Casemiro_prgp_per90",

    # Defensive work
    "Casemiro_Tkl":      "Casemiro_tkl_per90",
    "Casemiro_Int":      "Casemiro_int_per90",
    "Casemiro_Tkl+Int":  "Casemiro_tklint_per90",

    # Carrying
    "Casemiro_PrgC":     "Casemiro_prgc_per90",
}

for stat_col, new_col in CASE_PER90_CONFIG.items():
    add_per90(df, stat_col, CASE_MIN_COL, new_col)

sorted([c for c in df.columns if c.endswith("_per90") and c.startswith("Casemiro_")])


['Casemiro_int_per90',
 'Casemiro_kp_per90',
 'Casemiro_prgc_per90',
 'Casemiro_prgp_per90',
 'Casemiro_tkl_per90',
 'Casemiro_tklint_per90']

In [20]:
# Combined attacking threat per 90
df["Bruno_xg_xag_per90"] = df.get("Bruno_xg_per90", 0) + df.get("Bruno_xag_per90", 0)

# Combined progression per 90
df["Bruno_prog_total_per90"] = df.get("Bruno_prgp_per90", 0) + df.get("Bruno_prgc_per90", 0)
df["Casemiro_prog_total_per90"] = df.get("Casemiro_prgp_per90", 0) + df.get("Casemiro_prgc_per90", 0)

# Combined defensive intensity (tackles + interceptions)
df["Bruno_case_tklint_per90"] = df.get("Bruno_tklint_per90", 0) + df.get("Casemiro_tklint_per90", 0)

# Share of progression done by Bruno vs Casemiro
total_prog = df["Bruno_prog_total_per90"] + df["Casemiro_prog_total_per90"]
df["Bruno_prog_share"] = df["Bruno_prog_total_per90"] / total_prog.replace(0, pd.NA)
df["Casemiro_prog_share"] = df["Casemiro_prog_total_per90"] / total_prog.replace(0, pd.NA)

df[[c for c in df.columns if "prog" in c and "per90" in c or "share" in c]].head()


Unnamed: 0,Bruno_prog_total_per90,Casemiro_prog_total_per90,Bruno_prog_share,Casemiro_prog_share
0,12.0,5.625,0.680851,0.319149
1,6.0,1.730769,0.776119,0.223881
2,10.0,5.070423,0.663551,0.336449
3,7.325581,8.181818,0.472393,0.527607
4,10.0,7.5,0.571429,0.428571


In [21]:
important_cols = [
    # meta
    "Date", "Comp", "Round", "Venue", "Result", "Opponent",
    # minutes
    "Bruno_Min", "Casemiro_Min",
    # Bruno main per-90s
    "Bruno_gls_per90", "Bruno_ast_per90", "Bruno_xg_per90", "Bruno_xag_per90",
    "Bruno_kp_per90", "Bruno_prgp_per90", "Bruno_prgc_per90",
    # Casemiro per-90s
    "Casemiro_gls_per90", "Casemiro_xg_per90",
    "Casemiro_tkl_per90", "Casemiro_int_per90", "Casemiro_tklint_per90",
    "Casemiro_prgp_per90", "Casemiro_prgc_per90",
    # combined
    "Bruno_prog_share", "Casemiro_prog_share",
    "Bruno_case_tklint_per90",
]

# Only keep those that actually exist
important_cols = [c for c in important_cols if c in df.columns]

df[important_cols].round(2)


Unnamed: 0,Date,Comp,Round,Venue,Result,Opponent,Bruno_Min,Casemiro_Min,Bruno_kp_per90,Bruno_prgp_per90,Bruno_prgc_per90,Casemiro_tkl_per90,Casemiro_int_per90,Casemiro_tklint_per90,Casemiro_prgp_per90,Casemiro_prgc_per90,Bruno_prog_share,Casemiro_prog_share,Bruno_case_tklint_per90
0,8/17/25,Premier League,Matchweek 1,Home,L 0–1,Arsenal,90,64.0,5.0,10.0,2.0,4.22,0.0,4.22,5.62,0.0,0.68,0.32,7.22
1,8/24/25,Premier League,Matchweek 2,Away,D 1–1,Fulham,90,52.0,1.0,5.0,1.0,3.46,1.73,5.19,1.73,0.0,0.78,0.22,7.19
2,8/30/25,Premier League,Matchweek 3,Home,W 3–2,Burnley,90,71.0,6.0,7.0,3.0,1.27,1.27,2.54,5.07,0.0,0.66,0.34,5.54
3,9/20/25,Premier League,Matchweek 5,Home,W 2–1,Chelsea,86,44.0,1.05,7.33,0.0,0.0,0.0,0.0,8.18,0.0,0.47,0.53,2.09
4,10/4/25,Premier League,Matchweek 7,Home,W 2–0,Sunderland,90,84.0,3.0,8.0,2.0,3.21,0.0,3.21,7.5,0.0,0.57,0.43,5.21
5,10/19/25,Premier League,Matchweek 8,Away,W 2–1,Liverpool,84,58.0,3.21,3.21,0.0,4.66,0.0,4.66,1.55,0.0,0.67,0.33,5.73
6,10/25/25,Premier League,Matchweek 9,Home,W 4–2,Brighton,90,69.0,2.0,8.0,2.0,3.91,2.61,6.52,1.3,0.0,0.88,0.12,11.52
7,11/1/25,Premier League,Matchweek 10,Away,D 2–2,Nott'ham Forest,90,90.0,4.0,8.0,1.0,3.0,1.0,4.0,5.0,1.0,0.6,0.4,6.0
8,11/8/25,Premier League,Matchweek 11,Away,D 2–2,Tottenham,90,,1.0,4.0,3.0,,,,,,,,


In [22]:
# Select per-90 columns for each player
bruno_per90_cols = [c for c in df.columns if c.startswith("Bruno_") and c.endswith("_per90")]
case_per90_cols  = [c for c in df.columns if c.startswith("Casemiro_") and c.endswith("_per90")]

bruno_summary = (
    df[bruno_per90_cols]
    .mean()
    .to_frame(name="value")
    .reset_index()
    .rename(columns={"index": "stat"})
)
bruno_summary["player"] = "Bruno Fernandes"

case_summary = (
    df[case_per90_cols]
    .mean()
    .to_frame(name="value")
    .reset_index()
    .rename(columns={"index": "stat"})
)
case_summary["player"] = "Casemiro"

player_summary = pd.concat([bruno_summary, case_summary], ignore_index=True)

player_summary.head(20)


Unnamed: 0,stat,value,player
0,Bruno_kp_per90,2.917866,Bruno Fernandes
1,Bruno_prgp_per90,6.726652,Bruno Fernandes
2,Bruno_sca_per90,6.401624,Bruno Fernandes
3,Bruno_gca_per90,0.68254,Bruno Fernandes
4,Bruno_tklint_per90,2.462717,Bruno Fernandes
5,Bruno_prgc_per90,1.555556,Bruno Fernandes
6,Bruno_xg_xag_per90,0.0,Bruno Fernandes
7,Bruno_prog_total_per90,8.282207,Bruno Fernandes
8,Bruno_case_tklint_per90,6.31274,Bruno Fernandes
9,Casemiro_kp_per90,0.969507,Casemiro


In [23]:
features_path = DATA_DIR / "midfield_matches_features.csv"
summary_path = DATA_DIR / "player_summary_per90.csv"

df.to_csv(features_path, index=False)
player_summary.to_csv(summary_path, index=False)

features_path, summary_path


(PosixPath('../data/midfield_matches_features.csv'),
 PosixPath('../data/player_summary_per90.csv'))