# Data Preprocessing

This notebook prepares the match-event dataset for training a machine learning model. The preprocessing steps clean the raw data, remove errors or leaks, and create structured features that can be used by the MLP model in the training notebook.

What This Notebook Does

- Loads the raw match-event CSV file
- Removes invalid or missing entries
- Handles NaN and infinite values
- Removes label leakage (columns that reveal the match result)
- Selects only useful numeric features

Creates engineered match-level features such as:
 - Attack strength
 - Defensive actions
 - Passing and shooting efficiency
 - Possession quality

Saves the cleaned and processed dataset for modeling

In [1]:
# Imports 
import pandas as pd
import numpy as np
import json
from ast import literal_eval

pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 200)

In [2]:
DATA_DIR = "/Users/hetavvyas/Downloads/Project_Data"

events_eng = pd.read_csv(f"{DATA_DIR}/events_England.csv")
matches_eng = pd.read_csv(f"{DATA_DIR}/matches_England.csv")
teams = pd.read_csv(f"{DATA_DIR}/teams.csv")

print("Events shape:", events_eng.shape)
print("Matches shape:", matches_eng.shape)
print("Teams shape:", teams.shape)

events_eng.head()


Events shape: (643090, 17)
Matches shape: (380, 38)
Teams shape: (142, 6)


Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,tagsList,pos_orig_y,pos_orig_x,pos_dest_y,pos_dest_x
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85.0,177959171,[1801],49,49,78,31
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.94685,83.0,177959172,[1801],78,31,75,51
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82.0,177959173,[1801],75,51,71,35
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82.0,177959174,[1801],71,35,95,41
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85.0,177959175,[1801],95,41,88,72


In [3]:
# Basic Cleaning: Missing Values & Duplicates

# Drop duplicate rows
events_eng = events_eng.drop_duplicates()
matches_eng = matches_eng.drop_duplicates()

print("Events NA counts:\n", events_eng.isna().sum().head(20))
print("\nMatches NA counts:\n", matches_eng.isna().sum().head(20))


if "subEventName" in events_eng.columns:
    events_eng["subEventName"] = events_eng["subEventName"].fillna("Unknown")


Events NA counts:
 eventId             0
subEventName     1558
tags                0
playerId            0
positions           0
matchId             0
eventName           0
teamId              0
matchPeriod         0
eventSec            0
subEventId       1558
id                  0
tagsList        61475
pos_orig_y          0
pos_orig_x          0
pos_dest_y          0
pos_dest_x          0
dtype: int64

Matches NA counts:
 status           0
roundId          0
gameweek         0
teamsData        0
seasonId         0
dateutc          0
winner           0
venue            0
wyId             0
label            0
date             0
referees         0
duration         0
competitionId    0
team1.scoreET    0
team1.coachId    0
team1.side       0
team1.teamId     0
team1.score      0
team1.scoreP     0
dtype: int64


In [4]:
# Parse 'tags' into a usable Python list

import json
from ast import literal_eval
import numpy as np
import pandas as pd

def parse_list_column(val):
    """Safely parse tags column into a list."""
    if isinstance(val, list):
        return val
    if pd.isna(val):
        return []
    try:
        return literal_eval(val)
    except Exception:
        try:
            return json.loads(val.replace("'", '"'))
        except Exception:
            return []

events_eng["tagsList"] = events_eng["tags"].apply(parse_list_column)


In [5]:
# Create Event Flags for Key Actions

# Core event types
events_eng["is_pass"] = events_eng["eventName"] == "Pass"
events_eng["is_shot"] = events_eng["eventName"] == "Shot"
events_eng["is_foul"] = events_eng["eventName"] == "Foul"

# Defensive actions 
events_eng["is_tackle"] = events_eng["subEventName"] == "Ground defending duel"
events_eng["is_clearance"] = events_eng["subEventName"] == "Clearance"

SUCCESS_TAG = 1801
SHOT_ON_TARGET_TAGS = {101, 102, 103, 1801}  # typical Wyscout SOT/success tags

def has_tag(tags, tag_id):
    if not isinstance(tags, list):
        return False
    for x in tags:
        if isinstance(x, dict) and x.get("id") == tag_id:
            return True
        if x == tag_id:
            return True
    return False

def has_any_tag(tags, tag_ids):
    if not isinstance(tags, list):
        return False
    for x in tags:
        if isinstance(x, dict):
            if x.get("id") in tag_ids:
                return True
        else:
            if x in tag_ids:
                return True
    return False

# Generic success flag
events_eng["success"] = events_eng["tagsList"].apply(lambda tags: has_tag(tags, SUCCESS_TAG))

# Successful passes only
events_eng["is_successful_pass"] = events_eng["is_pass"] & events_eng["success"]

# Shots on target
events_eng["shot_on_target"] = events_eng["is_shot"] & events_eng["tagsList"].apply(
    lambda tags: has_any_tag(tags, SHOT_ON_TARGET_TAGS)
)


In [6]:
# Aggregate to Teamâ€“Match Level
# one row per (matchId, teamId)

missing_cols = [c for c in ["matchId", "teamId"] if c not in events_eng.columns]
if missing_cols:
    print("Missing ID columns in events:", missing_cols)

agg_dict = {
    "is_pass": "sum",
    "is_shot": "sum",
    "shot_on_target": "sum",
    "is_foul": "sum",
    "is_tackle": "sum",
    "is_clearance": "sum",
    "success": "sum",
    "is_successful_pass": "sum",
    "eventId": "count" if "eventId" in events_eng.columns else "size"
}

agg_dict_filtered = {
    col: func for col, func in agg_dict.items()
    if col in events_eng.columns
}

team_match_features = events_eng.groupby(["matchId", "teamId"]).agg(agg_dict_filtered)

team_match_features.columns = [
    "_".join(col) if isinstance(col, tuple) else col
    for col in team_match_features.columns
]

team_match_features = team_match_features.reset_index()

team_match_features.head()


Unnamed: 0,matchId,teamId,is_pass,is_shot,shot_on_target,is_foul,is_tackle,is_clearance,success,is_successful_pass,eventId
0,2499719,1609,606,27,10,9,65,19,750,513,1032
1,2499719,1631,230,7,4,12,82,21,370,166,736
2,2499720,1625,754,12,3,9,44,4,842,682,1053
3,2499720,1651,184,6,2,6,61,27,259,124,553
4,2499721,1610,516,15,5,18,49,6,622,454,877


In [7]:
# Rename Aggregates & Derive Key Ratios 

rename_map = {
    "is_pass": "total_passes",
    "is_shot": "total_shots",
    "shot_on_target": "shots_on_target",
    "is_foul": "fouls_committed",
    "is_tackle": "tackles",
    "is_clearance": "clearances",
    "success": "successful_events",
    "is_successful_pass": "successful_passes",
    "eventId_count": "total_events",
    "eventId_size": "total_events"
}

for old, new in rename_map.items():
    if old in team_match_features.columns:
        team_match_features = team_match_features.rename(columns={old: new})



if "total_events" not in team_match_features.columns:
    cols = [c for c in ["total_passes", "total_shots", "fouls_committed"] if c in team_match_features.columns]
    if cols:
        team_match_features["total_events"] = team_match_features[cols].sum(axis=1)
    else:
        team_match_features["total_events"] = np.nan


# Pass success rate
if "total_passes" in team_match_features.columns and "successful_passes" in team_match_features.columns:
    team_match_features["pass_success_rate"] = (
        team_match_features["successful_passes"] /
        team_match_features["total_passes"].replace(0, np.nan)
    )



# Shot accuracy
if "total_shots" in team_match_features.columns and "shots_on_target" in team_match_features.columns:
    team_match_features["shot_accuracy"] = (
        team_match_features["shots_on_target"] /
        team_match_features["total_shots"].replace(0, np.nan)
    )


# Possession ratio
team_match_features["possession_ratio"] = (
    team_match_features["total_passes"] /
    team_match_features["total_events"].replace(0, np.nan)
)

team_match_features.head()


Unnamed: 0,matchId,teamId,total_passes,total_shots,shots_on_target,fouls_committed,tackles,clearances,successful_events,successful_passes,eventId,total_events,pass_success_rate,shot_accuracy,possession_ratio
0,2499719,1609,606,27,10,9,65,19,750,513,1032,642,0.846535,0.37037,0.943925
1,2499719,1631,230,7,4,12,82,21,370,166,736,249,0.721739,0.571429,0.923695
2,2499720,1625,754,12,3,9,44,4,842,682,1053,775,0.904509,0.25,0.972903
3,2499720,1651,184,6,2,6,61,27,259,124,553,196,0.673913,0.333333,0.938776
4,2499721,1610,516,15,5,18,49,6,622,454,877,549,0.879845,0.333333,0.939891


In [8]:
# Build Team-Level Results (Win/Draw/Loss) from matches_eng

# Keep only needed columns from matches
print(matches_eng[["wyId", "team1.teamId", "team2.teamId", "team1.score", "team2.score"]].head())

matches_small = matches_eng[[
    "wyId",
    "team1.teamId", "team2.teamId",
    "team1.score", "team2.score"
]].copy()

matches_small = matches_small.rename(columns={
    "wyId": "matchId",
    "team1.teamId": "team1_id",
    "team2.teamId": "team2_id",
    "team1.score": "team1_goals",
    "team2.score": "team2_goals"
})

# Team 1 rows
team1_df = matches_small.rename(columns={
    "team1_id": "teamId",
    "team2_id": "opponentId",
    "team1_goals": "goals_for",
    "team2_goals": "goals_against"
}).copy()
team1_df["home_away"] = "home"

# Team 2 rows
team2_df = matches_small.rename(columns={
    "team2_id": "teamId",
    "team1_id": "opponentId",
    "team2_goals": "goals_for",
    "team1_goals": "goals_against"
}).copy()
team2_df["home_away"] = "away"

match_teams = pd.concat(
    [
        team1_df[["matchId", "teamId", "opponentId", "goals_for", "goals_against", "home_away"]],
        team2_df[["matchId", "teamId", "opponentId", "goals_for", "goals_against", "home_away"]]
    ],
    ignore_index=True
)

def get_result(row):
    if row["goals_for"] > row["goals_against"]:
        return "Win"
    elif row["goals_for"] < row["goals_against"]:
        return "Loss"
    else:
        return "Draw"

match_teams["result"] = match_teams.apply(get_result, axis=1)

result_map = {"Loss": 0, "Draw": 1, "Win": 2}
match_teams["result_label"] = match_teams["result"].map(result_map)

match_teams.head()


      wyId  team1.teamId  team2.teamId  team1.score  team2.score
0  2500089          1646          1659            1            2
1  2500090          1628          1627            2            0
2  2500091          1609          1673            1            0
3  2500092          1651          1612            0            4
4  2500093          1644          1611            0            1


Unnamed: 0,matchId,teamId,opponentId,goals_for,goals_against,home_away,result,result_label
0,2500089,1646,1659,1,2,home,Loss,0
1,2500090,1628,1627,2,0,home,Win,2
2,2500091,1609,1673,1,0,home,Win,2
3,2500092,1651,1612,0,4,home,Loss,0
4,2500093,1644,1611,0,1,home,Loss,0


In [9]:
# Merge Features with Results + Team Names

match_features = pd.merge(
    team_match_features,
    match_teams,
    on=["matchId", "teamId"],
    how="inner"
)

if "wyId" in teams.columns and "name" in teams.columns:
    team_name_map = teams.set_index("wyId")["name"].to_dict()
    match_features["team_name"] = match_features["teamId"].map(team_name_map)

result_cols = ["result", "result_label"]
other_cols = [c for c in match_features.columns if c not in result_cols]

match_features = match_features[other_cols + result_cols]

match_features.head()


Unnamed: 0,matchId,teamId,total_passes,total_shots,shots_on_target,fouls_committed,tackles,clearances,successful_events,successful_passes,eventId,total_events,pass_success_rate,shot_accuracy,possession_ratio,opponentId,goals_for,goals_against,home_away,team_name,result,result_label
0,2499719,1609,606,27,10,9,65,19,750,513,1032,642,0.846535,0.37037,0.943925,1631,4,3,home,Arsenal,Win,2
1,2499719,1631,230,7,4,12,82,21,370,166,736,249,0.721739,0.571429,0.923695,1609,3,4,away,Leicester City,Loss,0
2,2499720,1625,754,12,3,9,44,4,842,682,1053,775,0.904509,0.25,0.972903,1651,2,0,away,Manchester City,Win,2
3,2499720,1651,184,6,2,6,61,27,259,124,553,196,0.673913,0.333333,0.938776,1625,0,2,home,Brighton & Hove Albion,Loss,0
4,2499721,1610,516,15,5,18,49,6,622,454,877,549,0.879845,0.333333,0.939891,1646,2,3,away,Chelsea,Loss,0


In [10]:
# Round key ratio features
round_cols = ["pass_success_rate", "shot_accuracy", "possession_ratio"]

for col in round_cols:
    if col in match_features.columns:
        match_features[col] = match_features[col].round(4)   # 4 decimals


In [11]:
# Save Final Match-Level Feature Table

output_path = f"{DATA_DIR}/match_features_England_basic.csv"
match_features.to_csv(output_path, index=False)

print("Saved match-level features to:", output_path)



Saved match-level features to: /Users/hetavvyas/Downloads/Project_Data/match_features_England_basic.csv
