# CS 171 Final Project — Data Pre-processing Notebook

## **Goal for Today**
- Build the initial **clean match-level dataset** from the Soccer Match Event files.

## **What I Worked On**
- Loaded key CSV files (England data plus shared lookup tables):
  - `events_England.csv`
  - `matches_England.csv`
  - `games.csv`
  - `teams.csv`
  - `players.csv`
- Checked missing values and removed duplicates.
- Merged **event-level data** with **match metadata**.
- Created basic match-level features:
  - total events  
  - number of passes  
  - number of shots  
  - number of fouls  
- Extracted **match result** (Win / Draw / Loss, from the home team's perspective) by parsing the final score in each match `label`.
- Saved the first simple feature table.

## **Dataset Used**
- **Soccer Match Event Dataset (Kaggle)**
  - Contains detailed in-game event logs and match information across multiple leagues.
  - Includes actions, matches, competitions, players, teams, and event tags.

In [1]:
import pandas as pd
import numpy as np
import os

# Folder that holds the Soccer Match Event CSV files. Setting the
# PROJECT_DATA_DIR environment variable overrides the default below, so the
# notebook can run on another machine without editing this cell.
DATA_DIR = os.environ.get("PROJECT_DATA_DIR", "/Users/hetavvyas/Downloads/Project_Data")

# os.listdir returns entries in arbitrary order; sorting keeps the preview
# identical from run to run.
print("Files in folder:", sorted(os.listdir(DATA_DIR))[:10])

Files in folder: ['matches_Germany.csv', 'matches_Spain.csv', 'events_Spain.csv', 'labels.csv', 'competitions.csv', '.DS_Store', 'teams.csv', 'referees.csv', 'matches_European_Championship.csv', 'coaches.csv']


In [2]:
# Read the England event log and match table, followed by the games, teams
# and players tables, from DATA_DIR. The names on the left line up one-to-one
# with the file names listed below.
events_eng, matches_eng, games, teams, players = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in (
        "events_England.csv",
        "matches_England.csv",
        "games.csv",
        "teams.csv",
        "players.csv",
    )
)

# Sanity check: (rows, columns) of the two England frames.
print(events_eng.shape, matches_eng.shape)

(643090, 17) (380, 38)


In [3]:
# Quick check of DataFrame preview, types, and missing values
def quick_check(df, name):
    """Print a banner for ``name`` and show three summaries of ``df``.

    Shown in order: the first rows, the dtype of every column, and the ten
    columns with the most missing values (largest count first).

    Relies on ``display`` being available in the notebook namespace.
    """
    print(f"\n--- {name} ---")

    preview = df.head()
    display(preview)

    column_types = df.dtypes.to_frame("dtype")
    display(column_types)

    missing_per_column = df.isna().sum()
    most_missing = missing_per_column.sort_values(ascending=False).head(10)
    display(most_missing.to_frame("missing_values"))


In [4]:
from IPython.display import display

# Run the same preview / dtype / missing-value report on both England frames.
for frame, frame_name in ((events_eng, "events_eng"), (matches_eng, "matches_eng")):
    quick_check(frame, frame_name)


--- events_eng ---


Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,tagsList,pos_orig_y,pos_orig_x,pos_dest_y,pos_dest_x
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85.0,177959171,[1801],49,49,78,31
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.94685,83.0,177959172,[1801],78,31,75,51
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82.0,177959173,[1801],75,51,71,35
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82.0,177959174,[1801],71,35,95,41
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85.0,177959175,[1801],95,41,88,72


Unnamed: 0,dtype
eventId,int64
subEventName,object
tags,object
playerId,int64
positions,object
matchId,int64
eventName,object
teamId,int64
matchPeriod,object
eventSec,float64


Unnamed: 0,missing_values
tagsList,61475
subEventId,1558
subEventName,1558
eventId,0
eventSec,0
pos_dest_y,0
pos_orig_x,0
pos_orig_y,0
id,0
matchPeriod,0



--- matches_eng ---


Unnamed: 0,status,roundId,gameweek,teamsData,seasonId,dateutc,winner,venue,wyId,label,...,team2.side,team2.teamId,team2.score,team2.scoreP,team2.hasFormation,team2.formation,team2.scoreHT,team2.formation.bench,team2.formation.lineup,team2.formation.substitutions
0,Played,4405654,38,"{'1646': {'scoreET': 0, 'coachId': 8880, 'side...",181150,2018-05-13 14:00:00,1659,Turf Moor,2500089,"Burnley - AFC Bournemouth, 1 - 2",...,away,1659,2,0,1,"{'bench': [{'playerId': 11061, 'ownGoals': '0'...",0,"[{'playerId': 11061, 'ownGoals': '0', 'redCard...","[{'playerId': 259531, 'ownGoals': '0', 'redCar...","[{'playerIn': 7989, 'playerOut': 259531, 'minu..."
1,Played,4405654,38,"{'1628': {'scoreET': 0, 'coachId': 8357, 'side...",181150,2018-05-13 14:00:00,1628,Selhurst Park,2500090,"Crystal Palace - West Bromwich Albion, 2 - 0",...,away,1627,0,0,1,"{'bench': [{'playerId': 399517, 'ownGoals': '0...",0,"[{'playerId': 399517, 'ownGoals': '0', 'redCar...","[{'playerId': 25537, 'ownGoals': '0', 'redCard...","[{'playerIn': 261, 'playerOut': 25537, 'minute..."
2,Played,4405654,38,"{'1609': {'scoreET': 0, 'coachId': 7845, 'side...",181150,2018-05-13 14:00:00,1609,The John Smith's Stadium,2500091,"Huddersfield Town - Arsenal, 0 - 1",...,home,1673,0,0,1,"{'bench': [{'playerId': 274482, 'ownGoals': '0...",0,"[{'playerId': 274482, 'ownGoals': '0', 'redCar...","[{'playerId': 9419, 'ownGoals': '0', 'redCards...","[{'playerIn': 38377, 'playerOut': 9419, 'minut..."
3,Played,4405654,38,"{'1651': {'scoreET': 0, 'coachId': 8093, 'side...",181150,2018-05-13 14:00:00,1612,Anfield,2500092,"Liverpool - Brighton & Hove Albion, 4 - 0",...,home,1612,4,0,1,"{'bench': [{'playerId': 74, 'ownGoals': '0', '...",2,"[{'playerId': 74, 'ownGoals': '0', 'redCards':...","[{'playerId': 25747, 'ownGoals': '2', 'redCard...","[{'playerIn': 8140, 'playerOut': 25747, 'minut..."
4,Played,4405654,38,"{'1644': {'scoreET': 0, 'coachId': 93112, 'sid...",181150,2018-05-13 14:00:00,1611,Old Trafford,2500093,"Manchester United - Watford, 1 - 0",...,home,1611,1,0,1,"{'bench': [{'playerId': 7918, 'ownGoals': '0',...",1,"[{'playerId': 7918, 'ownGoals': '0', 'redCards...","[{'playerId': 7939, 'ownGoals': '0', 'redCards...","[{'playerIn': 8135, 'playerOut': 7939, 'minute..."


Unnamed: 0,dtype
status,object
roundId,int64
gameweek,int64
teamsData,object
seasonId,int64
dateutc,object
winner,int64
venue,object
wyId,int64
label,object


Unnamed: 0,missing_values
team1.formation.substitutions,3
team2.formation.substitutions,2
team2.side,0
team1.formation,0
team1.scoreHT,0
team1.formation.bench,0
team1.formation.lineup,0
team2.scoreET,0
team2.coachId,0
team2.teamId,0


In [5]:
# De-duplicate both frames (rows are compared across all columns),
# then report the resulting shapes.
events_eng, matches_eng = (
    frame.drop_duplicates() for frame in (events_eng, matches_eng)
)

print("After duplicates removed:", events_eng.shape, matches_eng.shape)

After duplicates removed: (643090, 17) (380, 38)


In [6]:
# --- events: numeric event time, then require match and team identifiers ---
# errors="coerce" turns unparseable values into NaN instead of raising.
if "eventSec" in events_eng:
    events_eng["eventSec"] = pd.to_numeric(events_eng["eventSec"], errors="coerce")
events_eng = events_eng.dropna(subset=["matchId", "teamId"])

# --- matches: timezone-aware (UTC) match date, then require the match identifier ---
# Unparseable dates become NaT rather than raising.
if "dateutc" in matches_eng:
    matches_eng["dateutc"] = pd.to_datetime(matches_eng["dateutc"], utc=True, errors="coerce")
matches_eng = matches_eng.dropna(subset=["wyId"])

In [7]:
# Trim each frame to the columns used downstream and give both the same join
# key name ("match_id"); the match date column is shortened to "date".
events_eng_small = events_eng[
    ["matchId", "teamId", "playerId", "eventName", "subEventName", "eventSec", "tags"]
].rename(columns={"matchId": "match_id"})

matches_eng_small = matches_eng[["wyId", "label", "dateutc"]].rename(
    columns={"wyId": "match_id", "dateutc": "date"}
)

# Preview both trimmed frames.
display(events_eng_small.head(), matches_eng_small.head())

Unnamed: 0,match_id,teamId,playerId,eventName,subEventName,eventSec,tags
0,2499719,1609,25413,Pass,Simple pass,2.758649,[{'id': 1801}]
1,2499719,1609,370224,Pass,High pass,4.94685,[{'id': 1801}]
2,2499719,1609,3319,Pass,Head pass,6.542188,[{'id': 1801}]
3,2499719,1609,120339,Pass,Head pass,8.143395,[{'id': 1801}]
4,2499719,1609,167145,Pass,Simple pass,10.302366,[{'id': 1801}]


Unnamed: 0,match_id,label,date
0,2500089,"Burnley - AFC Bournemouth, 1 - 2",2018-05-13 14:00:00+00:00
1,2500090,"Crystal Palace - West Bromwich Albion, 2 - 0",2018-05-13 14:00:00+00:00
2,2500091,"Huddersfield Town - Arsenal, 0 - 1",2018-05-13 14:00:00+00:00
3,2500092,"Liverpool - Brighton & Hove Albion, 4 - 0",2018-05-13 14:00:00+00:00
4,2500093,"Manchester United - Watford, 1 - 0",2018-05-13 14:00:00+00:00


In [8]:
# Attach match metadata (label, date) to every event via match_id.
# how="left" keeps every event row even if its match is missing from the
# match table. validate="many_to_one" makes pandas raise MergeError when
# match_id is repeated in matches_eng_small; without it a repeated match would
# silently duplicate event rows and inflate every per-match count built later.
events_merged = events_eng_small.merge(
    matches_eng_small,
    on="match_id",
    how="left",
    validate="many_to_one",
)

print(events_merged.shape)
events_merged.head()

(643090, 9)


Unnamed: 0,match_id,teamId,playerId,eventName,subEventName,eventSec,tags,label,date
0,2499719,1609,25413,Pass,Simple pass,2.758649,[{'id': 1801}],"Arsenal - Leicester City, 4 - 3",2017-08-11 18:45:00+00:00
1,2499719,1609,370224,Pass,High pass,4.94685,[{'id': 1801}],"Arsenal - Leicester City, 4 - 3",2017-08-11 18:45:00+00:00
2,2499719,1609,3319,Pass,Head pass,6.542188,[{'id': 1801}],"Arsenal - Leicester City, 4 - 3",2017-08-11 18:45:00+00:00
3,2499719,1609,120339,Pass,Head pass,8.143395,[{'id': 1801}],"Arsenal - Leicester City, 4 - 3",2017-08-11 18:45:00+00:00
4,2499719,1609,167145,Pass,Simple pass,10.302366,[{'id': 1801}],"Arsenal - Leicester City, 4 - 3",2017-08-11 18:45:00+00:00


In [9]:
# Total event count per match: one row per match_id.
match_event_counts = events_merged.groupby("match_id").size().reset_index(name="total_events")

# Per-match counts for each event type of interest. A match with no events of
# a given type is simply absent from that type's table.
passes = events_merged[events_merged["eventName"] == "Pass"].groupby("match_id").size().reset_index(name="total_passes")
shots = events_merged[events_merged["eventName"] == "Shot"].groupby("match_id").size().reset_index(name="total_shots")
fouls = events_merged[events_merged["eventName"] == "Foul"].groupby("match_id").size().reset_index(name="total_fouls")

# Combine into one table; the left joins keep every match from match_event_counts.
match_features = (
    match_event_counts
    .merge(passes, on="match_id", how="left")
    .merge(shots, on="match_id", how="left")
    .merge(fouls, on="match_id", how="left")
)

# A gap left by the joins means "zero events of that type". The NaN also turns
# the column into float, so cast back to an integer count after filling.
for col in ["total_passes", "total_shots", "total_fouls"]:
    match_features[col] = match_features[col].fillna(0).astype("int64")

# Add match label (target). validate="one_to_one" raises MergeError if match_id
# is repeated on either side, instead of silently duplicating feature rows.
match_features = match_features.merge(
    matches_eng_small[["match_id", "label"]],
    on="match_id",
    how="left",
    validate="one_to_one",
)

match_features.head()

Unnamed: 0,match_id,total_events,total_passes,total_shots,total_fouls,label
0,2499719,1768,836,34,21,"Arsenal - Leicester City, 4 - 3"
1,2499720,1606,938,18,15,"Brighton & Hove Albion - Manchester City, 0 - 2"
2,2499721,1565,812,24,29,"Chelsea - Burnley, 2 - 3"
3,2499722,1513,643,19,26,"Crystal Palace - Huddersfield Town, 0 - 3"
4,2499723,1649,749,18,23,"Everton - Stoke City, 1 - 0"


In [10]:
# Convert match labels into clean Win/Draw/Loss outcomes
def extract_result(label):
    """Turn a match label into the outcome for the first-listed (home) side.

    Parameters
    ----------
    label : str
        Text of the form ``"<home> - <away>, <home goals> - <away goals>"``,
        e.g. ``"Arsenal - Leicester City, 4 - 3"``. Only the part after the
        last comma is parsed.

    Returns
    -------
    str or float
        ``"Win"``, ``"Loss"`` or ``"Draw"``; ``np.nan`` when ``label`` is not a
        string or its score cannot be parsed as exactly two integers.
    """
    try:
        # The score is whatever follows the last comma in the label.
        score = label.rsplit(",", 1)[-1].strip()
        home_goals, away_goals = (int(part) for part in score.split("-"))
    except (AttributeError, TypeError, ValueError):
        # Only parsing failures map to NaN (non-string label, wrong number of
        # parts, non-numeric goals). Anything else, such as KeyboardInterrupt,
        # propagates instead of being silently swallowed.
        return np.nan

    if home_goals > away_goals:
        return "Win"
    if home_goals < away_goals:
        return "Loss"
    return "Draw"

# Label every match with its outcome. Labels that extract_result could not
# parse come back as NaN and those rows are discarded here.
match_features = match_features.assign(
    result=match_features["label"].apply(extract_result)
).dropna(subset=["result"])

match_features.loc[:, ["match_id", "label", "result"]].head()

Unnamed: 0,match_id,label,result
0,2499719,"Arsenal - Leicester City, 4 - 3",Win
1,2499720,"Brighton & Hove Albion - Manchester City, 0 - 2",Loss
2,2499721,"Chelsea - Burnley, 2 - 3",Loss
3,2499722,"Crystal Palace - Huddersfield Town, 0 - 3",Loss
4,2499723,"Everton - Stoke City, 1 - 0",Win


In [11]:
# Persist the match-level feature table as CSV inside DATA_DIR
# (the row index is not written), then report where it went.
out_path = os.path.join(DATA_DIR, "match_features_England_basic.csv")
match_features.to_csv(path_or_buf=out_path, index=False)
print("Saved:", out_path)

Saved: /Users/hetavvyas/Downloads/Project_Data/match_features_England_basic.csv
