In [None]:
import os
import json
import pandas as pd

# Directory holding one event JSON file per match, and the CSV the raw shots are written to.
BASE_PATH = 'C:/Users/Acer/Desktop/analytics/individual project/analytics/open-data/data/events'
OUTPUT_RAW_CSV = 'C:/Users/Acer/Desktop/analytics/raw_shots_data.csv'

# Find all JSON event files. os.listdir returns entries in arbitrary order, so sort
# to make the row order of the exported CSV reproducible between runs.
event_files = sorted(f for f in os.listdir(BASE_PATH) if f.endswith('.json'))
print(f" Found {len(event_files)} match files.")

all_shots = []

for file_name in event_files:
    # Strip only the trailing extension; the file stem is used as the match id.
    match_id = os.path.splitext(file_name)[0]
    file_path = os.path.join(BASE_PATH, file_name)

    # Keep the file open only for as long as it takes to read it.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Flatten nested event dicts into dotted column names (e.g. 'type.name').
    df = pd.json_normalize(data)

    # A file with no events normalizes to a frame without a 'type.name' column;
    # skip it instead of failing with a KeyError.
    if 'type.name' not in df.columns:
        continue

    # Filter for shots only
    shots = df[df['type.name'] == 'Shot'].copy()
    if shots.empty:
        continue

    shots['match_id'] = match_id
    all_shots.append(shots)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the actual problem instead.
if not all_shots:
    raise ValueError(f"No shot events found in any JSON file under: {BASE_PATH}")

# Combine all shots
raw_shots_df = pd.concat(all_shots, ignore_index=True)

# Save raw file
raw_shots_df.to_csv(OUTPUT_RAW_CSV, index=False)
print(f"Raw shot data saved to: {OUTPUT_RAW_CSV}")
raw_shots_df.head()


 Found 3433 match files.
Raw shot data saved to: C:/Users/Acer/Desktop/analytics/raw_shots_data.csv


Unnamed: 0,id,index,period,timestamp,minute,second,possession,duration,type.id,type.name,...,shot.redirect,goalkeeper.lost_out,player_off.permanent,goalkeeper.lost_in_play,goalkeeper.success_out,goalkeeper.success_in_play,goalkeeper.saved_to_post,half_end.early_video_end,shot.kick_off,goalkeeper.penalty_saved_to_post
0,becd7956-ce44-479e-8fc9-16a2d1f1f349,137,1,00:02:29.477,2,29,6,1.075902,16,Shot,...,,,,,,,,,,
1,9107d374-2942-4876-a14f-1b9f86901c15,262,1,00:05:39.641,5,39,12,0.807592,16,Shot,...,,,,,,,,,,
2,ddd194ca-08fb-43d0-87c2-33647f975f9f,715,1,00:15:29.059,15,29,23,0.979318,16,Shot,...,,,,,,,,,,
3,86596ddb-d824-4e5e-b18c-b4442e9ce7cf,743,1,00:16:20.072,16,20,30,0.312149,16,Shot,...,,,,,,,,,,
4,3ed2b107-be17-42d5-9d1b-25006a0e55cb,802,1,00:18:16.362,18,16,33,0.937618,16,Shot,...,,,,,,,,,,


In [None]:
import pandas as pd
from ast import literal_eval

raw_csv_path = 'C:/Users/Acer/Desktop/analytics/raw_shots_data.csv'
clean_csv_path = 'C:/Users/Acer/Desktop/analytics/clean_shots_data.csv'


def _parse_xy(value):
    """Return the first two coordinates of one 'location' cell as an (x, y) tuple.

    After the CSV round trip a location is text such as "[111.5, 52.9]", so it is
    parsed with ast.literal_eval. A value that is missing, cannot be parsed, or is
    not a sequence of at least two items yields (nan, nan), so a single bad row
    cannot prevent every other row from being parsed.
    """
    missing = (float('nan'), float('nan'))
    if isinstance(value, str):
        try:
            value = literal_eval(value)
        except (ValueError, SyntaxError, TypeError):
            return missing
    if not isinstance(value, (list, tuple)) or len(value) < 2:
        return missing
    # Only the first two items are kept, so longer sequences do not break the x/y split.
    return value[0], value[1]


df = pd.read_csv(raw_csv_path, low_memory=False)

# columns for modeling/visualization
columns_to_keep = [
    'match_id',
    'player.name',
    'team.name',
    'minute',
    'second',
    'location',
    'shot.outcome.name',
    'shot.statsbomb_xg',
    'shot.body_part.name',
    'shot.technique.name',
    'shot.type.name',
    'shot.first_time',
    'under_pressure'
]

# Keep only the requested columns that exist in the raw file, and say which ones
# are absent so a later KeyError on them is not a surprise.
available_columns = [col for col in columns_to_keep if col in df.columns]
missing_columns = [col for col in columns_to_keep if col not in df.columns]
if missing_columns:
    print(f" Columns not found in raw data: {missing_columns}")
df = df[available_columns].copy()

#  Clean and extract x, y from location
if 'location' in df.columns:
    xy = pd.DataFrame(
        df['location'].map(_parse_xy).tolist(),
        index=df.index,
        columns=['x', 'y'],
    )
    unparsed = int((xy['x'].isna() | xy['y'].isna()).sum())
    if unparsed:
        print(f" Could not parse location for {unparsed} rows; x/y set to NaN.")
    df[['x', 'y']] = xy
    df = df.drop(columns='location')

# Save to clean CSV
df.to_csv(clean_csv_path, index=False)
print(f"Clean shot data saved to: {clean_csv_path}")

df.head()


Clean shot data saved to: C:/Users/Acer/Desktop/analytics/clean_shots_data.csv


Unnamed: 0,match_id,player.name,team.name,minute,second,shot.outcome.name,shot.statsbomb_xg,shot.body_part.name,shot.technique.name,shot.type.name,shot.first_time,under_pressure,x,y
0,15946,Lionel Andrés Messi Cuccittini,Barcelona,2,29,Off T,0.076992,Right Foot,Half Volley,Open Play,True,,111.5,52.9
1,15946,Jordi Alba Ramos,Barcelona,5,39,Off T,0.051668,Left Foot,Volley,Open Play,True,,113.9,26.4
2,15946,Lionel Andrés Messi Cuccittini,Barcelona,15,29,Saved,0.016932,Left Foot,Normal,Open Play,,,93.7,34.7
3,15946,Rubén Sobrino Pozuelo,Deportivo Alavés,16,20,Off T,0.122604,Head,Normal,Open Play,,True,109.2,39.1
4,15946,Luis Alberto Suárez Díaz,Barcelona,18,16,Off T,0.041751,Right Foot,Normal,Open Play,,,107.8,24.7


In [None]:
import pandas as pd


# Single source of truth for the output location so the saved path and the
# confirmation message cannot drift apart.
final_csv_path = 'C:/Users/Acer/Desktop/analytics/clean_shots_final.csv'

df = pd.read_csv('C:/Users/Acer/Desktop/analytics/clean_shots_data.csv')

# Check initial nulls
print("Missing values before cleaning:")
print(df.isnull().sum())

# Flag columns: a missing value means the flag was not set, i.e. False
# (not under pressure / not a first-time shot).
# `notna() & astype(bool)` gives the same result as fillna(False).astype(bool)
# without the object-dtype downcasting FutureWarning that fillna(False) emits.
# A flag column that is absent altogether is treated the same as "all missing".
for col in ['under_pressure', 'shot.first_time']:
    if col in df.columns:
        df[col] = df[col].notna() & df[col].astype(bool)
    else:
        df[col] = False

# Drop rows with missing xG, or with a missing x or y (invalid location).
df = df.dropna(subset=['shot.statsbomb_xg', 'x', 'y'])

# Fill missing categorical values with "Unknown"
categorical_cols = [
    'shot.body_part.name',
    'shot.technique.name',
    'shot.type.name',
    'shot.outcome.name'
]
for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].fillna("Unknown")


print("\n Missing values after cleaning:")
print(df.isnull().sum())

# Save enhanced clean file
df.to_csv(final_csv_path, index=False)
print(f"Final cleaned data saved to: {final_csv_path}")

df.head()


Missing values before cleaning:
match_id                   0
player.name                0
team.name                  0
minute                     0
second                     0
shot.outcome.name          0
shot.statsbomb_xg          0
shot.body_part.name        0
shot.technique.name        0
shot.type.name             0
shot.first_time        60769
under_pressure         66543
x                          0
y                          0
dtype: int64

 Missing values after cleaning:
match_id               0
player.name            0
team.name              0
minute                 0
second                 0
shot.outcome.name      0
shot.statsbomb_xg      0
shot.body_part.name    0
shot.technique.name    0
shot.type.name         0
shot.first_time        0
under_pressure         0
x                      0
y                      0
dtype: int64


  df['under_pressure'] = df['under_pressure'].fillna(False)
  df['shot.first_time'] = df['shot.first_time'].fillna(False)


Final cleaned data saved to: clean_shots_final_data.csv


Unnamed: 0,match_id,player.name,team.name,minute,second,shot.outcome.name,shot.statsbomb_xg,shot.body_part.name,shot.technique.name,shot.type.name,shot.first_time,under_pressure,x,y
0,15946,Lionel Andrés Messi Cuccittini,Barcelona,2,29,Off T,0.076992,Right Foot,Half Volley,Open Play,True,False,111.5,52.9
1,15946,Jordi Alba Ramos,Barcelona,5,39,Off T,0.051668,Left Foot,Volley,Open Play,True,False,113.9,26.4
2,15946,Lionel Andrés Messi Cuccittini,Barcelona,15,29,Saved,0.016932,Left Foot,Normal,Open Play,False,False,93.7,34.7
3,15946,Rubén Sobrino Pozuelo,Deportivo Alavés,16,20,Off T,0.122604,Head,Normal,Open Play,False,True,109.2,39.1
4,15946,Luis Alberto Suárez Díaz,Barcelona,18,16,Off T,0.041751,Right Foot,Normal,Open Play,False,False,107.8,24.7
