In [1]:
# ------------------------------------------------------
# 1. Load data
# ------------------------------------------------------
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Read every processed CSV in one pass; the target names on the left line up
# positionally with the paths listed below.
(matches_weather, squads, goals, bookings, substitutions, matchbox) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data-processed/matches_clean_weather.csv',
        '../data-processed/squads_clean.csv',
        '../data-processed/goals_clean.csv',
        '../data-processed/bookings_clean.csv',
        '../data-processed/substitutions_clean.csv',
        '../data-processed/MatchBoxData_clean.csv',
    )
)

print("All datasets loaded.")

All datasets loaded.


In [2]:
# ------------------------------------------------------
# 2. Merge pre-match context
# ------------------------------------------------------
# 1. Extract year
# tournament_id is split on '-' and its second piece is parsed as the year;
# the squads year is taken from the calendar year of rank_date.
matches_weather['year'] = matches_weather['tournament_id'].apply(lambda x: int(x.split('-')[1]))
squads['year'] = pd.to_datetime(squads['rank_date']).dt.year

# 2. Name standardisation
# Only spellings that actually change are listed; any name not present here is
# left untouched by .replace().
# NOTE(review): "IR Iran" and "Ivory Coast" are deliberately NOT remapped. The
# previous dict literal listed each of those keys twice, and because Python keeps
# the last value for a repeated key they resolved to identity mappings. Confirm
# these spellings match squads['country_full']; if they do not, those teams get
# NaN rank/age from the left merges below and fall back to a 0 diff.
name_map = {
    "Republic of Ireland": "Republic Of Ireland",
    "Trinidad and Tobago": "Trinidad And Tobago",
    "Serbia and Montenegro": "Serbia And Montenegro",
    "Bosnia and Herzegovina": "Bosnia And Herzegovina",
    "China": "China PR",
    "North Korea": "Korea DPR",
    "South Korea": "Korea Republic",
}
matches_weather['home_team_name'] = matches_weather['home_team_name'].replace(name_map)
matches_weather['away_team_name'] = matches_weather['away_team_name'].replace(name_map)

# 3. Merge ranks and ages
# The same squads lookup is joined twice (once per side). Left joins keep every
# match; teams without a squads row get NaN rank/age.
# We rename median_age to home_age/away_age here so they exist in matches_merged
squad_lookup = squads[['country_full', 'year', 'rank', 'median_age']]

matches_merged = pd.merge(
    matches_weather, squad_lookup,
    left_on=['home_team_name', 'year'], right_on=['country_full', 'year'], how='left',
).rename(columns={'rank': 'home_rank', 'median_age': 'home_age'})

matches_merged = pd.merge(
    matches_merged, squad_lookup,
    left_on=['away_team_name', 'year'], right_on=['country_full', 'year'], how='left',
).rename(columns={'rank': 'away_rank', 'median_age': 'away_age'})

# 4. Calculate diffs (We keep these and the raw ages now)
# rank_diff is away minus home; age_diff is home minus away.
# A missing rank or age on either side yields NaN, which is replaced by 0.
matches_merged['rank_diff'] = (matches_merged['away_rank'] - matches_merged['home_rank']).fillna(0)
matches_merged['age_diff'] = (matches_merged['home_age'] - matches_merged['away_age']).fillna(0)

print("Pre-match variables ready (including raw ages).")

Pre-match variables ready (including raw ages).


In [3]:
# ------------------------------------------------------
# 3. Engine
# ------------------------------------------------------
def get_weather_val(minutes_from_start_hour, val0, val1, val2):
    """Pick one of three readings based on elapsed minutes.

    Returns ``val0`` when ``minutes_from_start_hour`` is below 60, ``val1``
    when it is below 120, and ``val2`` otherwise.
    """
    if minutes_from_start_hour < 60:
        return val0
    return val1 if minutes_from_start_hour < 120 else val2

def create_15min_chunks(matches_df, goals_df, bookings_df, subs_df):
    """Expand each match into six cumulative snapshots (minutes 15 to 90).

    For every row of ``matches_df`` and every checkpoint in 15, 30, 45, 60,
    75, 90, one output row is produced holding:

    * counts of goals, red/yellow cards and substitutions recorded at or
      before the checkpoint, split by the ``home_team`` / ``away_team`` flags
      of the event tables and combined into diffs and totals;
    * weather readings chosen by ``get_weather_val`` from the match's
      ``*0`` / ``*1`` / ``*2`` columns, using the kickoff minute-of-hour plus
      the checkpoint minute (plus 15 for checkpoints after minute 45);
    * pre-match context copied from the match row (rank diff, ages, kickoff
      hour, extra-time and shootout flags);
    * ``target_home_win``: 1 when ``home_team_score`` exceeds
      ``away_team_score``, else 0.

    Returns a DataFrame built from the collected row dictionaries.
    """
    checkpoints = (15, 30, 45, 60, 75, 90)

    def tally(events, minute_col, cutoff, side_col, flag_col=None):
        # Number of events for one side at or before the cutoff, optionally
        # restricted to rows whose flag column equals True.
        selector = (events[minute_col] <= cutoff) & (events[side_col] == 1)
        if flag_col is not None:
            selector = selector & (events[flag_col] == True)
        return len(events[selector])

    records = []

    for _, fixture in matches_df.iterrows():
        fixture_id = fixture['match_id']
        kickoff = pd.to_datetime(fixture['match_time'], format='%H:%M')
        minutes_past_hour = kickoff.minute

        # Narrow each event table to this match once, outside the checkpoint loop
        fixture_goals = goals_df[goals_df['match_id'] == fixture_id]
        fixture_cards = bookings_df[bookings_df['match_id'] == fixture_id]
        fixture_subs = subs_df[subs_df['match_id'] == fixture_id]

        for checkpoint in checkpoints:
            # Minutes elapsed since the top of the kickoff hour; checkpoints
            # after minute 45 carry an extra 15 minutes
            break_minutes = 15 if checkpoint > 45 else 0
            clock_minutes = minutes_past_hour + checkpoint + break_minutes

            temp_now = get_weather_val(clock_minutes, fixture['temp0'], fixture['temp1'], fixture['temp2'])
            humid_now = get_weather_val(clock_minutes, fixture['humid0'], fixture['humid1'], fixture['humid2'])
            wind_now = get_weather_val(clock_minutes, fixture['wind0'], fixture['wind1'], fixture['wind2'])
            atemp_now = get_weather_val(clock_minutes, fixture['atemp0'], fixture['atemp1'], fixture['atemp2'])

            # Cumulative event counts per side
            goals_h = tally(fixture_goals, 'minute_regulation', checkpoint, 'home_team')
            goals_a = tally(fixture_goals, 'minute_regulation', checkpoint, 'away_team')

            reds_h = tally(fixture_cards, 'card_minute', checkpoint, 'home_team', 'is_red')
            reds_a = tally(fixture_cards, 'card_minute', checkpoint, 'away_team', 'is_red')

            yellows_h = tally(fixture_cards, 'card_minute', checkpoint, 'home_team', 'is_yellow')
            yellows_a = tally(fixture_cards, 'card_minute', checkpoint, 'away_team', 'is_yellow')

            subs_h = tally(fixture_subs, 'minute_regulation', checkpoint, 'home_team')
            subs_a = tally(fixture_subs, 'minute_regulation', checkpoint, 'away_team')

            if fixture['home_team_score'] > fixture['away_team_score']:
                home_win = 1
            else:
                home_win = 0

            records.append({
                'match_id': fixture_id,
                'tournament_id': fixture['tournament_id'],
                'home_team_name': fixture['home_team_name'],
                'away_team_name': fixture['away_team_name'],
                'chunk_minute': checkpoint,
                'time_remaining': 90 - checkpoint,

                # Dynamic diffs: score is home minus away, cards are away minus home
                'score_diff': goals_h - goals_a,
                'red_diff': reds_a - reds_h,
                'yellow_diff': yellows_a - yellows_h,

                # Totals
                'game_total_reds': reds_h + reds_a,
                'game_total_yellows': yellows_h + yellows_a,
                'total_subs': subs_h + subs_a,
                'home_subs': subs_h,
                'away_subs': subs_a,

                # Weather & context
                'temperature': temp_now,
                'apparent_temp': atemp_now,
                'humidity': humid_now,
                'wind_speed': wind_now,
                'pre_match_rank_diff': fixture['rank_diff'],
                'kickoff_hour': kickoff.hour,
                'is_extra_time': 1 if fixture['extra_time'] == 1 else 0,
                'is_penalty_shootout': 1 if fixture['penalty_shootout'] == 1 else 0,

                # Raw ages
                'squad_age_diff': fixture['age_diff'],
                'home_age': fixture['home_age'],
                'away_age': fixture['away_age'],

                'target_home_win': home_win,
            })

    return pd.DataFrame(records)

# Build the chunked table from the merged matches and the three event tables.
chunk_data = create_15min_chunks(
    matches_df=matches_merged,
    goals_df=goals,
    bookings_df=bookings,
    subs_df=substitutions,
)
print(f"Generated {len(chunk_data)} rows.")

Generated 2304 rows.


In [4]:
# ------------------------------------------------------
# 4. Integrate MatchBox
# ------------------------------------------------------
# 1. Select MatchBox Data
# These columns are joined onto every chunk row of a match, so all chunks of
# the same match carry identical MatchBox values.
matchbox_totals = matchbox[['tournament_id', 'hname', 'aname', 'hshots', 'ashots', 'hshotsOnTarget', 'ashotsOnTarget', 'hPossession', 'hfouls', 'afouls']].copy()

# 2. Standardise Names
name_map_mb = {
    "Bosnia and Herzegovina": "Bosnia And Herzegovina", "China": "China PR", "North Korea": "Korea DPR",
    "South Korea": "Korea Republic", "Republic of Ireland": "Republic Of Ireland", "Serbia & Montenegro": "Serbia And Montenegro", 
    "Trinidad and Tobago": "Trinidad And Tobago", "Trinidad & Tobago": "Trinidad And Tobago", "Iran": "IR Iran",
    "Ivory Coast": "Ivory Coast", "Côte d'Ivoire": "Ivory Coast", "USA": "United States", "United States": "United States",
    "Saudi Arabia": "Saudi Arabia", "Angola": "Angola"
}
matchbox_totals['hname'] = matchbox_totals['hname'].replace(name_map_mb)
matchbox_totals['aname'] = matchbox_totals['aname'].replace(name_map_mb)

# De-duplicate AFTER standardising names: two raw spellings can collapse to the
# same standardised name, and a duplicated join key would multiply chunk rows
# in the merges below.
# NOTE(review): the key is (tournament_id, hname, aname) and the first row wins;
# if the same pairing occurs twice in one tournament only the first MatchBox row
# is used - confirm this is acceptable.
matchbox_totals = matchbox_totals.drop_duplicates(subset=['tournament_id', 'hname', 'aname'])

# 3. Merge
merged_standard = pd.merge(chunk_data, matchbox_totals, left_on=['tournament_id', 'home_team_name', 'away_team_name'], right_on=['tournament_id', 'hname', 'aname'], how='left')

# 4. Recovery Logic (Swap hosts)
# Rows whose hPossession is still NaN after the direct merge are retried with
# home/away reversed on the MatchBox side.
stat_cols = ['hname', 'aname', 'hshots', 'ashots', 'hshotsOnTarget', 'ashotsOnTarget', 'hPossession', 'hfouls', 'afouls']
valid_rows = merged_standard[merged_standard['hPossession'].notna()]
failed_rows = merged_standard[merged_standard['hPossession'].isna()].drop(columns=stat_cols)

merged_swapped = pd.merge(failed_rows, matchbox_totals, left_on=['tournament_id', 'home_team_name', 'away_team_name'], right_on=['tournament_id', 'aname', 'hname'], how='inner')

# The MatchBox row is oriented the other way round, so swap every h*/a* pair
# and mirror possession to express it from the chunk row's home side.
merged_swapped = merged_swapped.rename(columns={
    'hshots': 'ashots', 'ashots': 'hshots',
    'hshotsOnTarget': 'ashotsOnTarget', 'ashotsOnTarget': 'hshotsOnTarget',
    'hfouls': 'afouls', 'afouls': 'hfouls'
})
merged_swapped['hPossession'] = 100 - merged_swapped['hPossession']

# The inner merge discards rows that match in neither orientation; report that
# instead of letting them disappear silently.
unmatched_rows = len(failed_rows) - len(merged_swapped)
if unmatched_rows > 0:
    print(f"Warning: {unmatched_rows} chunk rows had no MatchBox record in either orientation and were dropped.")

final_dataset = pd.concat([valid_rows, merged_swapped], ignore_index=True)

# 5. Fix Possession (0/100 -> 50)
# Values at or beyond the 0/100 bounds are blanked and then, together with any
# remaining NaN, replaced by 50. The fill is assigned back to the column because
# a chained `df[col].fillna(..., inplace=True)` does not update the frame under
# pandas Copy-on-Write.
out_of_range = (final_dataset['hPossession'] <= 0) | (final_dataset['hPossession'] >= 100)
final_dataset.loc[out_of_range, 'hPossession'] = np.nan
final_dataset['hPossession'] = final_dataset['hPossession'].fillna(50.0)

# Calculate totals
final_dataset['game_total_shots'] = final_dataset['hshots'] + final_dataset['ashots']
final_dataset['game_total_sot'] = final_dataset['hshotsOnTarget'] + final_dataset['ashotsOnTarget']
final_dataset['game_total_fouls'] = final_dataset['hfouls'] + final_dataset['afouls']

# Also calculate the diffs (Legacy support)
# Shot diffs are home minus away; foul_diff is away minus home.
final_dataset['total_shot_diff'] = final_dataset['hshots'] - final_dataset['ashots']
final_dataset['total_sot_diff'] = final_dataset['hshotsOnTarget'] - final_dataset['ashotsOnTarget']
final_dataset['foul_diff'] = final_dataset['afouls'] - final_dataset['hfouls']

# Drop only names and raw single-team stats 
# (We keep 'home_age', 'away_age', 'game_total_reds', 'total_subs' because they are already in the df)
cols_cleanup = ['hname', 'aname', 'hshots', 'ashots', 'hshotsOnTarget', 'ashotsOnTarget', 'hfouls', 'afouls']
final_dataset = final_dataset.drop(columns=cols_cleanup)

final_dataset = final_dataset.sort_values(['tournament_id', 'match_id', 'chunk_minute'])

# Save
final_dataset.to_csv('../data-processed/hypothesis_data.csv', index=False)
print("Saved: hypothesis_data.csv")

Saved: hypothesis_data.csv
