In [1]:
# ------------------------------------------------------
# 1. Load data
# ------------------------------------------------------
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

matches_weather = pd.read_csv('../data-processed/matches_clean_weather.csv')
squads = pd.read_csv('../data-processed/squads_clean.csv')
goals = pd.read_csv('../data-processed/goals_clean.csv')
bookings = pd.read_csv('../data-processed/bookings_clean.csv')
substitutions = pd.read_csv('../data-processed/substitutions_clean.csv')
matchbox = pd.read_csv('../data-processed/MatchBoxData_clean.csv')

print("All datasets loaded.")

All datasets loaded.


In [4]:
# ------------------------------------------------------
# 2. Merge pre-match rankings
# ------------------------------------------------------
# 1. Extract year for merging
matches_weather['year'] = matches_weather['tournament_id'].apply(lambda x: int(x.split('-')[1]))
squads['year'] = pd.to_datetime(squads['rank_date']).dt.year

# 2. Name standardisation
name_map = {
    # Fixes for capitalisation "And"/"Of"
    "Republic of Ireland": "Republic Of Ireland", 
    "Trinidad and Tobago": "Trinidad And Tobago",
    "Serbia and Montenegro": "Serbia And Montenegro",
    "Bosnia and Herzegovina": "Bosnia And Herzegovina",
    
    # Fixes for Asian teams
    "China": "China PR",     
    "China PR": "China PR",    
    "North Korea": "Korea DPR",
    "Korea DPR": "Korea DPR",
    "South Korea": "Korea Republic", 
    "Korea Republic": "Korea Republic",

    # Other common discrepancies
    "IR Iran": "Iran",      # Or 'Iran' -> 'IR Iran' depending on squads
    "Iran": "IR Iran",
    "Ivory Coast": "Côte d'Ivoire",
    "Côte d'Ivoire": "Ivory Coast"
}

# Apply the map to update the names in the dataframe
matches_weather['home_team_name'] = matches_weather['home_team_name'].replace(name_map)
matches_weather['away_team_name'] = matches_weather['away_team_name'].replace(name_map)

# 3. Merge home team rank
matches_merged = pd.merge(
    matches_weather, 
    squads[['country_full', 'year', 'rank']], 
    left_on=['home_team_name', 'year'], 
    right_on=['country_full', 'year'], 
    how='left'
).rename(columns={'rank': 'home_rank'})

# 4. Merge away team rank
matches_merged = pd.merge(
    matches_merged, 
    squads[['country_full', 'year', 'rank']], 
    left_on=['away_team_name', 'year'], 
    right_on=['country_full', 'year'], 
    how='left'
).rename(columns={'rank': 'away_rank'})

missing_home = matches_merged[matches_merged['home_rank'].isna()]['home_team_name'].unique()
missing_away = matches_merged[matches_merged['away_rank'].isna()]['away_team_name'].unique()

print("--- Diagnostic report ---")
if len(missing_home) > 0 or len(missing_away) > 0:
    print("WARNING: These teams still have no rank:")
    if len(missing_home) > 0: print(f"Missing Home: {list(missing_home)}")
    if len(missing_away) > 0: print(f"Missing Away: {list(missing_away)}")
else:
    print("All teams matched perfectly")

# 5. Calculate difference
matches_merged['rank_diff'] = matches_merged['away_rank'] - matches_merged['home_rank']
matches_merged['rank_diff'] = matches_merged['rank_diff'].fillna(0)

print("Rankings merged successfully.")

--- Diagnostic report ---
All teams matched perfectly
Rankings merged successfully.


In [5]:
# ------------------------------------------------------
# 3. Match intervals
# ------------------------------------------------------
def get_weather_val(minutes_from_start_hour, val0, val1, val2):
    """
    Select correct hourly reading based on how many minutes have passed 
    since the kick-off HOUR started.
    """
    if minutes_from_start_hour < 60:
        return val0
    elif minutes_from_start_hour < 120:
        return val1
    else:
        # Matches starting at :30 or going to extra-time often hit the 3rd hour
        return val2

def create_15min_chunks(matches_df, goals_df, bookings_df, subs_df):
    chunk_rows = []
    intervals = [15, 30, 45, 60, 75, 90]
    
    for _, match in matches_df.iterrows():
        m_id = match['match_id']
        
        # 1. Calculate kick-off minute offset
        # match_time is like "20:30:00" -> extract 30
        ko_minute_offset = pd.to_datetime(match['match_time'], format='%H:%M').minute
        
        # Filter events
        m_goals = goals_df[goals_df['match_id'] == m_id]
        m_cards = bookings_df[bookings_df['match_id'] == m_id]
        m_subs = subs_df[subs_df['match_id'] == m_id]
        
        # Static context
        rank_diff = match['rank_diff']
        final_home_win = 1 if match['home_team_score'] > match['away_team_score'] else 0
        
        for minute_limit in intervals:
            # --- Time calculation ---
            # Real minutes passed since kick-off hour started
            # Example: Kickoff 14:30. Chunk 60.
            # Time = 30 (offset) + 60 (game) + 15 (halftime) = 105 mins.
            # 105 mins is inside "Hour+1" (60-120), so we use index 1.
            
            is_second_half = minute_limit > 45
            halftime_add = 15 if is_second_half else 0
            
            real_time_minutes = ko_minute_offset + minute_limit + halftime_add
            
            # --- Weather mapping ---
            # Fetch all 4 variables using helper function
            curr_temp = get_weather_val(real_time_minutes, match['temp0'], match['temp1'], match['temp2'])
            curr_atemp = get_weather_val(real_time_minutes, match['atemp0'], match['atemp1'], match['atemp2'])
            curr_humid = get_weather_val(real_time_minutes, match['humid0'], match['humid1'], match['humid2'])
            curr_wind = get_weather_val(real_time_minutes, match['wind0'], match['wind1'], match['wind2'])
            
            # --- Dynamic game state ---
            # 1. Goals
            home_goals_now = len(m_goals[(m_goals['minute_regulation'] <= minute_limit) & (m_goals['home_team'] == 1)])
            away_goals_now = len(m_goals[(m_goals['minute_regulation'] <= minute_limit) & (m_goals['away_team'] == 1)])
            score_diff = home_goals_now - away_goals_now 
            
            # 2. Cards
            home_reds = len(m_cards[(m_cards['card_minute'] <= minute_limit) & (m_cards['home_team'] == 1) & (m_cards['is_red'] == True)])
            away_reds = len(m_cards[(m_cards['card_minute'] <= minute_limit) & (m_cards['away_team'] == 1) & (m_cards['is_red'] == True)])
            home_yellows = len(m_cards[(m_cards['card_minute'] <= minute_limit) & (m_cards['home_team'] == 1) & (m_cards['is_yellow'] == True)])
            away_yellows = len(m_cards[(m_cards['card_minute'] <= minute_limit) & (m_cards['away_team'] == 1) & (m_cards['is_yellow'] == True)])
            
            # 3. Substitutions
            home_subs = len(m_subs[(m_subs['minute_regulation'] <= minute_limit) & (m_subs['home_team'] == 1)])
            away_subs = len(m_subs[(m_subs['minute_regulation'] <= minute_limit) & (m_subs['away_team'] == 1)])
            
            # --- Build row ---
            row = {
                'match_id': m_id,
                'tournament_id': match['tournament_id'],
                'home_team_name': match['home_team_name'],
                'away_team_name': match['away_team_name'],
                'chunk_minute': minute_limit,
                'time_remaining': 90 - minute_limit,
                
                # Dynamic state
                'score_diff': score_diff,
                'red_diff': away_reds - home_reds, # Positive = Advantage Home
                'yellow_diff': away_yellows - home_yellows,
                'home_subs': home_subs,
                'away_subs': away_subs,
                
                # Weather
                'temperature': curr_temp,
                'apparent_temp': curr_atemp,
                'humidity': curr_humid,
                'wind_speed': curr_wind,
                
                # Static context
                'pre_match_rank_diff': rank_diff,
                
                # Target
                'target_home_win': final_home_win
            }
            chunk_rows.append(row)
            
    return pd.DataFrame(chunk_rows)

# Run it
chunk_data = create_15min_chunks(matches_merged, goals, bookings, substitutions)
print(f"Generated {len(chunk_data)} rows.")
chunk_data.head()

Generated 2304 rows.


Unnamed: 0,match_id,tournament_id,home_team_name,away_team_name,chunk_minute,time_remaining,score_diff,red_diff,yellow_diff,home_subs,away_subs,temperature,apparent_temp,humidity,wind_speed,pre_match_rank_diff,target_home_win
0,M-2002-01,WC-2002,France,Senegal,15,75,0,0,0,0,0,17.914,18.616817,90.94624,10.483357,41,0
1,M-2002-01,WC-2002,France,Senegal,30,60,-1,0,0,0,0,17.214,17.909658,93.252205,9.693296,41,0
2,M-2002-01,WC-2002,France,Senegal,45,45,-1,0,0,0,0,17.214,17.909658,93.252205,9.693296,41,0
3,M-2002-01,WC-2002,France,Senegal,60,30,-1,0,0,1,0,17.214,17.909658,93.252205,9.693296,41,0
4,M-2002-01,WC-2002,France,Senegal,75,15,-1,0,0,1,0,16.964,17.61865,93.53798,9.422101,41,0


In [6]:
# ------------------------------------------------------
# 4. Integrate match context
# ------------------------------------------------------
# 1. Select totals from MatchBox
matchbox_totals = matchbox[['tournament_id', 'hname', 'aname', 
                           'hshots', 'ashots', 
                           'hshotsOnTarget', 'ashotsOnTarget', 
                           'hPossession', 
                           'hfouls', 'afouls']]

# 2. Merge into the chunk data
# We match on tournament + team names
final_dataset = pd.merge(
    chunk_data,
    matchbox_totals,
    left_on=['tournament_id', 'home_team_name', 'away_team_name'],
    right_on=['tournament_id', 'hname', 'aname'],
    how='left'
)

# 3. Calculate context differentials (positive = home dominance)
# These represent the general flow of the game, separate from the minute-by-minute score
final_dataset['total_shot_diff'] = final_dataset['hshots'] - final_dataset['ashots']
final_dataset['total_sot_diff'] = final_dataset['hshotsOnTarget'] - final_dataset['ashotsOnTarget']
final_dataset['foul_diff'] = final_dataset['afouls'] - final_dataset['hfouls'] # Positive = home fouled less (better discipline)

# 4. Cleanup
# Drop names used for merging and raw totals
# Keep 'hPossession' as is, because there's no "diff" for possession (it's zero-sum)
cols_to_drop = ['hname', 'aname', 'hshots', 'ashots', 'hshotsOnTarget', 'ashotsOnTarget', 'hfouls', 'afouls']
final_dataset.drop(columns=cols_to_drop, inplace=True)

# 5. Save for following notebooks
final_dataset.to_csv('../data-processed/win_prob_training_data.csv', index=False)
print(f"Dataset saved with {final_dataset.shape[0]} rows.")
final_dataset.head()

Dataset saved with 2316 rows.


Unnamed: 0,match_id,tournament_id,home_team_name,away_team_name,chunk_minute,time_remaining,score_diff,red_diff,yellow_diff,home_subs,...,temperature,apparent_temp,humidity,wind_speed,pre_match_rank_diff,target_home_win,hPossession,total_shot_diff,total_sot_diff,foul_diff
0,M-2002-01,WC-2002,France,Senegal,15,75,0,0,0,0,...,17.914,18.616817,90.94624,10.483357,41,0,0.0,10.0,2.0,3.0
1,M-2002-01,WC-2002,France,Senegal,30,60,-1,0,0,0,...,17.214,17.909658,93.252205,9.693296,41,0,0.0,10.0,2.0,3.0
2,M-2002-01,WC-2002,France,Senegal,45,45,-1,0,0,0,...,17.214,17.909658,93.252205,9.693296,41,0,0.0,10.0,2.0,3.0
3,M-2002-01,WC-2002,France,Senegal,60,30,-1,0,0,1,...,17.214,17.909658,93.252205,9.693296,41,0,0.0,10.0,2.0,3.0
4,M-2002-01,WC-2002,France,Senegal,75,15,-1,0,0,1,...,16.964,17.61865,93.53798,9.422101,41,0,0.0,10.0,2.0,3.0
