# Import Necessary frameworks and Liberaries

In [2]:
import pandas as pd
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder
from collections import defaultdict
import joblib

# Load matches.csv and teams.csv files

In [3]:
df_matches = pd.read_csv("C:\\Users\\zeesh\\Downloads\\Athena AI\\task1\\task1\\matches.csv")
df_teams = pd.read_csv("C:\\Users\\zeesh\\Downloads\\Athena AI\\task1\\task1\\teams.csv")

# Data Preprocessing & Feature Engineering

In [4]:
## Drop unnecessary columns
df_matches.drop(columns=['IdCupSeason', 'statText', 'resText', 'team1PenScore', 'team2PenScore'], inplace=True) 

# Convert to datetime and format to YYYY-MM-DD
df_matches['date'] = pd.to_datetime(df_matches['date'], format='%Y%m%d').dt.strftime('%Y-%m-%d')

## Drop missing values' rows from matches.csv file
df_matches.dropna(inplace=True) 

# Sort by date in ascending order (earliest first)
df_matches.sort_values(by='date',inplace=True)

### Defining 'venue_country', 'is_home_team1', 'is_home_team2' & 'neutral_venue' Columns

In [5]:
## Separate the venue_country column from the venue column
df_matches["venue_country"] = df_matches["venue"].str.split(", ").str[-1]  

## Creating Home/Away Column for team1 and team2
df_matches['is_home_team1'] = df_matches.apply(lambda row: 1 if row['team1Text'] == row['venue_country'] else 0, axis=1) 
df_matches['is_home_team2'] = df_matches.apply(lambda row: 1 if row['team2Text'] == row['venue_country'] else 0, axis=1)

df_matches.drop(columns=['team1Text', 'team2Text', 'venue'], inplace=True) ## Drop the team1Text and team2Text columns
df_matches["neutral_venue"] = ((df_matches["is_home_team1"] == 0) & (df_matches["is_home_team2"] == 0)).astype(int) ## Defining neutral_venue column

### Merge 'confederation' and 'fifa_code' columns from teams.csv file for team1 and team2

In [6]:
# Merge for team1 (home team)
df_matches = df_matches.merge(
    df_teams[['fifa_code', 'confederation']],  # Select columns to merge
    left_on='team1',                           # Team1 column in df_matches
    right_on='fifa_code',                      # FIFA code in df_teams
    suffixes=('', '_team1')                    # Avoid column name conflicts
)

# Rename the new column to "confederation_team1"
df_matches = df_matches.rename(columns={'confederation': 'confederation_team1'})



# Merge for team2 (away team)
df_matches = df_matches.merge(
    df_teams[['fifa_code', 'confederation']],  # Select columns to merge
    left_on='team2',                           # Team2 column in df_matches
    right_on='fifa_code',                      # FIFA code in df_teams
    suffixes=('', '_team2')                    # Avoid column name conflicts
)

# Rename the new column to "confederation_team2"
df_matches = df_matches.rename(columns={'confederation': 'confederation_team2'})

In [7]:
## These two columns are not needed anymore
df_matches = df_matches.drop(columns=['fifa_code', 'fifa_code_team2']) 

### Mapping confederation names with numerical values according to the strength of each confederation 

In [8]:
conf_strength = {
    "UEFA": 5,      # Europe (historically strongest)
    "CONMEBOL": 4,  # South America
    "CONCACAF": 3,  # North/Central America & Caribbean
    "CAF": 2,       # Africa
    "AFC": 2,       # Asia
    "OFC": 1        # Oceania
}


df_matches["confederation_strength_team1"] = df_matches["confederation_team1"].map(conf_strength)
df_matches["confederation_strength_team2"] = df_matches["confederation_team2"].map(conf_strength)
df_matches = df_matches.drop(columns=['confederation_team1', 'confederation_team2'])

### Calculating rolling metrices for team1 and team 2

In [9]:
# Function to calculate rolling metrics for each team
def calculate_rolling_metrics(df, team_column, score_column, opponent_score_column, num_matches):
    rolling_metrics = {
        f'{team_column}_avg_goals_last_{num_matches}': [],
        f'{team_column}_avg_goals_conceded_last_{num_matches}': [],
        f'{team_column}_win_rate_last_{num_matches}': []
    }

    for i in range(len(df)):
        # Filter the past matches for the current team
        past_matches = df.iloc[:i]
        team_matches = past_matches[(past_matches['team1'] == df.iloc[i][team_column]) | (past_matches['team2'] == df.iloc[i][team_column])].tail(num_matches)

        # Calculate goals scored and conceded
        goals_scored = team_matches.apply(
            lambda row: row[score_column] if row['team1'] == df.iloc[i][team_column] else row[opponent_score_column], axis=1
        )
        goals_conceded = team_matches.apply(
            lambda row: row[opponent_score_column] if row['team1'] == df.iloc[i][team_column] else row[score_column], axis=1
        )

        # Calculate metrics
        avg_goals = goals_scored.mean() if not goals_scored.empty else 0
        avg_goals_conceded = goals_conceded.mean() if not goals_conceded.empty else 0
        win_rate = team_matches.apply(
            lambda row: 1 if (row['team1'] == df.iloc[i][team_column] and row[score_column] > row[opponent_score_column]) or
                            (row['team2'] == df.iloc[i][team_column] and row[opponent_score_column] > row[score_column]) else 0, axis=1
        ).mean() if not team_matches.empty else 0

        # Append the calculated metrics
        rolling_metrics[f'{team_column}_avg_goals_last_{num_matches}'].append(avg_goals)
        rolling_metrics[f'{team_column}_avg_goals_conceded_last_{num_matches}'].append(avg_goals_conceded)
        rolling_metrics[f'{team_column}_win_rate_last_{num_matches}'].append(win_rate)

    return rolling_metrics

In [10]:
# Calculate metrics for team1 (last 5 matches for goals and last 10 matches for win rate)
metrics_team1_last_5 = calculate_rolling_metrics(df_matches, 'team1', 'team1Score', 'team2Score', 5)
metrics_team1_last_10 = calculate_rolling_metrics(df_matches, 'team1', 'team1Score', 'team2Score', 10)

# Calculate metrics for team2 (last 5 matches for goals and last 10 matches for win rate)
metrics_team2_last_5 = calculate_rolling_metrics(df_matches, 'team2', 'team2Score', 'team1Score', 5)
metrics_team2_last_10 = calculate_rolling_metrics(df_matches, 'team2', 'team2Score', 'team1Score', 10)

In [11]:
# Add the new columns to the original dataframe
df_matches['team1_avg_goals_last_5'] = metrics_team1_last_5['team1_avg_goals_last_5']
df_matches['team1_avg_goals_conceded_last_5'] = metrics_team1_last_5['team1_avg_goals_conceded_last_5']
df_matches['team1_win_rate_last_10'] = metrics_team1_last_10['team1_win_rate_last_10']

df_matches['team2_avg_goals_last_5'] = metrics_team2_last_5['team2_avg_goals_last_5']
df_matches['team2_avg_goals_conceded_last_5'] = metrics_team2_last_5['team2_avg_goals_conceded_last_5']
df_matches['team2_win_rate_last_10'] = metrics_team2_last_10['team2_win_rate_last_10']

### Calculate head to head features for team1 and team2

In [14]:
def calculate_h2h_features(df):
    """
    Calculate Head-to-Head (H2H) features for soccer matches:
    - h2h_win_rate_team1: Historical win rate of team1 against team2
    - h2h_avg_goals_team1: Average goals scored by team1 against team2
    - h2h_win_rate_team2: Historical win rate of team2 against team1
    - h2h_avg_goals_team2: Average goals scored by team2 against team1
    """
    # Sort matches chronologically
    df = df.sort_values('date').reset_index(drop=True)
    
    # Initialize nested dictionary for cumulative stats
    h2h_stats = defaultdict(
        lambda: defaultdict(lambda: {'matches': 0, 'wins': 0, 'goals': 0}))
    
    # Lists to store results
    win_rates_team1, avg_goals_team1 = [], []
    win_rates_team2, avg_goals_team2 = [], []

    for idx, row in df.iterrows():
        team1, team2 = row['team1'], row['team2']
        score1, score2 = row['team1Score'], row['team2Score']
        
        # Get historical stats for team1 vs team2
        stats_team1 = h2h_stats[team1][team2]
        total_matches_team1 = stats_team1['matches']
        
        # Get historical stats for team2 vs team1
        stats_team2 = h2h_stats[team2][team1]
        total_matches_team2 = stats_team2['matches']
        
        # Calculate features for team1 vs team2
        if total_matches_team1 == 0:
            win_rate_team1 = 0.5  # Neutral prior if no history
            avg_goal_team1 = 0.0
        else:
            win_rate_team1 = stats_team1['wins'] / total_matches_team1
            avg_goal_team1 = stats_team1['goals'] / total_matches_team1
            
        # Calculate features for team2 vs team1
        if total_matches_team2 == 0:
            win_rate_team2 = 0.5  # Neutral prior if no history
            avg_goal_team2 = 0.0
        else:
            win_rate_team2 = stats_team2['wins'] / total_matches_team2
            avg_goal_team2 = stats_team2['goals'] / total_matches_team2
            
        # Append features to lists
        win_rates_team1.append(win_rate_team1)
        avg_goals_team1.append(avg_goal_team1)
        win_rates_team2.append(win_rate_team2)
        avg_goals_team2.append(avg_goal_team2)
        
        # Update stats for future matches
        # Update for team1 vs team2 perspective
        h2h_stats[team1][team2]['matches'] += 1
        h2h_stats[team1][team2]['wins'] += 1 if score1 > score2 else 0
        h2h_stats[team1][team2]['goals'] += score1
        
        # Update reverse perspective (team2 vs team1)
        h2h_stats[team2][team1]['matches'] += 1
        h2h_stats[team2][team1]['wins'] += 1 if score2 > score1 else 0
        h2h_stats[team2][team1]['goals'] += score2
        
    # Add features to DataFrame
    df['h2h_win_rate_team1'] = win_rates_team1
    df['h2h_avg_goals_team1'] = avg_goals_team1
    df['h2h_win_rate_team2'] = win_rates_team2
    df['h2h_avg_goals_team2'] = avg_goals_team2
    
    return df

In [15]:
df_matches = calculate_h2h_features(df_matches)

In [18]:
# Create copies to avoid modifying original dataframe
df = df_matches.copy()
df.head(3)

Unnamed: 0,date,team1,team2,CupName,team1Score,team2Score,venue_country,is_home_team1,is_home_team2,neutral_venue,...,team1_avg_goals_last_5,team1_avg_goals_conceded_last_5,team1_win_rate_last_10,team2_avg_goals_last_5,team2_avg_goals_conceded_last_5,team2_win_rate_last_10,h2h_win_rate_team1,h2h_avg_goals_team1,h2h_win_rate_team2,h2h_avg_goals_team2
0,1950-02-17,EGY,GRE,Friendly,2.0,0.0,Egypt,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0
1,1950-02-25,SLV,HAI,Friendly,1.0,0.0,Guatemala,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0
2,1950-02-26,SLV,CRC,Friendly,0.0,1.0,Guatemala,0,0,1,...,1.0,0.0,1.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0


### Split date column

In [19]:
# Convert the 'date' column to datetime
df["date"] = pd.to_datetime(df["date"])

# Extract basic features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day

df = df.drop(columns=['date']) ## date column is not needed anymore

### Encoding values into numerical values

In [20]:
def encode_columns(df, save_encoders=False):
    """Encode categorical columns and optionally save encoders"""
    df = df.copy()
    
    # Encode team1 and team2 with shared encoder
    team_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    all_teams = pd.concat([df['team1'], df['team2']]).unique().reshape(-1, 1)
    team_encoder.fit(all_teams)
    
    df['team1'] = team_encoder.transform(df[['team1']])
    df['team2'] = team_encoder.transform(df[['team2']])
    
    # Encode other columns
    cup_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    df['CupName'] = cup_encoder.fit_transform(df[['CupName']])
    
    venue_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    df['venue_country'] = venue_encoder.fit_transform(df[['venue_country']])
    
    if save_encoders:
        joblib.dump(team_encoder, 'team_encoder.joblib')
        joblib.dump(cup_encoder, 'cup_encoder.joblib')
        joblib.dump(venue_encoder, 'venue_encoder.joblib')
    
    return df

# During training
encoded_df = encode_columns(df, save_encoders=True)



In [21]:
# Save the dataframe using pickle
encoded_df.to_pickle('preprocessed_historical_data.pkl')

# Defining the Outcome variable

In [22]:
# Define the outcome variable for multi-class classification
def classify_outcome(row):
    if row['team1Score'] > row['team2Score']:
        return 2  # team1 wins
    elif row['team1Score'] < row['team2Score']:
        return 0  # team1 loses
    else:
        return 1  # draw

encoded_df['target'] = encoded_df.apply(classify_outcome, axis=1)        

# Splitting the data into train test sets

In [23]:
# Split the data into training (1950-2016) and testing (2017)
train_data = encoded_df[encoded_df['year'] <= 2016]
test_data = encoded_df[encoded_df['year'] == 2017]

# Check the size of the training and testing sets
print(f"Training set size: {len(train_data)}")
print(f"Testing set size: {len(test_data)}")


Training set size: 28849
Testing set size: 765


# Defining features and target variable

In [32]:
# Define the features and target variable
features = [
    'team1', 'team2', 'CupName', 'venue_country', 'is_home_team1', 'is_home_team2',
    'neutral_venue', 'confederation_strength_team1', 'confederation_strength_team2',
    'team1_avg_goals_last_5', 'team1_avg_goals_conceded_last_5', 'team1_win_rate_last_10',
    'team2_avg_goals_last_5', 'team2_avg_goals_conceded_last_5', 'team2_win_rate_last_10', 'h2h_win_rate_team1', 'h2h_avg_goals_team1', 'h2h_win_rate_team2', 'h2h_avg_goals_team2', 'year', 'month', 'day'  
]

X_train = train_data[features]
y_train = train_data['target']  
X_test = test_data[features]
y_test = test_data['target']

# Model training and evaluation

In [33]:
# Initialize XGBoost classifier
model = xgb.XGBClassifier(
    objective='multi:softprob',  # 'multi:softprob' for multi-class
    eval_metric='mlogloss',  # For multi-class, use mlogloss
    use_label_encoder=False,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6,
    num_class=3  # Specify the number of classes: 3 (win, draw, lose)
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)  # Get probabilities for each class (useful for win probabilities)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.52
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.48      0.51       212
           1       0.36      0.20      0.26       208
           2       0.56      0.74      0.64       345

    accuracy                           0.52       765
   macro avg       0.48      0.48      0.47       765
weighted avg       0.50      0.52      0.50       765



In [31]:
# Save Model
joblib.dump(model, 'worldcup_model.pkl')

['worldcup_model.pkl']