## Import Necessary frameworks and Liberaries

In [686]:
import pandas as pd
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder
from collections import defaultdict
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier

## Load matches.csv and teams.csv files

In [611]:
df_matches = pd.read_csv("C:\\Users\\zeesh\\Downloads\\Athena AI\\task1\\task1\\matches.csv")
df_teams = pd.read_csv("C:\\Users\\zeesh\\Downloads\\Athena AI\\task1\\task1\\teams.csv")

## Data Preprocessing & Feature Engineering

In [612]:
df_matches.drop(columns=['IdCupSeason', 'statText', 'resText', 'team1PenScore', 'team2PenScore'], inplace=True) ## Drop unnecessary columns

In [613]:
df_matches.head(3) ## Printing the first 3 rows of the matches.csv file

Unnamed: 0,date,team1,team1Text,team2,team2Text,venue,CupName,team1Score,team2Score
0,19500308,WAL,Wales,NIR,Northern Ireland,"Cardiff, Wales",FIFA competition team qualification,0.0,0.0
1,19500402,ESP,Spain,POR,Portugal,"Madrid, Spain",FIFA competition team qualification,5.0,1.0
2,19500409,POR,Portugal,ESP,Spain,"Lisbon, Portugal",FIFA competition team qualification,2.0,2.0


In [614]:
# Convert to datetime and format to YYYY-MM-DD
df_matches['date'] = pd.to_datetime(df_matches['date'], format='%Y%m%d').dt.strftime('%Y-%m-%d')

In [615]:
# Count of missing (NaN) values in each column
missing_values = df_matches.isnull().sum()
print(missing_values)

date           0
team1          0
team1Text      0
team2          0
team2Text      0
venue         63
CupName        0
team1Score    13
team2Score    13
dtype: int64


In [616]:
df_matches.dropna(inplace=True) ## Drop missing values' rows from matches.csv file

In [617]:
# Sort by date in ascending order (earliest first)
df_matches.sort_values(by='date',inplace=True)

## Defining 'venue_country', 'is_home_team1', 'is_home_team2' & 'neutral_venue' Columns

In [618]:
df_matches["venue_country"] = df_matches["venue"].str.split(", ").str[-1] ## Separate the venue_country column from the venue column 
df_matches.head(3)

Unnamed: 0,date,team1,team1Text,team2,team2Text,venue,CupName,team1Score,team2Score,venue_country
26,1950-02-17,EGY,Egypt,GRE,Greece,"Cairo, Egypt",Friendly,2.0,0.0,Egypt
27,1950-02-25,SLV,El Salvador,HAI,Haiti,"Guatemala City, Guatemala",Friendly,1.0,0.0,Guatemala
28,1950-02-26,SLV,El Salvador,CRC,Costa Rica,"Guatemala City, Guatemala",Friendly,0.0,1.0,Guatemala


In [619]:
df_matches['is_home_team1'] = df_matches.apply(lambda row: 1 if row['team1Text'] == row['venue_country'] else 0, axis=1) ## Creating Home/Away Column for team1 and team2
df_matches['is_home_team2'] = df_matches.apply(lambda row: 1 if row['team2Text'] == row['venue_country'] else 0, axis=1)

In [620]:
df_matches.drop(columns=['team1Text', 'team2Text', 'venue'], inplace=True) ## Drop the team1Text and team2Text columns

In [621]:
df_matches["neutral_venue"] = ((df_matches["is_home_team1"] == 0) & (df_matches["is_home_team2"] == 0)).astype(int) ## Defining neutral_venue column

## Merge 'confederation' and 'fifa_code' columns from teams.csv file for team1 and team2

In [622]:
# Merge for team1 (home team)
df_matches = df_matches.merge(
    df_teams[['fifa_code', 'confederation']],  # Select columns to merge
    left_on='team1',                           # Team1 column in df_matches
    right_on='fifa_code',                      # FIFA code in df_teams
    suffixes=('', '_team1')                    # Avoid column name conflicts
)

# Rename the new column to "confederation_team1"
df_matches = df_matches.rename(columns={'confederation': 'confederation_team1'})

In [623]:
# Merge for team2 (away team)
df_matches = df_matches.merge(
    df_teams[['fifa_code', 'confederation']],  # Select columns to merge
    left_on='team2',                           # Team2 column in df_matches
    right_on='fifa_code',                      # FIFA code in df_teams
    suffixes=('', '_team2')                    # Avoid column name conflicts
)

# Rename the new column to "confederation_team2"
df_matches = df_matches.rename(columns={'confederation': 'confederation_team2'})

In [624]:
df_matches.head(5)

Unnamed: 0,date,team1,team2,CupName,team1Score,team2Score,venue_country,is_home_team1,is_home_team2,neutral_venue,fifa_code,confederation_team1,fifa_code_team2,confederation_team2
0,1950-02-17,EGY,GRE,Friendly,2.0,0.0,Egypt,1,0,0,EGY,CAF,GRE,UEFA
1,1950-02-25,SLV,HAI,Friendly,1.0,0.0,Guatemala,0,0,1,SLV,CONCACAF,HAI,CONCACAF
2,1950-02-26,SLV,CRC,Friendly,0.0,1.0,Guatemala,0,0,1,SLV,CONCACAF,CRC,CONCACAF
3,1950-02-27,GUA,COL,Friendly,2.0,1.0,Guatemala,1,0,0,GUA,CONCACAF,COL,CONMEBOL
4,1950-02-27,CRC,CUW,Friendly,1.0,0.0,Guatemala,0,0,1,CRC,CONCACAF,CUW,CONCACAF


In [625]:
df_matches = df_matches.drop(columns=['fifa_code', 'fifa_code_team2']) ## These two columns are not needed anymore

## Mapping confederation names with numerical values according to the strength of each confederation 

In [626]:
conf_strength = {
    "UEFA": 5,      # Europe (historically strongest)
    "CONMEBOL": 4,  # South America
    "CONCACAF": 3,  # North/Central America & Caribbean
    "CAF": 2,       # Africa
    "AFC": 2,       # Asia
    "OFC": 1        # Oceania
}

In [627]:
df_matches["confederation_strength_team1"] = df_matches["confederation_team1"].map(conf_strength)
df_matches["confederation_strength_team2"] = df_matches["confederation_team2"].map(conf_strength)
df_matches = df_matches.drop(columns=['confederation_team1', 'confederation_team2'])

## Calculating rolling metrices for team1 and team 2

In [628]:
# Function to calculate rolling metrics for each team
def calculate_rolling_metrics(df, team_column, score_column, opponent_score_column, num_matches):
    rolling_metrics = {
        f'{team_column}_avg_goals_last_{num_matches}': [],
        f'{team_column}_avg_goals_conceded_last_{num_matches}': [],
        f'{team_column}_win_rate_last_{num_matches}': []
    }

    for i in range(len(df)):
        # Filter the past matches for the current team
        past_matches = df.iloc[:i]
        team_matches = past_matches[(past_matches['team1'] == df.iloc[i][team_column]) | (past_matches['team2'] == df.iloc[i][team_column])].tail(num_matches)

        # Calculate goals scored and conceded
        goals_scored = team_matches.apply(
            lambda row: row[score_column] if row['team1'] == df.iloc[i][team_column] else row[opponent_score_column], axis=1
        )
        goals_conceded = team_matches.apply(
            lambda row: row[opponent_score_column] if row['team1'] == df.iloc[i][team_column] else row[score_column], axis=1
        )

        # Calculate metrics
        avg_goals = goals_scored.mean() if not goals_scored.empty else 0
        avg_goals_conceded = goals_conceded.mean() if not goals_conceded.empty else 0
        win_rate = team_matches.apply(
            lambda row: 1 if (row['team1'] == df.iloc[i][team_column] and row[score_column] > row[opponent_score_column]) or
                            (row['team2'] == df.iloc[i][team_column] and row[opponent_score_column] > row[score_column]) else 0, axis=1
        ).mean() if not team_matches.empty else 0

        # Append the calculated metrics
        rolling_metrics[f'{team_column}_avg_goals_last_{num_matches}'].append(avg_goals)
        rolling_metrics[f'{team_column}_avg_goals_conceded_last_{num_matches}'].append(avg_goals_conceded)
        rolling_metrics[f'{team_column}_win_rate_last_{num_matches}'].append(win_rate)

    return rolling_metrics



In [629]:
# Calculate metrics for team1 (last 5 matches for goals and last 10 matches for win rate)
metrics_team1_last_5 = calculate_rolling_metrics(df_matches, 'team1', 'team1Score', 'team2Score', 5)
metrics_team1_last_10 = calculate_rolling_metrics(df_matches, 'team1', 'team1Score', 'team2Score', 10)

In [630]:
# Calculate metrics for team2 (last 5 matches for goals and last 10 matches for win rate)
metrics_team2_last_5 = calculate_rolling_metrics(df_matches, 'team2', 'team2Score', 'team1Score', 5)
metrics_team2_last_10 = calculate_rolling_metrics(df_matches, 'team2', 'team2Score', 'team1Score', 10)

In [631]:
# Add the new columns to the original dataframe
df_matches['team1_avg_goals_last_5'] = metrics_team1_last_5['team1_avg_goals_last_5']
df_matches['team1_avg_goals_conceded_last_5'] = metrics_team1_last_5['team1_avg_goals_conceded_last_5']
df_matches['team1_win_rate_last_10'] = metrics_team1_last_10['team1_win_rate_last_10']

df_matches['team2_avg_goals_last_5'] = metrics_team2_last_5['team2_avg_goals_last_5']
df_matches['team2_avg_goals_conceded_last_5'] = metrics_team2_last_5['team2_avg_goals_conceded_last_5']
df_matches['team2_win_rate_last_10'] = metrics_team2_last_10['team2_win_rate_last_10']


In [633]:
df_matches.head(5)

Unnamed: 0,date,team1,team2,CupName,team1Score,team2Score,venue_country,is_home_team1,is_home_team2,neutral_venue,confederation_strength_team1,confederation_strength_team2,team1_avg_goals_last_5,team1_avg_goals_conceded_last_5,team1_win_rate_last_10,team2_avg_goals_last_5,team2_avg_goals_conceded_last_5,team2_win_rate_last_10
0,1950-02-17,EGY,GRE,Friendly,2.0,0.0,Egypt,1,0,0,2,5,0.0,0.0,0.0,0.0,0.0,0.0
1,1950-02-25,SLV,HAI,Friendly,1.0,0.0,Guatemala,0,0,1,3,3,0.0,0.0,0.0,0.0,0.0,0.0
2,1950-02-26,SLV,CRC,Friendly,0.0,1.0,Guatemala,0,0,1,3,3,1.0,0.0,1.0,0.0,0.0,0.0
3,1950-02-27,GUA,COL,Friendly,2.0,1.0,Guatemala,1,0,0,3,4,0.0,0.0,0.0,0.0,0.0,0.0
4,1950-02-27,CRC,CUW,Friendly,1.0,0.0,Guatemala,0,0,1,3,3,1.0,0.0,1.0,0.0,0.0,0.0


## Calculate head to head features for team1 and team2

In [634]:
def calculate_h2h_features(df):
    """
    Calculate Head-to-Head (H2H) features for soccer matches:
    - h2h_win_rate_team1: Historical win rate of team1 against team2
    - h2h_avg_goals_team1: Average goals scored by team1 against team2
    """
    # Sort matches chronologically
    df = df.sort_values('date').reset_index(drop=True)
    
    # Initialize nested dictionary for cumulative stats
    h2h_stats = defaultdict(
        lambda: defaultdict(lambda: {'matches': 0, 'wins': 0, 'goals': 0})
    )
    
    # Lists to store results
    win_rates, avg_goals = [], []

    for idx, row in df.iterrows():
        team1, team2 = row['team1'], row['team2']
        score1, score2 = row['team1Score'], row['team2Score']
        
        # Get historical stats for this pair
        stats = h2h_stats[team1][team2]
        total_matches = stats['matches']
        
        # Calculate features
        if total_matches == 0:
            win_rate = 0.5  # Neutral prior if no history
            avg_goal = 0.0
        else:
            win_rate = stats['wins'] / total_matches
            avg_goal = stats['goals'] / total_matches
            
        win_rates.append(win_rate)
        avg_goals.append(avg_goal)
        
        # Update stats for future matches
        # Update for team1 vs team2 perspective
        h2h_stats[team1][team2]['matches'] += 1
        h2h_stats[team1][team2]['wins'] += 1 if score1 > score2 else 0
        h2h_stats[team1][team2]['goals'] += score1
        
        # Update reverse perspective (team2 vs team1)
        h2h_stats[team2][team1]['matches'] += 1
        h2h_stats[team2][team1]['wins'] += 1 if score2 > score1 else 0
        h2h_stats[team2][team1]['goals'] += score2
        
    df['h2h_win_rate_team1'] = win_rates  
    df['h2h_avg_goals_team1'] = avg_goals    
    return df

In [635]:
df_matches = calculate_h2h_features(df_matches)
df_matches.head(5)

Unnamed: 0,date,team1,team2,CupName,team1Score,team2Score,venue_country,is_home_team1,is_home_team2,neutral_venue,confederation_strength_team1,confederation_strength_team2,team1_avg_goals_last_5,team1_avg_goals_conceded_last_5,team1_win_rate_last_10,team2_avg_goals_last_5,team2_avg_goals_conceded_last_5,team2_win_rate_last_10,h2h_win_rate_team1,h2h_avg_goals_team1
0,1950-02-17,EGY,GRE,Friendly,2.0,0.0,Egypt,1,0,0,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
1,1950-02-25,SLV,HAI,Friendly,1.0,0.0,Guatemala,0,0,1,3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
2,1950-02-26,SLV,CRC,Friendly,0.0,1.0,Guatemala,0,0,1,3,3,1.0,0.0,1.0,0.0,0.0,0.0,0.5,0.0
3,1950-02-27,GUA,COL,Friendly,2.0,1.0,Guatemala,1,0,0,3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
4,1950-02-27,CRC,CUW,Friendly,1.0,0.0,Guatemala,0,0,1,3,3,1.0,0.0,1.0,0.0,0.0,0.0,0.5,0.0


In [636]:
# Create copies to avoid modifying original dataframe
df = df_matches.copy()
df.head(3)

Unnamed: 0,date,team1,team2,CupName,team1Score,team2Score,venue_country,is_home_team1,is_home_team2,neutral_venue,confederation_strength_team1,confederation_strength_team2,team1_avg_goals_last_5,team1_avg_goals_conceded_last_5,team1_win_rate_last_10,team2_avg_goals_last_5,team2_avg_goals_conceded_last_5,team2_win_rate_last_10,h2h_win_rate_team1,h2h_avg_goals_team1
0,1950-02-17,EGY,GRE,Friendly,2.0,0.0,Egypt,1,0,0,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
1,1950-02-25,SLV,HAI,Friendly,1.0,0.0,Guatemala,0,0,1,3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
2,1950-02-26,SLV,CRC,Friendly,0.0,1.0,Guatemala,0,0,1,3,3,1.0,0.0,1.0,0.0,0.0,0.0,0.5,0.0


## Split date column

In [637]:
# Convert the 'date' column to datetime
df["date"] = pd.to_datetime(df["date"])

In [638]:
# Extract basic features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day

In [639]:
df = df.drop(columns=['date']) ## date column is not needed anymore

In [640]:
df.head(3)

Unnamed: 0,team1,team2,CupName,team1Score,team2Score,venue_country,is_home_team1,is_home_team2,neutral_venue,confederation_strength_team1,...,team1_avg_goals_conceded_last_5,team1_win_rate_last_10,team2_avg_goals_last_5,team2_avg_goals_conceded_last_5,team2_win_rate_last_10,h2h_win_rate_team1,h2h_avg_goals_team1,year,month,day
0,EGY,GRE,Friendly,2.0,0.0,Egypt,1,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1950,2,17
1,SLV,HAI,Friendly,1.0,0.0,Guatemala,0,0,1,3,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1950,2,25
2,SLV,CRC,Friendly,0.0,1.0,Guatemala,0,0,1,3,...,0.0,1.0,0.0,0.0,0.0,0.5,0.0,1950,2,26


## Encoding values into numerical values

In [641]:
def encode_columns(df):
    """Encode categorical columns (team1, team2, CupName, venue_country) into numerical values."""
    df = df.copy()
    
    # Encode team1 and team2 with the same encoder
    team_encoder = OrdinalEncoder()
    all_teams = pd.concat([df['team1'], df['team2']]).unique().reshape(-1, 1)
    team_encoder.fit(all_teams)
    df['team1'] = team_encoder.transform(df[['team1']])
    df['team2'] = team_encoder.transform(df[['team2']])
    
    # Encode CupName
    cup_encoder = OrdinalEncoder()
    df['CupName'] = cup_encoder.fit_transform(df[['CupName']])
    
    # Encode venue_country
    venue_encoder = OrdinalEncoder()
    df['venue_country'] = venue_encoder.fit_transform(df[['venue_country']])
    
    return df

In [642]:
encoded_df = encode_columns(df)



In [643]:
encoded_df.head(5)

Unnamed: 0,team1,team2,CupName,team1Score,team2Score,venue_country,is_home_team1,is_home_team2,neutral_venue,confederation_strength_team1,...,team1_avg_goals_conceded_last_5,team1_win_rate_last_10,team2_avg_goals_last_5,team2_avg_goals_conceded_last_5,team2_win_rate_last_10,h2h_win_rate_team1,h2h_avg_goals_team1,year,month,day
0,58.0,76.0,3.0,2.0,0.0,55.0,1,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1950,2,17
1,165.0,82.0,3.0,1.0,0.0,76.0,0,0,1,3,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1950,2,25
2,165.0,46.0,3.0,0.0,1.0,76.0,0,0,1,3,...,0.0,1.0,0.0,0.0,0.0,0.5,0.0,1950,2,26
3,78.0,43.0,3.0,2.0,1.0,76.0,1,0,0,3,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1950,2,27
4,46.0,50.0,3.0,1.0,0.0,76.0,0,0,1,3,...,0.0,1.0,0.0,0.0,0.0,0.5,0.0,1950,2,27


In [644]:
encoded_df.dtypes

team1                              float64
team2                              float64
CupName                            float64
team1Score                         float64
team2Score                         float64
venue_country                      float64
is_home_team1                        int64
is_home_team2                        int64
neutral_venue                        int64
confederation_strength_team1         int64
confederation_strength_team2         int64
team1_avg_goals_last_5             float64
team1_avg_goals_conceded_last_5    float64
team1_win_rate_last_10             float64
team2_avg_goals_last_5             float64
team2_avg_goals_conceded_last_5    float64
team2_win_rate_last_10             float64
h2h_win_rate_team1                 float64
h2h_avg_goals_team1                float64
year                                 int32
month                                int32
day                                  int32
dtype: object

## Defining the Outcome variable

In [690]:
# Define the outcome variable for multi-class classification
def classify_outcome(row):
    if row['team1Score'] > row['team2Score']:
        return 2  # team1 wins
    elif row['team1Score'] < row['team2Score']:
        return 0  # team1 loses
    else:
        return 1  # draw

In [691]:
encoded_df['target'] = encoded_df.apply(classify_outcome, axis=1)

In [692]:
encoded_df.head(5)

Unnamed: 0,team1,team2,CupName,team1Score,team2Score,venue_country,is_home_team1,is_home_team2,neutral_venue,confederation_strength_team1,...,team1_win_rate_last_10,team2_avg_goals_last_5,team2_avg_goals_conceded_last_5,team2_win_rate_last_10,h2h_win_rate_team1,h2h_avg_goals_team1,year,month,day,target
0,58.0,76.0,3.0,2.0,0.0,55.0,1,0,0,2,...,0.0,0.0,0.0,0.0,0.5,0.0,1950,2,17,2
1,165.0,82.0,3.0,1.0,0.0,76.0,0,0,1,3,...,0.0,0.0,0.0,0.0,0.5,0.0,1950,2,25,2
2,165.0,46.0,3.0,0.0,1.0,76.0,0,0,1,3,...,1.0,0.0,0.0,0.0,0.5,0.0,1950,2,26,0
3,78.0,43.0,3.0,2.0,1.0,76.0,1,0,0,3,...,0.0,0.0,0.0,0.0,0.5,0.0,1950,2,27,2
4,46.0,50.0,3.0,1.0,0.0,76.0,0,0,1,3,...,1.0,0.0,0.0,0.0,0.5,0.0,1950,2,27,2


In [693]:
encoded_df.shape

(29614, 23)

## Splitting the data into train test sets

In [694]:
# Split the data into training (1950-2016) and testing (2017)
train_data = encoded_df[encoded_df['year'] <= 2016]
test_data = encoded_df[encoded_df['year'] == 2017]

# Check the size of the training and testing sets
print(f"Training set size: {len(train_data)}")
print(f"Testing set size: {len(test_data)}")


Training set size: 28849
Testing set size: 765


## Defining features and target variable

In [695]:
# Define the features and target variable
features = [
    'team1', 'team2', 'CupName', 'venue_country', 'is_home_team1', 'is_home_team2',
    'neutral_venue', 'confederation_strength_team1', 'confederation_strength_team2',
    'team1_avg_goals_last_5', 'team1_avg_goals_conceded_last_5', 'team1_win_rate_last_10',
    'team2_avg_goals_last_5', 'team2_avg_goals_conceded_last_5', 'team2_win_rate_last_10', 'h2h_avg_goals_team1', 'h2h_win_rate_team1', 'year', 'month', 'day'  
]

X_train = train_data[features]
y_train = train_data['target']  
X_test = test_data[features]
y_test = test_data['target']

## Model training and evaluation

In [699]:
# Initialize XGBoost classifier
model = xgb.XGBClassifier(
    objective='multi:softprob',  # 'multi:softprob' for multi-class
    eval_metric='mlogloss',  # For multi-class, use mlogloss
    use_label_encoder=False,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)  # Get probabilities for each class (useful for win probabilities)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.5359477124183006
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.52      0.54       212
           1       0.37      0.21      0.27       208
           2       0.57      0.74      0.64       345

    accuracy                           0.54       765
   macro avg       0.50      0.49      0.48       765
weighted avg       0.51      0.54      0.51       765



In [None]:
# ---------------------------------------------------
# Save Model
# ---------------------------------------------------
import joblib
joblib.dump(model, 'worldcup_model.pkl')

# ---------------------------------------------------
# Feature Importance
# ---------------------------------------------------
print("\nFeature Importances:")
for name, score in zip(X.columns, model.feature_importances_):
    print(f"{name}: {score:.4f}")
