## Install required libraries

In [14]:
!pip install kagglehub
!pip install pandas
!pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.me

## Import Data

In [2]:
import kagglehub

path = kagglehub.dataset_download("martj42/international-football-results-from-1872-to-2017")
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/martj42/international-football-results-from-1872-to-2017?dataset_version_number=93...


100%|███████████████████████████████████████| 1.15M/1.15M [00:01<00:00, 915kB/s]

Extracting files...
Path to dataset files: /home/aditya/.cache/kagglehub/datasets/martj42/international-football-results-from-1872-to-2017/versions/93





In [6]:
import pandas as pd

former_names = pd.read_csv(f"{path}/former_names.csv")
goalscorers = pd.read_csv(f"{path}/goalscorers.csv")
results = pd.read_csv(f"{path}/results.csv")
shootouts = pd.read_csv(f"{path}/shootouts.csv")

# former_names.head()
# goalscorers.head()
# results.head()
# shootouts.head()

Unnamed: 0,date,home_team,away_team,winner,first_shooter
0,1967-08-22,India,Taiwan,Taiwan,
1,1971-11-14,South Korea,Vietnam Republic,South Korea,
2,1972-05-07,South Korea,Iraq,Iraq,
3,1972-05-17,Thailand,South Korea,South Korea,
4,1972-05-19,Thailand,Cambodia,Thailand,


## Add result Label to the dataset

In [10]:
print(results.dtypes)

def match_result(row):
    if row["home_score"] > row["away_score"]:
        return "home_win"
    elif row["home_score"] < row["away_score"]:
        return "away_win"
    else:
        return "draw"

results["match_result"] = results.apply(match_result, axis=1)

results.head()

date            object
home_team       object
away_team       object
home_score       int64
away_score       int64
tournament      object
city            object
country         object
neutral           bool
match_result    object
dtype: object


Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,match_result
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,draw
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,home_win
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,home_win
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,draw
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,home_win


## Feature Engineering

### Basic Features

In [11]:
features = results[["home_team", "away_team", "tournament", "neutral"]]
labels = results["match_result"]

### Encoding

In [12]:
X = pd.get_dummies(features)

### Train-test split

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42, shuffle=False
)

### Simple Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

model = RandomForestClassifier(class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    away_win       0.46      0.45      0.46      2811
        draw       0.27      0.19      0.22      2225
    home_win       0.59      0.69      0.64      4606

    accuracy                           0.50      9642
   macro avg       0.44      0.44      0.44      9642
weighted avg       0.48      0.50      0.49      9642



### Since the overall accuracy barely passes over 50%, lets try adding a FIFA ranking for the teams and see if it helps

In [19]:
ranking_path = kagglehub.dataset_download("cashncarry/fifaworldranking")
print(ranking_path)

/home/aditya/.cache/kagglehub/datasets/cashncarry/fifaworldranking/versions/15


In [23]:
ranking = pd.read_csv(f"{ranking_path}/fifa_ranking-2024-06-20.csv")
ranking.head()

Unnamed: 0,rank,country_full,country_abrv,total_points,previous_points,rank_change,confederation,rank_date
0,140.0,Brunei Darussalam,BRU,2.0,0.0,140,AFC,1992-12-31
1,33.0,Portugal,POR,38.0,0.0,33,UEFA,1992-12-31
2,32.0,Zambia,ZAM,38.0,0.0,32,CAF,1992-12-31
3,31.0,Greece,GRE,38.0,0.0,31,UEFA,1992-12-31
4,30.0,Algeria,ALG,39.0,0.0,30,CAF,1992-12-31


### Cleaning up the data

In [24]:
ranking["rank_date"] = pd.to_datetime(ranking["rank_date"])

In [25]:
results["date"] = pd.to_datetime(results["date"])

### Rename the columns to merge

In [28]:
home_rankings = ranking.rename(columns={
    "country_full": "home_team",
    "rank": "home_rank",
    "rank_date": "date"
})

away_rankings = ranking.rename(columns={
    "country_full": "away_team",
    "rank": "away_rank",
    "rank_date": "date"
})

In [30]:
home_rankings.head()

Unnamed: 0,home_rank,home_team,country_abrv,total_points,previous_points,rank_change,confederation,date
0,140.0,Brunei Darussalam,BRU,2.0,0.0,140,AFC,1992-12-31
1,33.0,Portugal,POR,38.0,0.0,33,UEFA,1992-12-31
2,32.0,Zambia,ZAM,38.0,0.0,32,CAF,1992-12-31
3,31.0,Greece,GRE,38.0,0.0,31,UEFA,1992-12-31
4,30.0,Algeria,ALG,39.0,0.0,30,CAF,1992-12-31


In [31]:
away_rankings.head()

Unnamed: 0,away_rank,away_team,country_abrv,total_points,previous_points,rank_change,confederation,date
0,140.0,Brunei Darussalam,BRU,2.0,0.0,140,AFC,1992-12-31
1,33.0,Portugal,POR,38.0,0.0,33,UEFA,1992-12-31
2,32.0,Zambia,ZAM,38.0,0.0,32,CAF,1992-12-31
3,31.0,Greece,GRE,38.0,0.0,31,UEFA,1992-12-31
4,30.0,Algeria,ALG,39.0,0.0,30,CAF,1992-12-31


### Sort by date and merge datasets

In [32]:
results = results.sort_values("date")
home_rankings = home_rankings.sort_values("date")
away_rankings = away_rankings.sort_values("date")

results = pd.merge_asof(results, home_rankings[["date", "home_team", "home_rank"]],
                        on="date", by="home_team", direction="backward")

results = pd.merge_asof(results, away_rankings[["date", "away_team", "away_rank"]],
                        on="date", by="away_team", direction="backward")

In [35]:
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,match_result,home_rank,away_rank
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,draw,,
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,home_win,,
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,home_win,,
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,draw,,
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,home_win,,


### Remove all instances before ranking to prevent biases

In [36]:
results = results[results["date"] >= pd.to_datetime("1992-12-31")].copy()

In [39]:
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,match_result,home_rank,away_rank,rank_difference
18723,1993-01-01,Ghana,Mali,1,1,Friendly,Libreville,Gabon,True,draw,39.0,69.0,30.0
18724,1993-01-02,Gabon,Burkina Faso,1,1,Friendly,Libreville,Gabon,False,draw,55.0,97.0,42.0
18725,1993-01-02,Kuwait,Lebanon,2,0,Friendly,Kuwait City,Kuwait,False,home_win,71.0,161.0,90.0
18726,1993-01-03,Gabon,Ghana,2,3,Friendly,Libreville,Gabon,False,away_win,55.0,39.0,-16.0
18727,1993-01-03,Burkina Faso,Mali,1,0,Friendly,Libreville,Gabon,True,home_win,97.0,69.0,-28.0


In [38]:
results["rank_difference"] = results["away_rank"] - results["home_rank"]

### Selecting Features again

In [40]:
features = results[["home_rank", "away_rank", "rank_difference"]]
target = results["match_result"]

In [41]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [42]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [43]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    away_win       0.41      0.36      0.38      1658
        draw       0.26      0.19      0.22      1392
    home_win       0.57      0.69      0.62      2847

    accuracy                           0.48      5897
   macro avg       0.41      0.41      0.41      5897
weighted avg       0.45      0.48      0.46      5897

