In [106]:
# !pip install soccerdata

## Import the Libraries

In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from pandas import DataFrame
warnings.filterwarnings('ignore') # Ignore warnings from imblearn for cleaner output

In [108]:
df = pd.read_csv("/content/Premier_league_data_2021_to_2025.csv")

In [109]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,home_team,away_team,home_goals,away_goals,home_shots,away_shots,home_shots_on_target,away_shots_on_target,...,away_fouls,home_corners,away_corners,home_yellow,away_yellow,home_red,away_red,home_ht_goals,away_ht_goals,time
0,0,2020-09-12,Fulham,Arsenal,0,3,5,13,2,6,...,12,2,3,2,2,0,0,0,1,12:30:00
1,1,2020-09-12,Crystal Palace,Southampton,1,0,5,9,3,5,...,11,7,3,2,1,0,0,1,0,15:00:00
2,2,2020-09-12,Liverpool,Leeds,4,3,22,6,6,3,...,6,9,0,1,0,0,0,3,2,17:30:00
3,3,2020-09-12,West Ham,Newcastle,0,2,15,15,3,2,...,7,8,7,2,2,0,0,0,0,20:00:00
4,4,2020-09-13,West Brom,Leicester,0,3,7,13,1,7,...,9,2,5,1,1,0,0,0,0,14:00:00


In [110]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1680 entries, 0 to 1679
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            1680 non-null   int64 
 1   date                  1680 non-null   object
 2   home_team             1680 non-null   object
 3   away_team             1680 non-null   object
 4   home_goals            1680 non-null   int64 
 5   away_goals            1680 non-null   int64 
 6   home_shots            1680 non-null   int64 
 7   away_shots            1680 non-null   int64 
 8   home_shots_on_target  1680 non-null   int64 
 9   away_shots_on_target  1680 non-null   int64 
 10  home_fouls            1680 non-null   int64 
 11  away_fouls            1680 non-null   int64 
 12  home_corners          1680 non-null   int64 
 13  away_corners          1680 non-null   int64 
 14  home_yellow           1680 non-null   int64 
 15  away_yellow           1680 non-null   

In [111]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
date,0
home_team,0
away_team,0
home_goals,0
away_goals,0
home_shots,0
away_shots,0
home_shots_on_target,0
away_shots_on_target,0


## **Feature Enginering**

In [112]:
# Create match result column
df["result"] = df.apply(
    lambda row: "Win" if row["home_goals"] > row["away_goals"]
    else "Draw" if row["home_goals"] == row["away_goals"]
    else "Lose",
    axis=1
  )

# Extract the starting hour of the by using the time column
df["hour"] = df["time"].str.replace(":.+", "", regex=True).astype("int")


# Add new column of goals conceded by home and away team
df["away_team_goals_conceded"] = df['home_goals']
df['home_team_goals_conceded']  = df["away_goals"]

In [113]:
df = df[[
    # Just for information purpose not need in training
    "date",
    "home_team",
    "away_team",
    # Columsn for training purposes
    "hour",
    "home_goals",
    "away_goals",
    "home_shots_on_target",
    "away_shots_on_target",
    'home_team_goals_conceded',
    'away_team_goals_conceded',
    # # "home_ht_goals",
    # # "away_ht_goals"	,
    'home_shots',
    'away_shots',
    # Target Column
    'result'
]]

### Apply Rolling on data

In [114]:
cols = [
    "home_shots_on_target",
    "away_shots_on_target",
    "home_shots",
    "away_shots",
    "home_team_goals_conceded",
    "away_team_goals_conceded",
    "home_goals",
    "away_goals"
]

new_cols = [f"{c}_avg_last5" for c in cols]

# Redefine the rolling_averages function with corrections
def rolling_averages(group, cols, new_cols):
  group = group.sort_values("date") # Corrected typo: sort_vales -> sort_values, and "Date" -> "date"
  rolling_stats = group[cols].rolling(5, closed = 'left').mean() # Corrected typo: 'letf' -> 'left'
  group[new_cols] = rolling_stats # Assign the rolling stats to the new columns
  group = group.dropna(subset = new_cols)
  return group

df = df.groupby("home_team", group_keys = False).apply(lambda x: rolling_averages(x, cols, new_cols))

### Add a new Column Points

In [115]:
home  = df[["date", "home_team", "home_goals", "away_goals"]].rename(
    columns = {"home_team": "team",
               "home_goals": "goals_scored",
               "away_goals": "goals_conceded"
               }
)

away = df[["date", "away_team", "home_goals", "away_goals"]].rename(
    columns = {"away_team" : "team",
               "home_goals": "goals_conceded",
               "away_goals": "goals_scored"
    }
)


team_matches = pd.concat([home, away],ignore_index=True)
team_matches.sort_values(["date","team"], inplace = True)

In [116]:
team_matches['point'] = np.where(
    team_matches['goals_scored'] > team_matches['goals_conceded'], 3,
    np.where(team_matches['goals_scored'] == team_matches['goals_conceded'], 1, 0)
)

In [117]:
team_matches["points_last5_matches"] = (
    team_matches.groupby("team")["point"].rolling(5).sum().reset_index(0, drop = True)
)

In [118]:
team_matches.dropna(subset=['points_last5_matches'], inplace = True)

In [119]:
df = df.merge(
    team_matches[["date", "team", "points_last5_matches"]],
    left_on=["date", "home_team"],
    right_on=["date", "team"],
    how="left"
).rename(columns={
    "points_last5_matches": "home_points_last5_matches"
}).drop(columns="team")

df = df.merge(
    team_matches[["date", "team", "points_last5_matches"]],
    left_on=["date", "away_team"],
    right_on=["date", "team"],
    how="left"
).rename(columns={
    "points_last5_matches": "away_points_last5_matches"
}).drop(columns="team")

df.dropna(subset = ["home_points_last5_matches",	"away_points_last5_matches"], inplace = True)

In [120]:
# Drop the Extra columns
df_final = df.drop(columns = cols)

In [121]:
df_final.head()

Unnamed: 0,date,home_team,away_team,hour,result,home_shots_on_target_avg_last5,away_shots_on_target_avg_last5,home_shots_avg_last5,away_shots_avg_last5,home_team_goals_conceded_avg_last5,away_team_goals_conceded_avg_last5,home_goals_avg_last5,away_goals_avg_last5,home_points_last5_matches,away_points_last5_matches
2,2020-12-26,Arsenal,Chelsea,17,Win,3.6,3.6,13.0,11.0,1.6,0.4,0.4,1.6,4.0,6.0
3,2021-01-14,Arsenal,Crystal Palace,20,Draw,4.2,3.8,13.6,13.6,1.6,1.0,1.0,1.6,10.0,5.0
4,2021-01-18,Arsenal,Newcastle,20,Win,4.6,3.0,13.2,13.0,1.0,1.0,1.0,1.0,13.0,1.0
5,2021-01-30,Arsenal,Man United,17,Draw,5.4,2.2,14.6,11.6,0.6,1.4,1.4,0.6,11.0,8.0
6,2021-02-14,Arsenal,Leeds,16,Win,4.8,2.4,14.4,12.4,0.4,1.4,1.4,0.4,7.0,9.0


In [122]:
# Add new cols
df_final["points_diff_last5"] = (
    df_final["home_points_last5_matches"] -
    df_final["away_points_last5_matches"]
)

df_final["goal_diff_avg5"] = (
    df_final["home_goals_avg_last5"] -
    df_final["away_goals_avg_last5"]
)

df_final["shots_diff_avg5"] = (
    df_final["home_shots_avg_last5"] -
    df_final["away_shots_avg_last5"]
)

df_final["shots_on_target_diff_avg5"] = (
    df_final["home_shots_on_target_avg_last5"] -
    df_final["away_shots_on_target_avg_last5"]
)

df_final["x_defense_diff"] = (
    df_final["away_team_goals_conceded_avg_last5"] -
    df_final["home_team_goals_conceded_avg_last5"]
)

df_final["home_advantage"] = 1

In [123]:
df_final.sort_values('date', inplace=True)
df_final.reset_index(inplace=True, drop = True)

In [124]:
df_final.shape

(1484, 21)

## Train_Test Split

In [125]:
input_features = [
       'home_shots_on_target_avg_last5',
       'away_shots_on_target_avg_last5',
       'home_shots_avg_last5',
       'away_shots_avg_last5',
       'home_team_goals_conceded_avg_last5',
       'away_team_goals_conceded_avg_last5',
       'home_goals_avg_last5',
       'away_goals_avg_last5',
       'home_points_last5_matches',
       'away_points_last5_matches',
       'points_diff_last5',
       'goal_diff_avg5',
       'shots_diff_avg5',
       'x_defense_diff',
       'home_advantage',
       'shots_on_target_diff_avg5']

**Note**
Train data on a Past matches and get a prediction on a New /Future Matches

In [126]:
split_date = '2025-12-10'

train = df_final[df_final['date'] < split_date]
test = df_final[df_final['date'] >= split_date]

X_train = train[input_features]
y_train = train['result']

X_test = test[input_features]
y_test = test['result']

## Model Training and Prediction

In [127]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import  DecisionTreeClassifier
from sklearn.metrics import accuracy_score

base_estimator = DecisionTreeClassifier(
    max_depth=2,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

model = AdaBoostClassifier(
    estimator=base_estimator,
    n_estimators=200,
    learning_rate=0.1,
    random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

Accuracy: 70.0


In [128]:
import pickle

# Save the trained model to a file
filename = 'adaboost_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(model, file)

print(f"Model successfully saved to {filename}")

Model successfully saved to adaboost_model.pkl


### Make a one row ready for prediction

In [129]:
import pandas as pd
import numpy as np
import soccerdata as sd


class EPLMatchPredictorPreprocessor:

    def __init__(self, seasons, league="ENG-Premier League"):
        self.seasons = seasons
        self.league = league
        self.df = None

        self.input_features = [
            'home_shots_on_target_avg_last5',
            'away_shots_on_target_avg_last5',
            'home_shots_avg_last5',
            'away_shots_avg_last5',
            'home_team_goals_conceded_avg_last5',
            'away_team_goals_conceded_avg_last5',
            'home_goals_avg_last5',
            'away_goals_avg_last5',
            'home_points_last5_matches',
            'away_points_last5_matches',
            'points_diff_last5',
            'goal_diff_avg5',
            'shots_diff_avg5',
            'x_defense_diff',
            'home_advantage',
            'shots_on_target_diff_avg5'
        ]

    # -------------------------------
    # LOAD & PREPARE DATA
    # -------------------------------
    def load_data(self):
        dfs = []
        for season in self.seasons:
            mh = sd.MatchHistory(leagues=self.league, seasons=season)
            dfs.append(mh.read_games())

        df = pd.concat(dfs, ignore_index=True)

        df = df[[
            "date", "home_team", "away_team",
            "FTHG", "FTAG",
            "HS", "AS", "HST", "AST"
        ]].copy()

        df.rename(columns={
            "FTHG": "home_goals",
            "FTAG": "away_goals",
            "HS": "home_shots",
            "AS": "away_shots",
            "HST": "home_shots_on_target",
            "AST": "away_shots_on_target",
        }, inplace=True)

        df["date"] = pd.to_datetime(df["date"])
        df = df.sort_values("date").reset_index(drop=True)

        self.df = df

    # -------------------------------
    # LAST 5 MATCHES (HOME + AWAY)
    # -------------------------------
    def _get_last5_matches(self, team, before_date):
        df = self.df[
            ((self.df["home_team"] == team) | (self.df["away_team"] == team)) &
            (self.df["date"] < before_date)
        ].sort_values("date").tail(5)

        if len(df) < 5:
            raise ValueError(f"Not enough history for {team}")

        return df

    # -------------------------------
    # TEAM FORM (MEANS)
    # -------------------------------
    def _compute_team_form(self, matches, team):
        goals, shots, shots_ot, conceded = [], [], [], []

        for _, r in matches.iterrows():
            if r["home_team"] == team:
                goals.append(r["home_goals"])
                shots.append(r["home_shots"])
                shots_ot.append(r["home_shots_on_target"])
                conceded.append(r["away_goals"])
            else:
                goals.append(r["away_goals"])
                shots.append(r["away_shots"])
                shots_ot.append(r["away_shots_on_target"])
                conceded.append(r["home_goals"])

        return {
            "goals_avg": np.mean(goals),
            "shots_avg": np.mean(shots),
            "shots_ot_avg": np.mean(shots_ot),
            "conceded_avg": np.mean(conceded)
        }

    # -------------------------------
    # POINTS FROM LAST 5 MATCHES
    # -------------------------------
    def _compute_points_last5(self, matches, team):
        points = 0
        for _, r in matches.iterrows():
            if r["home_team"] == team:
                gf, ga = r["home_goals"], r["away_goals"]
            else:
                gf, ga = r["away_goals"], r["home_goals"]

            if gf > ga:
                points += 3
            elif gf == ga:
                points += 1

        return points

    # -------------------------------
    # BUILD ONE ROW FOR PREDICTION
    # -------------------------------
    def make_prediction_row(self, home_team, away_team, match_date, match_hour):
        match_date = pd.to_datetime(match_date)

        home_last5 = self._get_last5_matches(home_team, match_date)
        away_last5 = self._get_last5_matches(away_team, match_date)

        home_form = self._compute_team_form(home_last5, home_team)
        away_form = self._compute_team_form(away_last5, away_team)

        home_points = self._compute_points_last5(home_last5, home_team)
        away_points = self._compute_points_last5(away_last5, away_team)

        row = {
            "home_shots_on_target_avg_last5": home_form["shots_ot_avg"],
            "away_shots_on_target_avg_last5": away_form["shots_ot_avg"],

            "home_shots_avg_last5": home_form["shots_avg"],
            "away_shots_avg_last5": away_form["shots_avg"],

            "home_team_goals_conceded_avg_last5": home_form["conceded_avg"],
            "away_team_goals_conceded_avg_last5": away_form["conceded_avg"],

            "home_goals_avg_last5": home_form["goals_avg"],
            "away_goals_avg_last5": away_form["goals_avg"],

            "home_points_last5_matches": home_points,
            "away_points_last5_matches": away_points,

            "points_diff_last5": home_points - away_points,
            "goal_diff_avg5": home_form["goals_avg"] - away_form["goals_avg"],
            "shots_diff_avg5": home_form["shots_avg"] - away_form["shots_avg"],
            "shots_on_target_diff_avg5": home_form["shots_ot_avg"] - away_form["shots_ot_avg"],
            "x_defense_diff": away_form["conceded_avg"] - home_form["conceded_avg"],

            "home_advantage": 1
        }

        return pd.DataFrame([row])[self.input_features]

In [141]:
# Create object
prep = EPLMatchPredictorPreprocessor(
    seasons=[str(year) for year in range(2021, 2026)] # Load data from 2021 to 2025 for historical context
)

# Load data
prep.load_data()

# Build one row for prediction
one_row = prep.make_prediction_row(
    home_team="Nott'm Forest",
    away_team="Man City",
    match_date="2025-12-27",
    match_hour=8
)

In [142]:
one_row

Unnamed: 0,home_shots_on_target_avg_last5,away_shots_on_target_avg_last5,home_shots_avg_last5,away_shots_avg_last5,home_team_goals_conceded_avg_last5,away_team_goals_conceded_avg_last5,home_goals_avg_last5,away_goals_avg_last5,home_points_last5_matches,away_points_last5_matches,points_diff_last5,goal_diff_avg5,shots_diff_avg5,x_defense_diff,home_advantage,shots_on_target_diff_avg5
0,4.4,6.6,13.2,14.2,1.0,1.2,1.4,3.4,9,15,-6,-2.0,-1.0,0.2,1,-2.2


In [143]:
import pickle

# Load the trained model
filename = 'adaboost_model.pkl'
with open(filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Get probability predictions
probabilities = loaded_model.predict_proba(one_row)

# Display the probabilities
print("Prediction probabilities:")
for i, prob in enumerate(probabilities[0]):
    print(f"  {loaded_model.classes_[i]}: {prob:.4f}")

Prediction probabilities:
  Draw: 0.3072
  Lose: 0.4097
  Win: 0.2831
