In [1]:
# load from csv to pandas dataframe
import pandas as pd

# load data
matches_df = pd.read_csv('fbref_data.csv')
matches_df.shape

(1456, 27)

In [2]:
# converts date column to datetime
matches_df["date"] = pd.to_datetime(matches_df["date"])

In [3]:
matches_df["venue_code"] = matches_df["venue"].astype('category').cat.codes

In [4]:
matches_df["opp_code"] = matches_df["opponent"].astype('category').cat.codes

In [5]:
matches_df["hour"] = matches_df["time"].str.replace(':.+', '', regex=True).astype(int)

In [6]:
matches_df["day_code"] = matches_df["date"].dt.dayofweek

In [7]:
matches_df["target"] = (matches_df["result"] == "W").astype(int)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

# useful for non-linear tendencies
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1
)

predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [9]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

grouped_matches = matches_df.groupby("team")
group = grouped_matches.get_group("Manchester City")

In [10]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group


In [11]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [12]:
matches_rollings = matches_df.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rollings = matches_rollings.droplevel('team')
matches_rollings.index =  range(matches_rollings.shape[0])

  matches_rollings = matches_df.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [13]:
def make_predictions(data, predictors):
    training_data = data[data["date"] < "2023-01-01"]
    test_data = data[data["date"] > "2023-01-01"]
    rf.fit(training_data[predictors], training_data["target"])
    preds = rf.predict(test_data[predictors])
    combined = pd.DataFrame(dict(actual=test_data["target"], prediction=preds))
    precision = precision_score(test_data["target"], preds)
    return combined, precision

In [14]:
combined, precision = make_predictions(matches_rollings, predictors)

In [15]:
precision

0.5033333333333333

In [16]:
combined = combined.merge(matches_rollings[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
combined

Unnamed: 0,actual,prediction,date,team,opponent,result
13,0,0,2023-01-03,Arsenal,Newcastle Utd,D
14,1,1,2023-01-15,Arsenal,Tottenham,W
15,1,0,2023-01-22,Arsenal,Manchester Utd,W
16,0,0,2023-02-04,Arsenal,Everton,L
17,0,0,2023-02-11,Arsenal,Brentford,D
...,...,...,...,...,...,...
1382,0,1,2024-04-06,Wolverhampton Wanderers,West Ham,L
1383,0,1,2024-04-13,Wolverhampton Wanderers,Nott'ham Forest,D
1384,0,0,2024-04-20,Wolverhampton Wanderers,Arsenal,L
1385,0,1,2024-04-24,Wolverhampton Wanderers,Bournemouth,L


In [17]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton & Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}

mapping = MissingDict(**map_values)
mapping["Brighton & Hove Albion"]

'Brighton'

In [18]:
combined["new_team"] = combined["team"].map(mapping)
combined

Unnamed: 0,actual,prediction,date,team,opponent,result,new_team
13,0,0,2023-01-03,Arsenal,Newcastle Utd,D,Arsenal
14,1,1,2023-01-15,Arsenal,Tottenham,W,Arsenal
15,1,0,2023-01-22,Arsenal,Manchester Utd,W,Arsenal
16,0,0,2023-02-04,Arsenal,Everton,L,Arsenal
17,0,0,2023-02-11,Arsenal,Brentford,D,Arsenal
...,...,...,...,...,...,...,...
1382,0,1,2024-04-06,Wolverhampton Wanderers,West Ham,L,Wolves
1383,0,1,2024-04-13,Wolverhampton Wanderers,Nott'ham Forest,D,Wolves
1384,0,0,2024-04-20,Wolverhampton Wanderers,Arsenal,L,Wolves
1385,0,1,2024-04-24,Wolverhampton Wanderers,Bournemouth,L,Wolves


In [19]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,prediction_y,team_y,opponent_y,result_y,new_team_y
0,0,0,2023-01-03,Arsenal,Newcastle Utd,D,Arsenal,0,0,Newcastle United,Arsenal,D,Newcastle Utd
1,1,1,2023-01-15,Arsenal,Tottenham,W,Arsenal,0,0,Tottenham Hotspur,Arsenal,L,Tottenham
2,1,0,2023-01-22,Arsenal,Manchester Utd,W,Arsenal,0,0,Manchester United,Arsenal,L,Manchester Utd
3,0,0,2023-02-04,Arsenal,Everton,L,Arsenal,1,0,Everton,Arsenal,W,Everton
4,0,0,2023-02-11,Arsenal,Brentford,D,Arsenal,0,0,Brentford,Arsenal,D,Brentford
...,...,...,...,...,...,...,...,...,...,...,...,...,...
959,0,1,2024-04-06,Wolverhampton Wanderers,West Ham,L,Wolves,1,1,West Ham United,Wolves,W,West Ham
960,0,1,2024-04-13,Wolverhampton Wanderers,Nott'ham Forest,D,Wolves,0,1,Nottingham Forest,Wolves,D,Nottingham Forest
961,0,0,2024-04-20,Wolverhampton Wanderers,Arsenal,L,Wolves,1,1,Arsenal,Wolves,W,Arsenal
962,0,1,2024-04-24,Wolverhampton Wanderers,Bournemouth,L,Wolves,1,0,Bournemouth,Wolves,W,Bournemouth


In [20]:
merged[(merged["prediction_x"] == 1) & (merged["prediction_y"] == 0)]["actual_x"].value_counts()

actual_x
1    114
0    113
Name: count, dtype: int64

In [21]:
114 + 113

114 /227

0.5022026431718062