In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# Scraper output files, resolved relative to the current working directory.
MATCHES_CSV = "matches.csv"
STANDINGS_CSV = "standings_2014_15_to_2023_24.csv"

# The first CSV column of the matches file is used as the row index; the
# standings file is read with the default integer index.
matches = pd.read_csv(MATCHES_CSV, index_col=0)
standings = pd.read_csv(STANDINGS_CSV)


# Lookup helper for normalising team names: unknown names map to themselves.
class MissingDict(dict):
    """A ``dict`` whose failed item lookups return the requested key.

    ``d[key]`` gives the stored value when ``key`` is present and ``key``
    itself otherwise. A miss does not insert anything into the dictionary,
    and ``get`` / ``in`` behave exactly as on a plain ``dict``.
    """

    def __missing__(self, key):
        # Called by dict.__getitem__ when `key` is absent: echo it back.
        return key


# (long club name, short spelling) pairs used to rewrite matches["team"].
map_values = dict(
    [
        ("Brighton and Hove Albion", "Brighton"),
        ("Manchester United", "Manchester Utd"),
        ("Newcastle United", "Newcastle Utd"),
        ("Tottenham Hotspur", "Tottenham"),
        ("West Ham United", "West Ham"),
        ("Wolverhampton Wanderers", "Wolves"),
    ]
)

# MissingDict returns the key itself on a miss, so team names that are not
# listed above pass through Series.map unchanged instead of becoming NaN.
mapping = MissingDict(map_values)
matches["team"] = matches["team"].map(mapping)

# Season immediately before each match's season; used as the join key when
# looking up the previous season's standings.
matches["prev_season"] = matches["season"].sub(1)


# Attach the previous season's standings rank to every match row, once for the
# team ("prev_rank") and once for the opponent ("opp_prev_rank").
#
# Both joins are left joins keyed on (prev_season, <club column>), so a club
# with no standings row for the previous season keeps its match rows and gets
# NaN in the corresponding rank column.
#
# validate="many_to_one" makes pandas raise MergeError if the standings table
# holds more than one row for the same (season, club) pair. Without the check
# such duplicates would silently multiply match rows in merged_df.
team_standings = standings.rename(
    columns={"season": "prev_season", "rank": "prev_rank"}
)
merged_df = matches.merge(
    team_standings,
    on=["prev_season", "team"],
    how="left",
    validate="many_to_one",
)

# Same lookup for the opponent: the standings "team" column is renamed so it
# lines up with the "opponent" column of the match rows.
opponent_standings = standings.rename(
    columns={"season": "prev_season", "team": "opponent", "rank": "opp_prev_rank"}
)
merged_df = merged_df.merge(
    opponent_standings,
    on=["prev_season", "opponent"],
    how="left",
    validate="many_to_one",
)

# Flag rows where the team has no previous-season rank. This must be computed
# before the gaps are filled in below.
# NOTE(review): presumably these are newly promoted clubs - confirm that no
# other cause (e.g. a team-name mismatch in the merge) produces missing ranks.
merged_df["promote"] = merged_df["prev_rank"].isna().astype(int)

# Missing ranks become 20 and both rank columns end up as integers. -1 is a
# temporary placeholder so the integer cast can happen before the final value
# is substituted.
for rank_col in ("prev_rank", "opp_prev_rank"):
    placeholder_filled = merged_df[rank_col].fillna(-1).astype(int)
    merged_df[rank_col] = placeholder_filled.replace(-1, 20)

# Prepare model inputs: parse the date column and turn text columns into
# integer codes.
merged_df["date"] = pd.to_datetime(merged_df["date"])

# Integer category codes for the venue and opponent columns.
for source_col, code_col in (("venue", "venue_code"), ("opponent", "opp_code")):
    merged_df[code_col] = merged_df[source_col].astype("category").cat.codes

# Kick-off hour: drop everything from the first ":" onward, then cast to int.
hour_text = merged_df["time"].str.replace(":.*", "", regex=True)
merged_df["hour"] = hour_text.astype("int")

# Day of the week as an integer (Monday = 0 ... Sunday = 6).
match_dates = merged_df["date"]
merged_df["day_code"] = match_dates.dt.dayofweek

# Prediction target from the match result. Category codes follow the sorted
# category labels, so with results W / L / D this gives D = 0, L = 1, W = 2.
result_categories = merged_df["result"].astype("category")
merged_df["target"] = result_categories.cat.codes

# Team's previous-season rank minus the opponent's previous-season rank.
merged_df["rank_diff"] = merged_df["prev_rank"].sub(merged_df["opp_prev_rank"])


In [3]:
# Display the column labels of merged_df.
merged_df.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'opp formation', 'referee', 'match report', 'notes', 'sh', 'sot',
       'dist', 'fk', 'pk', 'pkatt', 'season', 'team', 'prev_season',
       'prev_rank', 'opp_prev_rank', 'promote', 'venue_code', 'opp_code',
       'hour', 'day_code', 'target', 'rank_diff'],
      dtype='object')