In [128]:
import pandas as pd
import numpy as np

In [129]:
# Show every column when a DataFrame is displayed (None disables truncation).
# To go back to the pandas default: pd.reset_option('display.max_columns')
pd.options.display.max_columns = None

In [130]:
# Load one CSV per season; each variable is named after the season's starting year.
(
    season09,
    season10,
    season11,
    season12,
    season13,
    season14,
    season15,
    season16,
    season17,
) = map(
    pd.read_csv,
    (
        'data/season-0910.csv',
        'data/season-1011.csv',
        'data/season-1112.csv',
        'data/season-1213.csv',
        'data/season-1314.csv',
        'data/season-1415.csv',
        'data/season-1516.csv',
        'data/season-1617.csv',
        'data/season-1718.csv',
    ),
)

I will prepare features for each season table separately: the set of teams changes every year, and I want the data grouped by team within a season. I will also want about 5 matches of data for each team so that the running aggregations have something to work with.

In [131]:
# All season tables in chronological order, so they can be processed in a loop.
seasons = [
    season09,
    season10,
    season11,
    season12,
    season13,
    season14,
    season15,
    season16,
    season17,
]

In [132]:
# Parse each season's 'Date' column in place. The string length tells the two
# layouts apart: 8 characters is dd/mm/yy, 10 characters is dd/mm/yyyy.
for i, table in enumerate(seasons):
    # Once parsed, the column holds Timestamps (which have no len()), so skip
    # it; this makes the cell safe to re-run.
    if pd.api.types.is_datetime64_any_dtype(table['Date']):
        continue

    dates = table['Date'].map(len).unique()
    if len(dates) != 1:
        # Mixed layouts within one table: leave it untouched and flag it.
        print(f"more than 1 date format, check table {i}")
        continue

    date_format = {8: '%d/%m/%y', 10: '%d/%m/%Y'}.get(dates[0])
    if date_format is None:
        print(f"unexpected date format, check table {i}")
        continue

    table['Date'] = pd.to_datetime(table['Date'], format=date_format)

In [133]:
def _prior_expanding_mean(values):
    """Return, for each row, the mean of all earlier rows of ``values``.

    ``shift(1)`` excludes the current row, so the first row (no history) is NaN.
    For complete data this equals the cumulative sum of earlier rows divided by
    their count, and it stays a float Series.
    """
    return values.shift(1).expanding().mean()


def _prior_rolling_mean(values, window=3):
    """Return, for each row, the mean of up to ``window`` immediately preceding rows."""
    return values.shift(1).rolling(window=window, min_periods=1).mean()


def process_season_data(season_df, season_name):
    """Build per-match pre-game features for one season.

    Every match is viewed once from the home side and once from the away side,
    giving one chronological history per team. For each team and match, the
    averages of that team's earlier matches are computed (season-to-date
    ``total.*`` and last-three ``last3.*``) and joined back onto the match row
    with a ``home.`` or ``away.`` prefix. Venue-specific result rates
    (``*.whenHome`` / ``*.whenAway``) are added as well.

    Parameters
    ----------
    season_df : pandas.DataFrame
        One row per match. Must contain 'Date', 'HomeTeam', 'AwayTeam', 'FTR',
        'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF',
        'HC', 'AC', 'HY', 'AY', 'HR' and 'AR'. 'FTR' is read as 'H'/'D'/'A'.
        'Date' is expected to be already parsed to datetimes, because
        differences between dates are taken. The input frame is not modified.
    season_name : str
        Label written to the 'season' column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per input match, in input order, with 'Date', 'HomeTeam',
        'AwayTeam', 'FTR', the 'AwayWin'/'Draw'/'HomeWin' indicators, the
        prefixed feature columns and 'season'.
    """
    # One-hot encode the result field. Integer dtype keeps the arithmetic
    # numeric on every pandas version, and reindexing guarantees all three
    # outcome columns exist even if the season has, say, no draws.
    ftr_encoded = (
        pd.get_dummies(season_df['FTR'], prefix='', prefix_sep='', dtype=int)
        .reindex(columns=['H', 'D', 'A'], fill_value=0)
        .rename(columns={'H': 'HomeWin', 'D': 'Draw', 'A': 'AwayWin'})
    )

    # concat builds a new frame, so the caller's DataFrame is left untouched.
    season_df = pd.concat([season_df, ftr_encoded], axis=1)

    # Home-side view: keeps most data except the away team, the referee, the
    # half-time result and the cards received by the opposing team.
    homeMatches = season_df[['Date', 'HomeTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
                             'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'HR',
                             'HomeWin', 'Draw', 'AwayWin']]
    homeMatches = homeMatches.rename(columns={'HomeTeam': 'team',
                                              'HomeWin': 'win',
                                              'Draw': 'draw',
                                              'AwayWin': 'loss',
                                              'FTHG': 'goals',
                                              'FTAG': 'conceded',
                                              'FTR': 'result',
                                              'HTHG': 'half_goals',
                                              'HTAG': 'half_conceded',
                                              'HS': 'shots',
                                              'AS': 'shots_against',
                                              'HST': 'shots_target',
                                              'AST': 'shots_against_target',
                                              'HF': 'fouls',
                                              'AF': 'fouls_other_team',
                                              'HC': 'corners',
                                              'AC': 'corners_conceded',
                                              'HY': 'yellows',
                                              'HR': 'reds'})

    # Away-side view: same as above but with every away stat swapped with the
    # corresponding home stat.
    awayMatches = season_df[['Date', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
                             'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'AY', 'AR',
                             'HomeWin', 'Draw', 'AwayWin']]
    awayMatches = awayMatches.rename(columns={'AwayTeam': 'team',
                                              'AwayWin': 'win',
                                              'Draw': 'draw',
                                              'HomeWin': 'loss',
                                              'FTAG': 'goals',
                                              'FTHG': 'conceded',
                                              'FTR': 'result',
                                              'HTAG': 'half_goals',
                                              'HTHG': 'half_conceded',
                                              'AS': 'shots',
                                              'HS': 'shots_against',
                                              'AST': 'shots_target',
                                              'HST': 'shots_against_target',
                                              'AF': 'fouls',
                                              'HF': 'fouls_other_team',
                                              'AC': 'corners',
                                              'HC': 'corners_conceded',
                                              'AY': 'yellows',
                                              'AR': 'reds'})

    # Combine home and away matches into one chronological history per team.
    allMatches = (
        pd.concat([homeMatches, awayMatches], axis=0)
        .sort_values(by=['Date', 'team'])
        .reset_index(drop=True)
    )

    # Match number within the season and time since the team's previous match.
    allMatches['matchNumber'] = allMatches.groupby('team').cumcount() + 1
    allMatches['daysSince'] = allMatches.groupby('team')['Date'].diff()

    # Columns to calculate running averages for.
    stats = ['win', 'draw', 'loss', 'goals', 'conceded', 'half_goals', 'half_conceded',
             'shots', 'shots_against', 'shots_target', 'shots_against_target', 'corners',
             'corners_conceded', 'fouls', 'fouls_other_team', 'yellows', 'reds']

    # Season-to-date average of each stat over the team's earlier matches.
    for stat in stats:
        allMatches[f'total.{stat}'] = allMatches.groupby('team')[stat].transform(
            _prior_expanding_mean)

    # Average of each stat over the team's last (up to) 3 earlier matches.
    for stat in stats:
        allMatches[f'last3.{stat}'] = allMatches.groupby('team')[stat].transform(
            _prior_rolling_mean)

    # Drop the raw per-match stats: they describe the match being predicted.
    allMatches = allMatches.drop(columns=stats + ['result'])

    # Attach each side's pre-match features to the season's match rows.
    season_df = season_df[['Date', 'HomeTeam', 'AwayTeam', 'FTR', 'AwayWin', 'Draw', 'HomeWin']]
    for side, team_column in (('home', 'HomeTeam'), ('away', 'AwayTeam')):
        season_df = season_df.merge(
            allMatches.add_prefix(f'{side}.'),
            how='left',
            left_on=['Date', team_column],
            right_on=[f'{side}.Date', f'{side}.team'],
        ).drop(columns=[f'{side}.Date', f'{side}.team'])

    # Home v away performance by team (when home or away respectively).
    # The history is taken in date order so that "earlier matches" really are
    # earlier even if the input rows are not sorted; assignment aligns on the
    # index, so the output keeps the input row order.
    chronological = season_df.sort_values('Date', kind='mergesort')
    venue_features = {
        'home.total.win.whenHome': ('HomeTeam', 'HomeWin'),
        'home.total.draw.whenHome': ('HomeTeam', 'Draw'),
        'home.total.loss.whenHome': ('HomeTeam', 'AwayWin'),
        'away.total.win.whenAway': ('AwayTeam', 'AwayWin'),
        'away.total.draw.whenAway': ('AwayTeam', 'Draw'),
        'away.total.loss.whenAway': ('AwayTeam', 'HomeWin'),
    }
    for feature, (team_column, outcome_column) in venue_features.items():
        season_df[feature] = chronological.groupby(team_column)[outcome_column].transform(
            _prior_expanding_mean)

    # Adding a column to indicate the season.
    season_df['season'] = season_name

    return season_df




In [134]:
# Example usage:
season09_df = process_season_data(season09, 'season09')
season10_df = process_season_data(season10, 'season10')
season11_df = process_season_data(season11, 'season11')
season12_df = process_season_data(season12, 'season12')
season13_df = process_season_data(season13, 'season13')
season14_df = process_season_data(season14, 'season14')
season15_df = process_season_data(season15, 'season15')
season16_df = process_season_data(season16, 'season16')
season17_df = process_season_data(season17, 'season17')

In [135]:
# Stack the per-season feature tables into one frame with a fresh 0..n-1 index.
seasons_df = pd.concat(
    [
        season09_df,
        season10_df,
        season11_df,
        season12_df,
        season13_df,
        season14_df,
        season15_df,
        season16_df,
        season17_df,
    ],
    axis=0,
    ignore_index=True,
)

In [136]:
# The one-hot result indicators are no longer needed; 'FTR' still carries the result.
seasons_df = seasons_df.drop(columns=['HomeWin', 'Draw', 'AwayWin'])

In [137]:
# Persist the table that still has separate home.* and away.* feature columns.
pd.to_pickle(seasons_df, 'data/seasons_HA_df.pkl')

Aggregating most of the home and away teams' stats into home-minus-away differences, so there are fewer features to deal with.

In [138]:
# Home-side feature columns. The order matters: it must match, position by
# position, the away-side list and the list of difference column names.
home_columns = [
    # Time since the home team's previous match
    'home.daysSince',
    # Season-to-date averages
    'home.total.win', 'home.total.draw', 'home.total.loss',
    'home.total.goals', 'home.total.conceded',
    'home.total.half_goals', 'home.total.half_conceded',
    'home.total.shots', 'home.total.shots_against',
    'home.total.shots_target', 'home.total.shots_against_target',
    'home.total.corners', 'home.total.corners_conceded',
    'home.total.fouls', 'home.total.fouls_other_team',
    'home.total.yellows', 'home.total.reds',
    # Last-three-match averages
    'home.last3.win', 'home.last3.draw', 'home.last3.loss',
    'home.last3.goals', 'home.last3.conceded',
    'home.last3.half_goals', 'home.last3.half_conceded',
    'home.last3.shots', 'home.last3.shots_against',
    'home.last3.shots_target', 'home.last3.shots_against_target',
    'home.last3.corners', 'home.last3.corners_conceded',
    'home.last3.fouls', 'home.last3.fouls_other_team',
    'home.last3.yellows', 'home.last3.reds',
    # Result rates in home matches only
    'home.total.win.whenHome',
    'home.total.draw.whenHome',
    'home.total.loss.whenHome',
]

# Set the home features aside, then remove them from the main table.
home_data = seasons_df[home_columns]
seasons_df = seasons_df.drop(columns=home_columns)

In [139]:
# Away-side feature columns. The order matters: it must match, position by
# position, the home-side list and the list of difference column names.
away_columns = [
    # Time since the away team's previous match
    'away.daysSince',
    # Season-to-date averages
    'away.total.win', 'away.total.draw', 'away.total.loss',
    'away.total.goals', 'away.total.conceded',
    'away.total.half_goals', 'away.total.half_conceded',
    'away.total.shots', 'away.total.shots_against',
    'away.total.shots_target', 'away.total.shots_against_target',
    'away.total.corners', 'away.total.corners_conceded',
    'away.total.fouls', 'away.total.fouls_other_team',
    'away.total.yellows', 'away.total.reds',
    # Last-three-match averages
    'away.last3.win', 'away.last3.draw', 'away.last3.loss',
    'away.last3.goals', 'away.last3.conceded',
    'away.last3.half_goals', 'away.last3.half_conceded',
    'away.last3.shots', 'away.last3.shots_against',
    'away.last3.shots_target', 'away.last3.shots_against_target',
    'away.last3.corners', 'away.last3.corners_conceded',
    'away.last3.fouls', 'away.last3.fouls_other_team',
    'away.last3.yellows', 'away.last3.reds',
    # Result rates in away matches only
    'away.total.win.whenAway',
    'away.total.draw.whenAway',
    'away.total.loss.whenAway',
]

# Set the away features aside, then remove them from the main table.
away_data = seasons_df[away_columns]
seasons_df = seasons_df.drop(columns=away_columns)

In [140]:
# Names of the home-minus-away difference columns, in the same positional
# order as the home and away column lists: rest days first, then every stat
# for the season-to-date block, then every stat for the last-three block,
# then the venue-specific result rates.
new_column_names = (
    ['diff.daysSince']
    + [
        f'diff.{scope}.{stat}'
        for scope in ('total', 'last3')
        for stat in (
            'win', 'draw', 'loss', 'goals', 'conceded', 'half_goals',
            'half_conceded', 'shots', 'shots_against', 'shots_target',
            'shots_against_target', 'corners', 'corners_conceded', 'fouls',
            'fouls_other_team', 'yellows', 'reds',
        )
    ]
    + [f'diff.total.{outcome}.whenHorA' for outcome in ('win', 'draw', 'loss')]
)

In [141]:
# Home-minus-away difference for each positionally matched pair of feature
# columns. Subtracting column by column (rather than through `.values`, which
# collapses a frame mixing timedelta and numeric columns into one object array)
# lets every difference keep a proper dtype: timedelta for the rest-day gap,
# float for the averages.
assert len(new_column_names) == home_data.shape[1] == away_data.shape[1], \
    "home, away and difference column lists must have the same length"


def _column_difference(home_col, away_col):
    """Return home_col - away_col, converting an object-dtype result to numbers."""
    delta = home_col - away_col
    return pd.to_numeric(delta) if delta.dtype == object else delta


diff_df = pd.DataFrame(
    {
        diff_name: _column_difference(home_data[home_name], away_data[away_name])
        for diff_name, home_name, away_name in zip(
            new_column_names, home_data.columns, away_data.columns
        )
    },
    index=home_data.index,
)

In [143]:
# Put the difference features next to the remaining match columns (side by side).
seasons_df = pd.concat(
    [seasons_df, diff_df],
    axis=1,
)

In [144]:
# Express the rest-day difference as a number of days instead of a timedelta.
seasons_df = seasons_df.assign(
    **{'diff.daysSince': seasons_df['diff.daysSince'].dt.days}
)

In [145]:
# Persist the final table with the home-minus-away difference features.
pd.to_pickle(seasons_df, 'data/seasons_df.pkl')