Here we will connect 2 types of dfs:
1. The matches db that includes the __names__ of the players
2. The player ratings db that inclueds also the __names__ of the players

We will connect via the closest string for each player in a match in order to get:
1. player rating
2. player age
3. player height

In [106]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from bs4 import BeautifulSoup
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

First we need to fix the club name issue, for exmaple Man utd in the matches db is not the same as Manchester United

In [107]:
def add_form(processed_csv, standings_csv):

    season = pd.read_csv(processed_csv)
    standings = pd.read_csv(standings_csv)
    standings.set_index("Team", inplace=True)
    names_fix = {'Man Utd': "Manchester United",
             "Man City": 'Manchester City',
             "West Ham": "West Ham United",
             "Nott'm Forest": "Nottingham Forest",
             'Spurs': "Tottenham Hotspur",
             'Wolves': "Wolverhampton Wanderers",
             "Brighton & Hove Albion": "Brighton and Hove Albion",
             'Newcastle': 'Newcastle United',
             'Leicester': 'Leicester City',
             'Leeds': "Leeds United",
             'Huddersfield' : 'Huddersfield Town',
             'Swansea' : 'Swansea City',
             'Cardiff': 'Cardiff City',
             'Norwich' : 'Norwich City',
             'Stoke' : 'Stoke City',
             'West Brom' : 'West Bromwich Albion',
             'Hull' : 'Hull City',
             'QPR' : 'Queens Park Rangers',
             'Sheffield Utd' : 'Sheffield United',
             "AFC Bournemouth" :"Bournemouth"}
    columns = list(standings.columns)
    for index, row in season.iterrows():
        matchweek = row['Matchweek']

        if matchweek > 1:
            rank_col = columns[columns.index("Matchweek "+str(matchweek)+"_Rank")-3]
            points_col = columns[columns.index("Matchweek "+str(matchweek)+"_Rank")-2]
            # Get teams at specific ranks
            first_place_team = None
            fourth_place_team = None
            seventeenth_place_team = None
            
            for team, rank in standings[rank_col].items():
                if rank == 1:
                    first_place_team = team
                elif rank == 4:
                    fourth_place_team = team
                elif rank == 17:
                    seventeenth_place_team = team
            # print('first place:',first_place_team)
            # print('fourth:',fourth_place_team)
            # print('relegeated:',seventeenth_place_team)
                    
            home_points_prior = season.at[index, 'home_Points_prior'] if season.at[index, 'home_Points_prior'] else 0
            away_points_prior = season.at[index, 'away_Points_prior'] if season.at[index, 'away_Points_prior'] else 0
            first_place_points = standings.loc[first_place_team, points_col] if first_place_team else 0
            fourth_place_points = standings.loc[fourth_place_team, points_col] if fourth_place_team else 0
            seventeenth_place_points = standings.loc[seventeenth_place_team, points_col] if seventeenth_place_team else 0
            
            season.at[index, 'home_points_to_championship'] = first_place_points - home_points_prior/(38-matchweek+1)
            season.at[index, 'home_points_to_ucl'] = fourth_place_points - home_points_prior/(38-matchweek+1)
            season.at[index, 'home_points_to_rel'] = seventeenth_place_points - home_points_prior/(38-matchweek+1)

            season.at[index, 'away_points_to_championship'] = first_place_points - away_points_prior/(38-matchweek+1)
            season.at[index, 'away_points_to_ucl'] = fourth_place_points - away_points_prior/(38-matchweek+1)
            season.at[index, 'away_points_to_rel'] = seventeenth_place_points - away_points_prior/(38-matchweek+1)

        else:
            season.at[index, 'home_points_to_championship'] = 0
            season.at[index, 'home_points_to_ucl'] = 0
            season.at[index, 'home_points_to_rel'] = 0

            season.at[index, 'away_points_to_championship'] = 0
            season.at[index, 'away_points_to_ucl'] = 0
            season.at[index, 'away_points_to_rel'] = 0

        ## setting match importance:
            
        home_chances = np.array([season.at[index, 'home_points_to_championship'],
                                 season.at[index, 'home_points_to_ucl'],
                                 season.at[index, 'home_points_to_rel']])
        away_chances = np.array([season.at[index, 'away_points_to_championship'],
                                 season.at[index, 'away_points_to_ucl'],
                                 season.at[index, 'away_points_to_rel']])
        home_importances = np.abs(home_chances)
        away_importances = np.abs(away_chances)
        home_imp = home_chances[np.argmin(home_importances)]
        away_imp = away_chances[np.argmin(away_importances)]
        if abs(home_imp) > 3:
            home_imp = 0
        if abs(away_imp) > 3:
            away_imp = 0
        season.at[index, 'home_match_importance'] = home_imp
        season.at[index, 'away_match_importance'] = away_imp
        if columns.index("Matchweek "+str(matchweek)+"_Rank") < 17: #if we are in the first 5 matches
            season.at[index, 'home_GD_form'] = season.at[index, 'home_GD_prior']
            season.at[index, 'home_Points_form'] = season.at[index, 'home_GD_prior']
            season.at[index, 'home_GD_form_pw'] = season.at[index, 'home_GD_form'] / matchweek
            season.at[index, 'home_Points_form_pw'] = season.at[index, 'home_Points_form'] / matchweek

            season.at[index, 'away_Points_form'] = season.at[index, 'away_Points_prior']     
            season.at[index, 'away_GD_form'] = season.at[index, 'away_GD_prior']
            season.at[index, 'away_GD_form_pw'] = season.at[index, 'away_GD_form'] / matchweek
            season.at[index, 'away_Points_form_pw'] = season.at[index, 'away_Points_form'] / matchweek
            
            
            
        else:
            prior_gd_col = columns[columns.index("Matchweek "+str(matchweek)+"_GD")-18]
            prior_points_col = columns[columns.index("Matchweek "+str(matchweek)+"_Points")-18]
            home_team = season.at[index, 'home_team_name']
            away_team = season.at[index, 'away_team_name']

            if home_team in names_fix.keys():
                home_team = names_fix[home_team]
            if away_team in names_fix.keys():
                away_team = names_fix[away_team]
            #getting for home
            home_gd_prior = standings.at[home_team, prior_gd_col]
            home_points_prior = standings.at[home_team, prior_points_col]
            if(home_gd_prior == None or np.isnan(home_gd_prior)):
                home_gd_prior=0
            if(home_points_prior == None or np.isnan(home_points_prior)):
                home_points_prior = 0
            
            prior_gw = int(prior_gd_col[:len(prior_gd_col)-3].split()[1])
            #getting for away
            away_gd_prior = standings.at[away_team, prior_gd_col]
            away_points_prior = standings.at[away_team, prior_points_col]
            if away_gd_prior == None or np.isnan(away_gd_prior):
                away_gd_prior = 0
            if away_points_prior == None or np.isnan(away_points_prior):
                away_points_prior = 0


            season.at[index, 'home_GD_form'] = season.at[index, 'home_GD_prior'] - home_gd_prior
            season.at[index, 'home_Points_form'] = season.at[index, 'home_Points_prior'] - home_points_prior
            season.at[index, 'home_GD_form_pw'] = season.at[index, 'home_GD_form'] / (matchweek - prior_gw + 1)
            season.at[index, 'home_Points_form_pw'] = season.at[index, 'home_Points_form'] / (matchweek -prior_gw+1)
            season.at[index, 'away_GD_form'] = season.at[index, 'away_GD_prior'] - away_gd_prior
            season.at[index, 'away_Points_form'] = season.at[index, 'away_Points_prior'] - away_points_prior
            season.at[index, 'away_GD_form_pw'] = season.at[index, 'away_GD_form'] / (matchweek - prior_gw+1)
            season.at[index, 'away_Points_form_pw'] = season.at[index, 'away_Points_form'] / (matchweek -prior_gw+1)

    return season

In [108]:
def add_betting(df, season):
    betting = pd.read_csv("betting/"+season+"_betting.csv")
    # file_name = file_path.split('/')[-1]  # Extracts the file name from the path
    # epl_string = file_name.split('_')[0]

    names_fix = {'Man Utd': "Manchester United",
             'Man United' : "Manchester United",
             "Man City": 'Manchester City',
             "West Ham": "West Ham United",
             "Nott'm Forest": "Nottingham Forest",
             'Spurs': "Tottenham Hotspur",
             'Tottenham' : "Tottenham Hotspur",
             'Wolves': "Wolverhampton Wanderers",
             "Brighton and Hove Albion": "Brighton & Hove Albion",
              "Brighton": "Brighton & Hove Albion",  
             "Bournemouth": "AFC Bournemouth",
             'Newcastle': 'Newcastle United',
             'Leicester': 'Leicester City',
             'Leeds': "Leeds United",
             'Huddersfield' : 'Huddersfield Town',
             'Swansea' : 'Swansea City',
             'Cardiff': 'Cardiff City',
             'Norwich' : 'Norwich City',
             'Stoke' : 'Stoke City',
             'West Brom' : 'West Bromwich Albion',
             'Hull' : 'Hull City',
             'QPR' : 'Queens Park Rangers',
             'Sheffield Utd' : 'Sheffield United'}
    
    for key, val in names_fix.items():
        betting['HomeTeam'].replace(key, val, inplace=True)
        betting['AwayTeam'].replace(key, val, inplace=True)
    print('replaced all the names')
    for index, row in df.iterrows():
        # add the AvgA, AvgD, AvgH for home_team_name == HomeTeam and away_team_name === AwayTeam
        home_team_name = row['home_team_name']
        away_team_name = row['away_team_name']
        # Filter betting DataFrame to find corresponding rows
        if (not home_team_name in betting['HomeTeam'].values):
            print("Fix the names:",home_team_name)
        if (not away_team_name  in betting['AwayTeam'].values):
            print("Fix the names:",away_team_name)
        matching_row = betting[(betting['HomeTeam'] == home_team_name) & (betting['AwayTeam'] == away_team_name)]
        matching_row = matching_row.iloc[0]

        print(matching_row)
        df.at[index, 'B365A'] = matching_row['B365A']
        df.at[index, 'B365D'] = matching_row['B365D']
        df.at[index, 'B365H'] = matching_row['B365H']

In [109]:
df = add_form('processed Datasets/MatchDB/epl2223_proccessed.csv', 'standings/standings_epl2223.csv')

In [110]:
def add_forms_pipeline(seasons, standings_list):
    for i in range(len(seasons)):
        file_name = seasons[i].split('/')[-1]  # Extracts the file name from the path
        epl_string = file_name.split('_')[0]
        new_season = add_form (seasons[i], standings_list[i])
        add_betting(new_season, epl_string)
        new_name = seasons[i].split('.')[0]+'_with_form.csv'
        new_season.to_csv(new_name)


In [111]:
add_forms_pipeline(['processed Datasets/MatchDB/epl1516_proccessed.csv', 
                    'processed Datasets/MatchDB/epl2122_proccessed.csv',
          'processed Datasets/MatchDB/epl1819_proccessed.csv', 'processed Datasets/MatchDB/epl1617_proccessed.csv',
          'processed Datasets/MatchDB/epl1718_proccessed.csv', 'processed Datasets/MatchDB/epl1415_proccessed.csv',
            'processed Datasets/MatchDB/epl1920_proccessed.csv', 'processed Datasets/MatchDB/epl2021_proccessed.csv',
              'processed Datasets/MatchDB/epl2223_proccessed.csv'],
          ['standings/standings_epl1516.csv','standings/standings_epl2122.csv',
           'standings/standings_epl1819.csv','standings/standings_epl1617.csv',
           'standings/standings_epl1718.csv','standings/standings_epl1415.csv',
           'standings/standings_epl1920.csv','standings/standings_epl2021.csv',
           'standings/standings_epl2223.csv'])

replaced all the names
Div                        E0
Date               17/05/2016
HomeTeam    Manchester United
AwayTeam      AFC Bournemouth
FTHG                        3
                  ...        
BbMxAHA                  2.08
BbAvAHA                  2.02
PSCH                      1.5
PSCD                      4.6
PSCA                     7.03
Name: 379, Length: 65, dtype: object
Div                  E0
Date         15/05/2016
HomeTeam        Arsenal
AwayTeam    Aston Villa
FTHG                  4
               ...     
BbMxAHA            1.99
BbAvAHA            1.94
PSCH               1.12
PSCD              12.03
PSCA               23.6
Name: 370, Length: 65, dtype: object
Div                     E0
Date            15/05/2016
HomeTeam           Chelsea
AwayTeam    Leicester City
FTHG                     1
                 ...      
BbMxAHA               1.92
BbAvAHA               1.87
PSCH                  2.01
PSCD                  3.99
PSCA                  3.68
Name: 371, L