In [1]:
# Import libraries
import pandas as pd
import os
import numpy as np
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation and performance warnings —
# confirm that suppressing all of them is intended.
warnings.filterwarnings("ignore")

# Set working directory. The relative data paths used when loading and
# exporting resolve against it. The location can be overridden through the
# THESIS_PROJECT_DIR environment variable so the notebook can run on another
# machine; when the variable is unset the original hard-coded folder is used.
os.chdir(os.environ.get('THESIS_PROJECT_DIR',
                        'C:\\Users\\aaron\\Desktop\\GitHub\\Thesis\\Python'))

In [2]:
# Season codes to load; each code is looked up later to obtain the season's
# starting year (e.g. '9899' -> 1998).
# NOTE(review): `time` has the same name as the standard-library module; the
# name is kept unchanged in case other cells refer to it.
time = ['9899', '9900', '0001', '0102', '0203', '0304', '0405', '0506',
        '0607', '0708', '0809', '0910', '1011', '1112', '1213', '1314',
        '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122']

# Columns kept for the seasons listed in `new_seasons`
new_season_cols = ['index', 'Matchweek', 'Home_Team', 'Away_Team', 'Home_Score', 'Away_Score',
                   'Home_Yellow_Cards', 'Home_Red_Cards', 'Away_Yellow_Cards', 'Away_Red_Cards',
                   'Team', 'Home_Away', 'Gls', 'PK', 'PKatt', 'CrdY', 'CrdR']

# Every other season additionally keeps 'Fls', 'Fld' and 'Off'
old_season_cols = new_season_cols + ['Fls', 'Fld', 'Off']

# Seasons that use the shorter column list
new_seasons = {'1718', '1819', '1920', '2021', '2122'}

# Folder with one workbook per season. Built from its components so the path
# separator matches the operating system instead of being fixed to '\'.
summary_dir = os.path.join('data', 'FBREF', 'summary')

# Read each season's first sheet, keep the wanted columns (names missing from
# a workbook are silently skipped by `filter`), and tag every row with its
# season code and a "<home> vs <away>" match label.
summ = {
    t: pd.read_excel(os.path.join(summary_dir, f"sum{t}.xlsx"), sheet_name=0)
    .rename(columns={'Unnamed: 0': 'index'})
    .filter(items=new_season_cols if t in new_seasons else old_season_cols)
    .assign(Season=t, Partido=lambda df: df['Home_Team'] + " vs " + df['Away_Team'])

    for t in time
}

# Concatenate all seasons into one frame with a fresh 0..n-1 index
summary = pd.concat(summ.values(), ignore_index=True)

In [3]:
# Define the columns for newer and older seasons
# Columns kept from each possession workbook
posscols = ['Matchweek', 'Home_Team', 'Away_Team', 'Team', 'Home_Away', 'Touches_Touches', 'DefPen_Touches']

# Seasons for which the possession, misc and passing workbooks are loaded
t2 = {'1718', '1819', '1920', '2021', '2122'}

# Folder with one workbook per season, built from its components so the path
# separator matches the operating system instead of being fixed to '\'.
possession_dir = os.path.join('data', 'FBREF', 'possession')

# Read each season's first sheet, keep the wanted columns and tag every row
# with its season code and a "<home> vs <away>" match label.
# The seasons are visited in sorted order: iterating the set directly has no
# stable order between interpreter sessions, which made the row order of
# `possession` change from one kernel restart to the next.
poss = {
    t: pd.read_excel(os.path.join(possession_dir, f"poss{t}.xlsx"), sheet_name=0)
    .rename(columns={'Unnamed: 0': 'index'})
    .filter(items=posscols)
    .assign(Season=t, Partido=lambda df: df['Home_Team'] + " vs " + df['Away_Team'])

    for t in sorted(t2)
}

# Concatenate all seasons into one frame with a fresh 0..n-1 index
possession = pd.concat(poss.values(), ignore_index=True)

In [4]:
# Define the columns for newer and older seasons
# Columns kept from each misc workbook
misccols = ['Matchweek', 'Home_Team', 'Away_Team', 'Team', 'Home_Away', 'PKwon', 'PKcon']

# Folder with one workbook per season, built from its components so the path
# separator matches the operating system instead of being fixed to '\'.
misc_dir = os.path.join('data', 'FBREF', 'misc')

# Read each season's first sheet, keep the wanted columns and tag every row
# with its season code and a "<home> vs <away>" match label.
# The per-season dict gets its own name so that `misc` only ever refers to the
# final DataFrame, and the seasons are visited in sorted order because
# iterating the set `t2` directly has no stable order between interpreter
# sessions.
misc_by_season = {
    t: pd.read_excel(os.path.join(misc_dir, f"dfmisc{t}.xlsx"), sheet_name=0)
    .rename(columns={'Unnamed: 0': 'index'})
    .filter(items=misccols)
    .assign(Season=t, Partido=lambda df: df['Home_Team'] + " vs " + df['Away_Team'])

    for t in sorted(t2)
}

# Concatenate all seasons into one frame with a fresh 0..n-1 index
misc = pd.concat(misc_by_season.values(), ignore_index=True)

In [5]:
# Define the columns for newer and older seasons
# Columns kept from each passing workbook
pascols = ['Matchweek', 'Home_Team', 'Away_Team', 'Team', 'Home_Away', 'KP', 'Final_Third', 'PPA']

# Folder with one workbook per season, built from its components so the path
# separator matches the operating system instead of being fixed to '\'.
passing_dir = os.path.join('data', 'FBREF', 'passing')

# Read each season's first sheet, keep the wanted columns and tag every row
# with its season code and a "<home> vs <away>" match label.
# The seasons are visited in sorted order: iterating the set `t2` directly has
# no stable order between interpreter sessions, which made the row order of
# `passing` change from one kernel restart to the next.
passc = {
    t: pd.read_excel(os.path.join(passing_dir, f"dfpassing{t}.xlsx"), sheet_name=0)
    .rename(columns={'Unnamed: 0': 'index'})
    .filter(items=pascols)
    .assign(Season=t, Partido=lambda df: df['Home_Team'] + " vs " + df['Away_Team'])

    for t in sorted(t2)
}

# Concatenate all seasons into one frame with a fresh 0..n-1 index
passing = pd.concat(passc.values(), ignore_index=True)

In [6]:
# Join keys: columns present in all four tables
mergin_vars = ['Matchweek', 'Season', 'Partido', 'Home_Team', 'Away_Team', 'Team', 'Home_Away']

# Start from the summary table and outer-join the remaining tables one after
# another, so rows that exist in only some of the tables are kept.
DataPMatchFBREF = summary
for extra_table in (possession, misc, passing):
    DataPMatchFBREF = pd.merge(DataPMatchFBREF, extra_table, on=mergin_vars, how='outer')

In [7]:
# Season code -> starting year, for the seasons 1998/99 through 2021/22.
# A code is the last two digits of the starting year followed by the last two
# digits of the next year, e.g. 1999 -> '9900' and 2021 -> '2122'.
season_to_year = {
    f"{start_year % 100:02d}{(start_year + 1) % 100:02d}": start_year
    for start_year in range(1998, 2022)
}

# Season codes without an entry in the mapping become NaN
DataPMatchFBREF['Year'] = DataPMatchFBREF['Season'].map(season_to_year)


In [8]:
# Print the merged frame's column names, non-null counts and dtypes
DataPMatchFBREF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18240 entries, 0 to 18239
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              17482 non-null  float64
 1   Matchweek          18240 non-null  object 
 2   Home_Team          18240 non-null  object 
 3   Away_Team          18240 non-null  object 
 4   Home_Score         18240 non-null  int64  
 5   Away_Score         18240 non-null  int64  
 6   Home_Yellow_Cards  18240 non-null  int64  
 7   Home_Red_Cards     18240 non-null  int64  
 8   Away_Yellow_Cards  18240 non-null  int64  
 9   Away_Red_Cards     18240 non-null  int64  
 10  Team               18240 non-null  object 
 11  Home_Away          18240 non-null  object 
 12  Gls                18240 non-null  int64  
 13  PK                 18240 non-null  int64  
 14  PKatt              6080 non-null   float64
 15  CrdY               18240 non-null  int64  
 16  CrdR               182

In [9]:
# Show the column labels of the merged frame
DataPMatchFBREF.columns

Index(['index', 'Matchweek', 'Home_Team', 'Away_Team', 'Home_Score',
       'Away_Score', 'Home_Yellow_Cards', 'Home_Red_Cards',
       'Away_Yellow_Cards', 'Away_Red_Cards', 'Team', 'Home_Away', 'Gls', 'PK',
       'PKatt', 'CrdY', 'CrdR', 'Fls', 'Fld', 'Off', 'Season', 'Partido',
       'Touches_Touches', 'DefPen_Touches', 'PKwon', 'PKcon', 'KP',
       'Final_Third', 'PPA', 'Year'],
      dtype='object')

In [10]:
# Statistics that get a separate 'Home<stat>' and 'Away<stat>' column
varsaux = ['Gls', 'PK', 'PKatt', 'Fls', 'Fld', 'Off', 'Touches_Touches', 'DefPen_Touches', 
           'PKwon', 'PKcon', 'KP', 'Final_Third', 'PPA']

# Row masks for the two values expected in 'Home_Away'
is_home_row = DataPMatchFBREF['Home_Away'] == 'Home'
is_away_row = DataPMatchFBREF['Home_Away'] == 'Away'

for stat in varsaux:
    stat_values = DataPMatchFBREF[stat]

    # Home column: the statistic on 'Home' rows, 0 on every other row
    DataPMatchFBREF['Home' + stat] = np.where(is_home_row, stat_values, 0)

    # Away column: the statistic on 'Away' rows, 0 elsewhere, then moved up by
    # one row so it lands on the row directly above (the last row becomes NaN).
    # NOTE(review): this only pairs the values correctly if every 'Away' row
    # sits immediately below the 'Home' row of the same match. The row order
    # comes from the outer merges — confirm it before relying on these columns.
    away_column = 'Away' + stat
    DataPMatchFBREF[away_column] = np.where(is_away_row, stat_values, 0)
    DataPMatchFBREF[away_column] = DataPMatchFBREF[away_column].shift(periods=-1)


In [11]:
# Export the merged table to an Excel workbook in the current working directory.
# No `index` argument is passed; NOTE(review): by default pandas also writes the
# row index as an extra first column — confirm that is wanted.
DataPMatchFBREF.to_excel('DataPMatchFBREF.xlsx')

NameError: name 'xxx' is not defined

In [None]:
# Export the merged table to an Excel workbook in the current working directory.
# NOTE(review): an earlier cell already writes the same file — confirm whether
# this second export is still needed.
DataPMatchFBREF.to_excel('DataPMatchFBREF.xlsx')

In [None]:
# Show the column labels of the merged frame
DataPMatchFBREF.columns

In [None]:
# NOTE(review): the two bare names below are not defined in any cell visible
# here, so evaluating them raises NameError and nothing after them in this
# cell runs. Presumably an intentional "stop here" marker for Run All —
# confirm, then replace it with an explicit mechanism or remove it.
xxxxxxxxx
xxx
def standard_teams(df):
    """Replace long club names with their short forms, in place.

    Each key of the internal mapping is replaced by its value in the
    'Home_Team', 'Away_Team', 'PenaltyTaker' and 'PenaltyConc' columns of
    ``df``. The replacements run one after another in mapping order, match
    substrings, and treat every key as a regular expression.

    NOTE(review): this function name is defined again in later cells; the
    most recently executed definition is the one in effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four string columns listed above. It is
        modified in place.

    Returns
    -------
    None
    """
    short_names = {
        'Albacete Balompié': 'Albacete',
        'Athletic Bilbao': 'Athletic Club',
        'Atlético de Madrid': 'Atlético Madrid',
        'CA Osasuna': 'Osasuna',
        'Cádiz CF': 'Cádiz',
        'CD Leganés': 'Leganés',
        'CD Numancia': 'Numancia',
        'CD Tenerife': 'Tenerife',
        'Celta de Vigo': 'Celta Vigo',
        'Córdoba CF': 'Córdoba',
        'Deportivo Alavés': 'Alavés',
        'Deportivo de La Coruña': 'La Coruña',
        'Elche CF': 'Elche',
        'FC Barcelona': 'Barcelona',
        'Getafe CF': 'Getafe',
        'Gimnàstic de Tarragona': 'Gimnàstic',
        'Girona FC': 'Girona',
        'Granada CF': 'Granada',
        'Hércules CF': 'Hércules',
        'Levante UD': 'Levante',
        'Málaga CF': 'Málaga',
        'Racing Santander': 'Racing Sant',
        'RCD Espanyol Barcelona': 'Espanyol',
        'RCD Mallorca': 'Mallorca',
        'Real Betis Balompié': 'Betis',
        'Real Murcia CF': 'Real Murcia',
        'Real Oviedo': 'Oviedo',
        'Real Valladolid': 'Valladolid',
        'Real Zaragoza': 'Zaragoza',
        'Recreativo Huelva': 'Recreativo',
        'SD Eibar': 'Eibar',
        'SD Huesca': 'Huesca',
        'Sevilla FC': 'Sevilla',
        'UD Almería': 'Almería',
        'UD Las Palmas': 'Las Palmas',
        'Valencia CF': 'Valencia',
        'Valladolid CF': 'Valladolid',
        'Villarreal CF': 'Villarreal',
        'Xerez CD': 'Xerez',
    }

    target_columns = ('Home_Team', 'Away_Team', 'PenaltyTaker', 'PenaltyConc')
    for column in target_columns:
        # Apply every replacement to the column, then write it back once
        cleaned = df[column]
        for long_name, short_name in short_names.items():
            cleaned = cleaned.str.replace(long_name, short_name, regex = True)
        df[column] = cleaned

# Apply the function
# NOTE(review): `missed` is not defined in any cell visible here — confirm it
# is created before this call runs.
standard_teams(missed)

In [None]:
def standard_teams(df):
    """Repair mis-encoded accented letters and shorten two club names, in place.

    Each key of the internal mapping (mis-encoded sequences such as 'Ã©',
    plus 'Girona FC' and 'SD Huesca') is replaced by its value in the
    'Home_Team', 'Away_Team', 'PenaltyTaker' and 'PenaltyConc' columns of
    ``df``. The replacements run one after another in mapping order, match
    substrings, and treat every key as a regular expression.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four string columns listed above. It is
        modified in place.

    Returns
    -------
    None
    """
    fixes = {
        'Ã¡' : 'á', 'Ã ' : 'à', 'Ã©' : 'é', 'Ã­' : 'í', 
        'Ã³' : 'ó', 'Ã±' : 'ñ', 'Girona FC' : 'Girona', 
        'SD Huesca' : 'Huesca', 
    }

    target_columns = ('Home_Team', 'Away_Team', 'PenaltyTaker', 'PenaltyConc')
    for column in target_columns:
        # Apply every replacement to the column, then write it back once
        repaired = df[column]
        for broken, fixed in fixes.items():
            repaired = repaired.str.replace(broken, fixed, regex = True)
        df[column] = repaired

# Apply the function
# NOTE(review): `missed` is not defined in any cell visible here — confirm it
# is created before this call runs.
standard_teams(missed)


In [None]:
def standard_teams(df):
    """Strip separator and currency fragments from the team columns, in place.

    Each key of the internal mapping (', ', 'â‚¬ ' and '£') is removed from
    the 'Home_Team', 'Away_Team', 'PenaltyTaker' and 'PenaltyConc' columns of
    ``df``. The removals run one after another in mapping order, match
    substrings, and treat every key as a regular expression.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four string columns listed above. It is
        modified in place.

    Returns
    -------
    None
    """
    # NOTE(review): 'â‚¬ ' looks like a mis-encoded euro sign followed by a
    # space — confirm against the raw data.
    unwanted = {        
        ', ' : '', 'â‚¬ ' : '', '£' : '', 
    }

    target_columns = ('Home_Team', 'Away_Team', 'PenaltyTaker', 'PenaltyConc')
    for column in target_columns:
        # Apply every removal to the column, then write it back once
        stripped = df[column]
        for fragment, substitute in unwanted.items():
            stripped = stripped.str.replace(fragment, substitute, regex = True)
        df[column] = stripped

# Apply the function
# NOTE(review): `missed` is not defined in any cell visible here — confirm it
# is created before this call runs.
standard_teams(missed)


In [None]:
def standard_teams(df):
    """Apply the (currently empty) replacement mapping to the team columns.

    The mapping has no entries, so as written the function leaves ``df``
    untouched and never reads any of its columns. Entries added to the
    mapping would be applied, in mapping order and as regular expressions,
    to the 'Home_Team', 'Away_Team', 'PenaltyTaker' and 'PenaltyConc'
    columns, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    None
    """
    pending_pairs = list({
    }.items())

    target_columns = ('Home_Team', 'Away_Team', 'PenaltyTaker', 'PenaltyConc')
    for column in target_columns:
        # A column is only read when there is at least one pair to apply
        for pattern, substitute in pending_pairs:
            df[column] = df[column].str.replace(pattern, substitute, regex = True)

# Apply the function
# NOTE(review): `missed` is not defined in any cell visible here — confirm it
# is created before this call runs.
standard_teams(missed)
