In [1]:
# Import libraries
import pandas as pd
import os
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Set working directory
# NOTE(review): absolute, machine-specific path. The relative data paths used by
# the loading cells below (e.g. r'data\FBREF\summary') are resolved against it.
os.chdir('C:\\Users\\aaron\\Desktop\\GitHub\\Thesis\\Python')

In [2]:
# Summary data
# Season codes; each one names a workbook sum<code>.xlsx
time = ['9899', '9900', '0001', '0102', '0203', '0304', '0405', '0506',
        '0607', '0708', '0809', '0910', '1011', '1112', '1213', '1314',
        '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122']

# Columns retained for the seasons listed in `new_seasons`
new_season_cols = ['index', 'Matchweek', 'Home_Team', 'Away_Team', 'Home_Score', 'Away_Score',
                   'Home_Yellow_Cards', 'Home_Red_Cards', 'Away_Yellow_Cards', 'Away_Red_Cards',
                   'Team', 'Home_Away', 'Gls', 'PK', 'PKatt', 'CrdY', 'CrdR']

# Every other season additionally retains these three columns
old_season_cols = new_season_cols + ['Fls', 'Fld', 'Off']

# Seasons that use the shorter column list
new_seasons = {'1718', '1819', '1920', '2021', '2122'}


def _load_summary_season(season):
    """Read one season's summary workbook and return its trimmed, tagged frame."""
    wanted = new_season_cols if season in new_seasons else old_season_cols
    raw = pd.read_excel(os.path.join(r'data\FBREF\summary', f"sum{season}.xlsx"), sheet_name=0)
    # filter(items=...) keeps only the listed columns that are actually present
    trimmed = raw.rename(columns={'Unnamed: 0': 'index'}).filter(items=wanted)
    return trimmed.assign(
        Season=season,
        Partido=trimmed['Home_Team'] + " vs " + trimmed['Away_Team'],
    )


# One frame per season, keyed by season code
summ = {t: _load_summary_season(t) for t in time}

# Stack all seasons into a single frame
summary = pd.concat(summ.values(), ignore_index=True)

In [3]:
# Possession data

# Columns kept from each possession workbook
posscols = ['Matchweek', 'Home_Team', 'Away_Team', 'Team', 'Home_Away', 'Touches_Touches', 'DefPen_Touches']

# Seasons for which the possession, misc and passing workbooks are read
t2 = {'1718', '1819', '1920', '2021', '2122'}

# Loop through seasons and process the files.
# t2 is a set of strings, so its iteration order can change from one interpreter
# run to the next; sorting fixes the season order and therefore the row order of
# the concatenated frame.
poss = {
    t: pd.read_excel(os.path.join(r'data\FBREF\possession', f"poss{t}.xlsx"), sheet_name=0)
    .rename(columns={'Unnamed: 0': 'index'})
    .filter(items=posscols)
    .assign(Season=t, Partido=lambda df: df['Home_Team'] + " vs " + df['Away_Team'])

    for t in sorted(t2)
}

# Concatenate all dataframes
possession = pd.concat(poss.values(), ignore_index=True)

In [4]:
# Miscellaneous data

# Columns kept from each misc workbook
misccols = ['Matchweek', 'Home_Team', 'Away_Team', 'Team', 'Home_Away', 'PKwon', 'PKcon']

# Loop through seasons and process the files.
# sorted(t2) gives a fixed season order (t2 is a set, whose iteration order is not
# stable between runs). The per-season dict gets its own name so that `misc` only
# ever refers to the concatenated DataFrame.
misc_by_season = {
    t: pd.read_excel(os.path.join(r'data\FBREF\misc', f"dfmisc{t}.xlsx"), sheet_name=0)
    .rename(columns={'Unnamed: 0': 'index'})
    .filter(items=misccols)
    .assign(Season=t, Partido=lambda df: df['Home_Team'] + " vs " + df['Away_Team'])

    for t in sorted(t2)
}

# Concatenate all dataframes
misc = pd.concat(misc_by_season.values(), ignore_index=True)

In [5]:
# Passing data

# Columns kept from each passing workbook
pascols = ['Matchweek', 'Home_Team', 'Away_Team', 'Team', 'Home_Away', 'KP', 'Final_Third', 'PPA']

# Loop through seasons and process the files.
# sorted(t2) gives a fixed season order; t2 is a set, whose iteration order is not
# stable between interpreter runs.
passc = {
    t: pd.read_excel(os.path.join(r'data\FBREF\passing', f"dfpassing{t}.xlsx"), sheet_name=0)
    .rename(columns={'Unnamed: 0': 'index'})
    .filter(items=pascols)
    .assign(Season=t, Partido=lambda df: df['Home_Team'] + " vs " + df['Away_Team'])

    for t in sorted(t2)
}

# Concatenate all dataframes
passing = pd.concat(passc.values(), ignore_index=True)

In [6]:
# Merging data
# Columns used as join keys in every merge below
mergin_vars = ['Matchweek', 'Season', 'Partido', 'Home_Team', 'Away_Team', 'Team', 'Home_Away']

# Outer-join the summary table with possession, misc and passing, in that order
DataPMatchFBREF = (
    summary
    .merge(possession, on=mergin_vars, how='outer')
    .merge(misc, on=mergin_vars, how='outer')
    .merge(passing, on=mergin_vars, how='outer')
)

In [7]:
# Define Year variable
# Season code -> integer year; the codes are listed in order and paired with 1998..2021
season_to_year = dict(zip(
    ['9899', '9900', '0001', '0102', '0203', '0304', '0405', '0506',
     '0607', '0708', '0809', '0910', '1011', '1112', '1213', '1314',
     '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122'],
    range(1998, 2022),
))

# Season codes missing from the mapping become NaN in 'Year'
DataPMatchFBREF['Year'] = DataPMatchFBREF['Season'].map(season_to_year)

In [8]:
# Preview only: reset_index() returns a new frame and the result is not assigned,
# so DataPMatchFBREF itself is left unchanged by this cell.
DataPMatchFBREF.reset_index()

Unnamed: 0,level_0,index,Matchweek,Home_Team,Away_Team,Home_Score,Away_Score,Home_Yellow_Cards,Home_Red_Cards,Away_Yellow_Cards,...,Season,Partido,Touches_Touches,DefPen_Touches,PKwon,PKcon,KP,Final_Third,PPA,Year
0,0,1.0,La Liga (Matchweek 1),Valencia,Atlético Madrid,1,0,2,0,2,...,9899,Valencia vs Atlético Madrid,,,,,,,,1998
1,1,2.0,La Liga (Matchweek 1),Valencia,Atlético Madrid,1,0,2,0,2,...,9899,Valencia vs Atlético Madrid,,,,,,,,1998
2,2,3.0,La Liga (Matchweek 1),Alavés,Real Betis,0,0,4,0,4,...,9899,Alavés vs Real Betis,,,,,,,,1998
3,3,4.0,La Liga (Matchweek 1),Alavés,Real Betis,0,0,4,0,4,...,9899,Alavés vs Real Betis,,,,,,,,1998
4,4,5.0,La Liga (Matchweek 1),Racing Santander,Barcelona,0,0,3,0,7,...,9899,Racing Santander vs Barcelona,,,,,,,,1998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18235,18235,756.0,La Liga (Matchweek 38),Barcelona,Villarreal,0,2,3,0,1,...,2122,Barcelona vs Villarreal,442.0,,0.0,0.0,3.0,13.0,4.0,2021
18236,18236,757.0,La Liga (Matchweek 38),Sevilla,Athletic Club,1,0,3,0,3,...,2122,Sevilla vs Athletic Club,819.0,,0.0,0.0,11.0,47.0,7.0,2021
18237,18237,758.0,La Liga (Matchweek 38),Sevilla,Athletic Club,1,0,3,0,3,...,2122,Sevilla vs Athletic Club,501.0,,0.0,0.0,5.0,26.0,4.0,2021
18238,18238,759.0,La Liga (Matchweek 38),Real Sociedad,Atlético Madrid,1,2,4,0,2,...,2122,Real Sociedad vs Atlético Madrid,741.0,,0.0,0.0,8.0,33.0,2.0,2021


In [9]:
# Encode Matchweek variables

# Map each label "La Liga (Matchweek N)" to the integer N, for N = 1..38
jornada_to_matchweek_id = {
    f"La Liga (Matchweek {week})": week for week in range(1, 39)
}

# Convert the 'Matchweek' label to its numeric id.
# The source column is this frame's own 'Matchweek': the previous code read
# rfef_cta['Jornada'], but no `rfef_cta` is defined in this notebook. The stray
# rename(columns=[]) call was removed as well (a list is not a valid mapper and
# its result was discarded anyway).
# Labels missing from the mapping become NaN, so running this cell a second time
# on the already-numeric column would blank it out.
DataPMatchFBREF['Matchweek'] = DataPMatchFBREF['Matchweek'].map(jornada_to_matchweek_id)

In [10]:
# Split every per-team statistic into a Home<stat> and an Away<stat> column
varsaux = ['Gls', 'PK', 'PKatt', 'Fls', 'Fld', 'Off', 'Touches_Touches', 'DefPen_Touches',
           'PKwon', 'PKcon', 'KP', 'Final_Third', 'PPA']

# Row masks for the two values compared against 'Home_Away'
home_rows = DataPMatchFBREF['Home_Away'] == 'Home'
away_rows = DataPMatchFBREF['Home_Away'] == 'Away'

for stat in varsaux:
    home_col = 'Home' + stat
    away_col = 'Away' + stat

    # Keep the statistic on rows of the matching side, 0 elsewhere
    DataPMatchFBREF[home_col] = np.where(home_rows, DataPMatchFBREF[stat], 0)
    DataPMatchFBREF[away_col] = np.where(away_rows, DataPMatchFBREF[stat], 0)

    # Move the away values up one row (the last row becomes NaN).
    # NOTE(review): presumably each Home row is immediately followed by the Away
    # row of the same match, so the value lands on the Home row - verify ordering.
    DataPMatchFBREF[away_col] = DataPMatchFBREF[away_col].shift(periods=-1)


In [11]:
# Drop the per-team statistic columns.
# The earlier call also passed index=1, which deletes the ROW labelled 1 rather
# than selecting the column axis; only columns are dropped here.
DataPMatchFBREF = DataPMatchFBREF.drop(
    columns=['CrdY', 'CrdR', 'Fls', 'Fld', 'Off', 'PKwon', 'PKcon', 'Touches_Touches',
             'DefPen_Touches', 'Gls', 'PK', 'PKatt', 'KP', 'Final_Third', 'PPA']
)

# Keep only the rows whose Home_Away value is 'Home'
DataPMatchFBREF = DataPMatchFBREF.loc[DataPMatchFBREF["Home_Away"] == 'Home']

In [13]:
# List the column labels of the merged frame
DataPMatchFBREF.columns

NameError: name 'xxx' is not defined

In [14]:
# Write the frame to an Excel file; the relative file name is resolved against
# the current working directory.
DataPMatchFBREF.to_excel('DataPMatchFBREF1.xlsx')

# MARKET VALUES

In [None]:
# Import data
# Season codes; each one is the name of a sheet in mktvalue.xlsx
mktime = ['0405', '0506', '0607', '0708', '0809', '0910', '1011', '1112', '1213',
          '1314', '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122']

# Read all season sheets in a single call: passing a list of sheet names returns a
# dict of frames keyed by sheet name, so the workbook is opened once instead of
# once per season.
mkt_sheets = pd.read_excel(os.path.join(r'data\Transfermkt', "mktvalue.xlsx"), sheet_name=mktime)

# Rename the columns and tag every sheet with its season code
mktval_by_season = {
    t: mkt_sheets[t]
    .rename(columns={'Club': 'Equipo', 'Total market value' : 'TotalmarketvalueEUR'})
    .assign(Season=t)

    for t in mktime
}

# Concatenate all dataframes
mktval = pd.concat(mktval_by_season.values(), ignore_index=True)

In [None]:
# Standardize team names
def standard_teams(df, variables=('Equipo',)):
    """Rewrite long club names to their short form, modifying ``df`` in place.

    Every (old, new) pair of the table below is applied in order as a literal
    substring replacement, so later pairs operate on the output of earlier
    ones (e.g. 'Real Valladolid CF' -> 'Valladolid CF' -> 'Valladolid').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to rewrite; it is modified in place.
    variables : iterable of str, default ('Equipo',)
        Names of the string columns to rewrite.

    Returns
    -------
    None
    """
    replacements = {
        'Albacete Balompié' : 'Albacete', 'Athletic Bilbao' : 'Athletic Club',
        'Atlético de Madrid' : 'Atlético Madrid', 'CA Osasuna' : 'Osasuna',
        'Cádiz CF' : 'Cádiz', 'CD Leganés' : 'Leganés',
        'CD Numancia' : 'Numancia', 'CD Tenerife' : 'Tenerife',
        'Celta de Vigo' : 'Celta Vigo', 'Córdoba CF' : 'Córdoba',
        'Deportivo Alavés' : 'Alavés', 'Deportivo de La Coruña' : 'La Coruña',
        'Elche CF' : 'Elche', 'FC Barcelona' : 'Barcelona',
        'Getafe CF' : 'Getafe', 'Gimnàstic de Tarragona' : 'Gimnàstic',
        'Girona FC' : 'Girona', 'Granada CF' : 'Granada',
        'Hércules CF' : 'Hércules', 'Levante UD' : 'Levante',
        'Málaga CF' : 'Málaga', 'Racing Santander' : 'Racing Sant',
        'RCD Espanyol Barcelona' : 'Espanyol', 'RCD Mallorca' : 'Mallorca',
        'Real Betis Balompié' : 'Betis', 'Real Murcia CF' : 'Real Murcia',
        'Real Oviedo' : 'Oviedo', 'Real Valladolid' : 'Valladolid',
        'Real Zaragoza' : 'Zaragoza', 'Recreativo Huelva' : 'Recreativo',
        'SD Eibar' : 'Eibar', 'SD Huesca' : 'Huesca',
        'Sevilla FC' : 'Sevilla', 'UD Almería' : 'Almería',
        'UD Las Palmas' : 'Las Palmas', 'Valencia CF' : 'Valencia',
        'Valladolid CF' : 'Valladolid', 'Villarreal CF' : 'Villarreal',
        'Xerez CD' : 'Xerez'
    }

    for column in variables:
        for old_value, new_value in replacements.items():
            # regex=False: the keys are plain names, not patterns
            df[column] = df[column].str.replace(old_value, new_value, regex=False)

# Apply the function: the 'Equipo' column of mktval is rewritten in place
# (standard_teams has no return statement, so there is nothing to assign)
standard_teams(mktval)

In [None]:
# Bind a second name to the same DataFrame object (plain assignment, no copy is made)
mktvalueTot = mktval
# Display the frame
mktvalueTot

# MARKET VALUES AND WAGES

In [None]:
# Season codes; each one is the name of a sheet in "WAGES LaLiga.xlsx"
twages = ['1314', '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122']

# Read all season sheets in a single call: passing a list of sheet names returns a
# dict of frames keyed by sheet name, so the workbook is opened once instead of
# once per season.
wage_sheets = pd.read_excel(os.path.join(r'data\FBREF', "WAGES LaLiga.xlsx"), sheet_name=twages)

# Rename the club column and tag every sheet with its season code
wages_by_season = {
    t: wage_sheets[t]
    .rename(columns={'Squad': 'Equipo'})
    .assign(Season=t)

    for t in twages
}

# Concatenate all dataframes
wages = pd.concat(wages_by_season.values(), ignore_index=True)

In [None]:
# Display the concatenated wages frame
wages

In [None]:
# Adjust for terms and symbols
def standard_teams(df, variables=('Equipo',)):
    """Repair mis-decoded accented letters and shorten two club names, in place.

    Each (old, new) pair is applied in order as a literal substring
    replacement. This definition reuses the name of the ``standard_teams``
    defined earlier in the notebook and replaces it from this cell onwards.

    NOTE(review): the source keys for 'à' and 'í' depend on characters that do
    not render ('Ã' followed by a no-break space / soft hyphen). They are
    written with explicit escapes here; the ordinary-space form of the 'à' key
    is kept as well. Confirm against the actual workbook contents.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to rewrite; it is modified in place.
    variables : iterable of str, default ('Equipo',)
        Names of the string columns to rewrite.

    Returns
    -------
    None
    """
    replacements = {
        'Ã¡' : 'á',
        'Ã\u00a0' : 'à',  # 'Ã' + no-break space
        'Ã ' : 'à',       # 'Ã' + ordinary space
        'Ã©' : 'é',
        'Ã\u00ad' : 'í',  # 'Ã' + soft hyphen
        'Ã³' : 'ó', 'Ã±' : 'ñ', 'Girona FC' : 'Girona',
        'SD Huesca' : 'Huesca',
    }

    for column in variables:
        for old_value, new_value in replacements.items():
            # regex=False: the keys are literal text, not patterns
            df[column] = df[column].str.replace(old_value, new_value, regex=False)

            
def standard_money(df, variables=('Weekly Wages EUR', 'Weekly Wages GBP',
                                  'Annual Wages EUR', 'Annual Wages GBP')):
    """Strip separators and currency markers from the wage columns, in place.

    Each key of the table below is removed as a literal substring. The values
    stay strings; no numeric conversion is done here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to clean; it is modified in place.
    variables : iterable of str, default the four wage columns
        Names of the string columns to clean.

    Returns
    -------
    None
    """
    replacements = {
        ', ' : '', 'â‚¬ ' : '', '£' : '',
    }

    for column in variables:
        for old_value, new_value in replacements.items():
            # regex=False: the keys are literal text, not patterns
            df[column] = df[column].str.replace(old_value, new_value, regex=False)
            
            

# Apply both clean-ups to wages; each function rewrites the frame's columns in place
standard_teams(wages)
standard_money(wages)

In [None]:
# Display wages again to inspect the frame at this point in the notebook
wages

In [None]:
# Time to merge mkt values with wages

In [None]:
# Define Year variable

In [None]:
# Drop variables

In [None]:
# Standardize the names of teams
# La Coruña -> Deportivo La Coruña
# Racing Sant -> Racing Santander

In [15]:
# Define home and away variables

In [None]:
# Display data, named as -> mktvalueTot

# Merge Mkt value and wages data with team stats

In [None]:
# Bring mktvalueTot and merge with SQVS
# Define variables

In [None]:
# Adjust Variables