In [1]:
# Import libraries
import pandas as pd
import os
import numpy as np
import warnings
# Installs a blanket "ignore" filter: every warning category (including pandas
# deprecation warnings) is silenced for the rest of the session.
warnings.filterwarnings("ignore")

# Set working directory
# NOTE(review): absolute, machine-specific Windows path. The loading cells in
# this notebook use relative `data\...` paths, so they resolve against this
# directory; the notebook will not run elsewhere without editing this line.
os.chdir('C:\\Users\\aaron\\Desktop\\GitHub\\Thesis\\Python')

In [2]:
# Season codes; each one selects the workbook "sum<code>.xlsx"
time = [
    '9899', '9900', '0001', '0102', '0203', '0304',
    '0405', '0506', '0607', '0708', '0809', '0910',
    '1011', '1112', '1213', '1314', '1415', '1516',
    '1617', '1718', '1819', '1920', '2021', '2122',
]

# Columns kept for the seasons listed in `new_seasons`
new_season_cols = [
    'index', 'Matchweek', 'Home_Team', 'Away_Team', 'Home_Score', 'Away_Score',
    'Home_Yellow_Cards', 'Home_Red_Cards', 'Away_Yellow_Cards', 'Away_Red_Cards',
    'Team', 'Home_Away', 'Gls', 'PK', 'PKatt', 'CrdY', 'CrdR',
]

# Every other season additionally keeps 'Fls', 'Fld' and 'Off'
old_season_cols = [*new_season_cols, 'Fls', 'Fld', 'Off']

# Seasons that use the shorter column list
new_seasons = {'1718', '1819', '1920', '2021', '2122'}


def _read_summary_season(season):
    """Read the first sheet of one season's summary workbook.

    The unnamed first column is renamed to 'index', the frame is reduced to
    the column list configured for that season (``filter(items=...)`` skips
    labels the sheet does not have), and 'Season' plus 'Partido'
    ("<Home_Team> vs <Away_Team>") are appended.
    """
    wanted = old_season_cols
    if season in new_seasons:
        wanted = new_season_cols

    workbook = os.path.join(r'data\FBREF\summary', f"sum{season}.xlsx")
    frame = pd.read_excel(workbook, sheet_name=0)
    frame = frame.rename(columns={'Unnamed: 0': 'index'})
    frame = frame.filter(items=wanted)
    frame = frame.assign(Season=season)
    return frame.assign(Partido=frame['Home_Team'] + " vs " + frame['Away_Team'])


# One frame per season, keyed by season code
summ = {season: _read_summary_season(season) for season in time}

# Stack all seasons into a single frame with a fresh 0..n-1 index
summary = pd.concat(summ.values(), ignore_index=True)

In [3]:
# Columns kept from each season's possession workbook
posscols = ['Matchweek', 'Home_Team', 'Away_Team', 'Team', 'Home_Away', 'Touches_Touches', 'DefPen_Touches']

# Seasons for which the possession / misc / passing workbooks are read.
# Kept as a set because other cells reuse this name.
t2 = {'1718', '1819', '1920', '2021', '2122'}

# Read one frame per season.  The set is iterated through sorted(): set order
# for strings changes between interpreter sessions (hash randomisation), which
# would otherwise make the row order of `possession` differ from run to run.
poss = {
    t: pd.read_excel(os.path.join(r'data\FBREF\possession', f"poss{t}.xlsx"), sheet_name=0)
    .rename(columns={'Unnamed: 0': 'index'})
    .filter(items=posscols)  # 'index' is not in posscols, so it is dropped here
    .assign(Season=t, Partido=lambda df: df['Home_Team'] + " vs " + df['Away_Team'])

    for t in sorted(t2)
}

# Concatenate all seasons into one frame with a fresh 0..n-1 index
possession = pd.concat(poss.values(), ignore_index=True)

In [4]:
# Columns kept from each season's misc workbook
misccols = ['Matchweek', 'Home_Team', 'Away_Team', 'Team', 'Home_Away', 'PKwon', 'PKcon']

# Read one frame per season.  `t2` is the season set defined in the possession
# cell; it is iterated through sorted() because set order for strings changes
# between interpreter sessions, which would make the row order of `misc`
# differ from run to run.  The per-season dict gets its own name so that
# `misc` only ever refers to the final DataFrame.
misc_by_season = {
    t: pd.read_excel(os.path.join(r'data\FBREF\misc', f"dfmisc{t}.xlsx"), sheet_name=0)
    .rename(columns={'Unnamed: 0': 'index'})
    .filter(items=misccols)
    .assign(Season=t, Partido=lambda df: df['Home_Team'] + " vs " + df['Away_Team'])

    for t in sorted(t2)
}

# Concatenate all seasons into one frame with a fresh 0..n-1 index
misc = pd.concat(misc_by_season.values(), ignore_index=True)

In [5]:
# Columns kept from each season's passing workbook
pascols = ['Matchweek', 'Home_Team', 'Away_Team', 'Team', 'Home_Away', 'KP', 'Final_Third', 'PPA']

# Read one frame per season.  `t2` is the season set defined in the possession
# cell; it is iterated through sorted() because set order for strings changes
# between interpreter sessions, which would make the row order of `passing`
# differ from run to run.
passc = {
    t: pd.read_excel(os.path.join(r'data\FBREF\passing', f"dfpassing{t}.xlsx"), sheet_name=0)
    .rename(columns={'Unnamed: 0': 'index'})
    .filter(items=pascols)
    .assign(Season=t, Partido=lambda df: df['Home_Team'] + " vs " + df['Away_Team'])

    for t in sorted(t2)
}

# Concatenate all seasons into one frame with a fresh 0..n-1 index
passing = pd.concat(passc.values(), ignore_index=True)

In [6]:
# Columns shared by all four tables; together they form the join key
mergin_vars = ['Matchweek', 'Season', 'Partido', 'Home_Team', 'Away_Team', 'Team', 'Home_Away']

# Full outer joins: rows present in any of the tables are kept, and columns a
# table cannot supply for a row are left as NaN.
DataPMatchFBREF = (
    summary
    .merge(possession, on=mergin_vars, how='outer')
    .merge(misc, on=mergin_vars, how='outer')
    .merge(passing, on=mergin_vars, how='outer')
)

In [7]:
# Season code -> calendar year: the codes below are paired, in order, with the
# consecutive years 1998 .. 2021 ('9899' -> 1998, ..., '2122' -> 2021).
season_to_year = dict(zip(
    [
        '9899', '9900', '0001', '0102', '0203', '0304',
        '0405', '0506', '0607', '0708', '0809', '0910',
        '1011', '1112', '1213', '1314', '1415', '1516',
        '1617', '1718', '1819', '1920', '2021', '2122',
    ],
    range(1998, 2022),
))

# Season codes missing from the mapping become NaN in 'Year'
DataPMatchFBREF['Year'] = DataPMatchFBREF['Season'].map(season_to_year)


In [8]:
#DataPMatchFBREF.info()

In [9]:
DataPMatchFBREF.keys()

Index(['index', 'Matchweek', 'Home_Team', 'Away_Team', 'Home_Score',
       'Away_Score', 'Home_Yellow_Cards', 'Home_Red_Cards',
       'Away_Yellow_Cards', 'Away_Red_Cards', 'Team', 'Home_Away', 'Gls', 'PK',
       'PKatt', 'CrdY', 'CrdR', 'Fls', 'Fld', 'Off', 'Season', 'Partido',
       'Touches_Touches', 'DefPen_Touches', 'PKwon', 'PKcon', 'KP',
       'Final_Third', 'PPA', 'Year'],
      dtype='object')

In [10]:
# Per-team statistics to spread into Home<stat> / Away<stat> columns
varsaux = ['Gls', 'PK', 'PKatt', 'Fls', 'Fld', 'Off', 'Touches_Touches', 'DefPen_Touches',
           'PKwon', 'PKcon', 'KP', 'Final_Third', 'PPA']

# Columns assumed to identify one match (Partido is "<Home_Team> vs <Away_Team>")
match_keys = ['Season', 'Matchweek', 'Partido']
is_home = DataPMatchFBREF['Home_Away'] == 'Home'

# The away side's statistics are looked up by match key instead of being taken
# from "the next row": the outer merges that build this frame sort on the join
# keys (which include 'Team'), so an away row is not guaranteed to sit
# directly below its home row.
away_stats = (
    DataPMatchFBREF.loc[DataPMatchFBREF['Home_Away'] == 'Away', match_keys + varsaux]
    .drop_duplicates(subset=match_keys)  # keep the lookup one-row-per-match
)
# A left merge against a unique right side returns one row per input row, in
# the input order, so the result lines up positionally with DataPMatchFBREF.
opponent = DataPMatchFBREF[match_keys].merge(away_stats, on=match_keys, how='left')

for vary in varsaux:
    # Home rows carry their own value; every other row gets 0, as before.
    DataPMatchFBREF['Home' + vary] = np.where(is_home, DataPMatchFBREF[vary], 0)
    # Home rows carry the away side's value for the same match (NaN when that
    # match has no away row); every other row gets 0.
    DataPMatchFBREF['Away' + vary] = np.where(is_home, opponent[vary].to_numpy(), 0)


In [11]:
# Per-team columns that are not kept in the one-row-per-match table
per_team_cols = ['CrdY', 'CrdR', 'Fls', 'Fld', 'Off', 'PKwon', 'PKcon', 'Touches_Touches',
                 'DefPen_Touches', 'Gls', 'PK', 'PKatt', 'KP', 'Final_Third', 'PPA']

# Only columns are dropped.  The previous call also passed `index = 1`, which
# made `drop` delete the row labelled 1 in addition to the columns.
DataPMatchFBREF = DataPMatchFBREF.drop(columns=per_team_cols)

# Keep a single row per match: the one recorded from the home side
DataPMatchFBREF = DataPMatchFBREF.loc[DataPMatchFBREF["Home_Away"] == 'Home']

In [12]:
DataPMatchFBREF.keys()

Index(['index', 'Matchweek', 'Home_Team', 'Away_Team', 'Home_Score',
       'Away_Score', 'Home_Yellow_Cards', 'Home_Red_Cards',
       'Away_Yellow_Cards', 'Away_Red_Cards', 'Team', 'Home_Away', 'Season',
       'Partido', 'Year', 'HomeGls', 'AwayGls', 'HomePK', 'AwayPK',
       'HomePKatt', 'AwayPKatt', 'HomeFls', 'AwayFls', 'HomeFld', 'AwayFld',
       'HomeOff', 'AwayOff', 'HomeTouches_Touches', 'AwayTouches_Touches',
       'HomeDefPen_Touches', 'AwayDefPen_Touches', 'HomePKwon', 'AwayPKwon',
       'HomePKcon', 'AwayPKcon', 'HomeKP', 'AwayKP', 'HomeFinal_Third',
       'AwayFinal_Third', 'HomePPA', 'AwayPPA'],
      dtype='object')

In [13]:
# Write the match-level table to the current working directory.  The frame's
# index is written as the first column (to_excel default).
DataPMatchFBREF.to_excel('DataPMatchFBREF.xlsx')

# MARKET VALUES

In [14]:
# Season codes; each one is a sheet name in mktvalue.xlsx
mktime = ['0405', '0506', '0607', '0708', '0809', '0910', '1011', '1112', '1213',
          '1314', '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122']

# Open the workbook once: passing a list of sheet names makes read_excel
# return a {sheet name: DataFrame} dict instead of re-reading the file for
# every season.
mktval_sheets = pd.read_excel(os.path.join(r'data\Transfermkt', "mktvalue.xlsx"), sheet_name=mktime)

# Harmonise column names and tag each sheet with its season.  The per-season
# dict has its own name so `mktval` only ever refers to the final DataFrame.
mktval_by_season = {
    t: mktval_sheets[t]
    .rename(columns={'Club': 'Equipo', 'Total market value' : 'TotalmarketvalueEUR'})
    .assign(Season=t)

    for t in mktime
}

# Concatenate all seasons (in `mktime` order) with a fresh 0..n-1 index
mktval = pd.concat(mktval_by_season.values(), ignore_index=True)

In [15]:
def standard_teams(df):
    """Shorten club names in ``df['Equipo']`` in place.

    Each long name in the table below is substituted by its short form
    wherever it occurs inside a value (substring match, applied in table
    order, so an earlier substitution can feed a later one, e.g.
    'Real Valladolid CF' -> 'Valladolid CF' -> 'Valladolid').

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named 'Equipo'; it is overwritten.

    Returns
    -------
    None
    """
    short_names = {
        'Albacete Balompié': 'Albacete',
        'Athletic Bilbao': 'Athletic Club',
        'Atlético de Madrid': 'Atlético Madrid',
        'CA Osasuna': 'Osasuna',
        'Cádiz CF': 'Cádiz',
        'CD Leganés': 'Leganés',
        'CD Numancia': 'Numancia',
        'CD Tenerife': 'Tenerife',
        'Celta de Vigo': 'Celta Vigo',
        'Córdoba CF': 'Córdoba',
        'Deportivo Alavés': 'Alavés',
        'Deportivo de La Coruña': 'La Coruña',
        'Elche CF': 'Elche',
        'FC Barcelona': 'Barcelona',
        'Getafe CF': 'Getafe',
        'Gimnàstic de Tarragona': 'Gimnàstic',
        'Girona FC': 'Girona',
        'Granada CF': 'Granada',
        'Hércules CF': 'Hércules',
        'Levante UD': 'Levante',
        'Málaga CF': 'Málaga',
        'Racing Santander': 'Racing Sant',
        'RCD Espanyol Barcelona': 'Espanyol',
        'RCD Mallorca': 'Mallorca',
        'Real Betis Balompié': 'Betis',
        'Real Murcia CF': 'Real Murcia',
        'Real Oviedo': 'Oviedo',
        'Real Valladolid': 'Valladolid',
        'Real Zaragoza': 'Zaragoza',
        'Recreativo Huelva': 'Recreativo',
        'SD Eibar': 'Eibar',
        'SD Huesca': 'Huesca',
        'Sevilla FC': 'Sevilla',
        'UD Almería': 'Almería',
        'UD Las Palmas': 'Las Palmas',
        'Valencia CF': 'Valencia',
        'Valladolid CF': 'Valladolid',
        'Villarreal CF': 'Villarreal',
        'Xerez CD': 'Xerez',
    }

    for column in ('Equipo',):
        cleaned = df[column]
        # Apply the substitutions one after another, in table order
        for long_name, short_name in short_names.items():
            cleaned = cleaned.str.replace(long_name, short_name, regex = True)
        df[column] = cleaned

# Apply the function: rewrites mktval['Equipo'] in place; the call itself
# returns None.
standard_teams(mktval)

In [16]:
mktval

Unnamed: 0,Equipo,Squad,ø age,Foreigners,ø market value,TotalmarketvalueEUR,Season
0,Real Madrid,34,25.5,10,€8.93m,€303.70m,0405
1,Barcelona,35,24.4,15,€7.80m,€273.03m,0405
2,Valencia,32,26.1,15,€6.12m,€195.90m,0405
3,La Coruña,29,27.7,8,€4.98m,€144.45m,0405
4,Atlético Madrid,36,25.8,11,€3.49m,€125.65m,0405
...,...,...,...,...,...,...,...
355,Mallorca,41,25.7,15,€1.81m,€74.15m,2122
356,Elche,37,27.4,13,€1.97m,€72.98m,2122
357,Cádiz,37,27.3,14,€1.83m,€67.80m,2122
358,Rayo Vallecano,36,26.2,14,€1.87m,€67.20m,2122


# MARKET VALUES AND WAGES

In [17]:
# Season codes; each one is a sheet name in "WAGES LaLiga.xlsx"
twages = ['1314', '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122']

# Open the workbook once: passing a list of sheet names makes read_excel
# return a {sheet name: DataFrame} dict instead of re-reading the file for
# every season.
wage_sheets = pd.read_excel(os.path.join(r'data\FBREF', "WAGES LaLiga.xlsx"), sheet_name=twages)

# Rename the club column and tag each sheet with its season.  The per-season
# dict has its own name so `wages` only ever refers to the final DataFrame.
wages_by_season = {
    t: wage_sheets[t]
    .rename(columns={'Squad': 'Equipo'})
    .assign(Season=t)

    for t in twages
}

# Concatenate all seasons (in `twages` order) with a fresh 0..n-1 index
wages = pd.concat(wages_by_season.values(), ignore_index=True)

In [18]:
wages

Unnamed: 0,Rk,Equipo,# Pl,Weekly Wages EUR,Weekly Wages GBP,Weekly Wages USD,Annual Wages EUR,Annual Wages GBP,Annual Wages USD,% Estimated,Season
0,1,Barcelona,33.0,"â‚¬ 3,427,981","£2,874,391",3493495.0,"â‚¬ 178,255,000","£149,468,309",181661754.0,1.00,1314
1,2,Real Madrid,34.0,"â‚¬ 2,609,769","£2,188,313",2659646.0,"â‚¬ 135,708,000","£113,792,296",138301612.0,1.00,1314
2,3,AtlÃ©tico Madrid,41.0,"â‚¬ 936,096","£784,925",953987.0,"â‚¬ 48,677,000","£40,816,075",49607301.0,1.00,1314
3,4,Valencia,42.0,"â‚¬ 597,654","£501,138",609076.0,"â‚¬ 31,078,000","£26,059,168",31671952.0,1.00,1314
4,5,Sevilla,43.0,"â‚¬ 484,577","£406,322",493838.0,"â‚¬ 25,198,000","£21,128,735",25679578.0,1.00,1314
...,...,...,...,...,...,...,...,...,...,...,...
175,16,CÃ¡diz,89.0,"â‚¬ 383,654","£321,697",390986.0,"â‚¬ 19,950,000","£16,728,242",20331279.0,0.97,2122
176,17,Levante,83.0,"â‚¬ 375,865","£315,166",383049.0,"â‚¬ 19,545,000","£16,388,647",19918540.0,1.00,2122
177,18,Elche,73.0,"â‚¬ 371,442","£311,458",378541.0,"â‚¬ 19,315,000","£16,195,792",19684141.0,0.93,2122
178,19,Osasuna,78.0,"â‚¬ 325,000","£272,515",331211.0,"â‚¬ 16,900,000","£14,170,790",17222987.0,1.00,2122


In [19]:
def standard_teams(df):
    """Repair mis-decoded accents and trim club suffixes in ``df['Equipo']``.

    The wages sheets contain UTF-8 text that was decoded as Windows-1252
    (e.g. "AtlÃ©tico"); each two-character artefact is mapped back to the
    accented letter it stands for.  The keys are written as escapes because
    two of them end in an invisible character (no-break space, soft hyphen)
    that is easily lost when the cell is copied or re-saved.  Matching is
    literal (``regex=False``): none of the keys is meant as a pattern.

    Note: this definition replaces the earlier ``standard_teams`` (the one
    used for the market values) for every cell executed after it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named 'Equipo'; it is overwritten.

    Returns
    -------
    None
    """
    replacements = {
        '\u00c3\u00a1': '\u00e1',   # a-acute
        '\u00c3\u00a0': '\u00e0',   # a-grave (second char is a no-break space)
        '\u00c3 ': '\u00e0',        # a-grave, no-break space flattened to a plain space
        '\u00c3\u00a9': '\u00e9',   # e-acute
        '\u00c3\u00ad': '\u00ed',   # i-acute (second char is a soft hyphen)
        '\u00c3\u00b3': '\u00f3',   # o-acute
        '\u00c3\u00b1': '\u00f1',   # n-tilde
        'Girona FC': 'Girona',
        'SD Huesca': 'Huesca',
    }

    variables = ['Equipo']
    for column in variables:
        for old_value, new_value in replacements.items():
            df[column] = df[column].str.replace(old_value, new_value, regex=False)

            
def standard_money(df):
    """Strip currency markers and thousands separators from the wage columns.

    Works in place on the four EUR/GBP wage columns; the cleaned values stay
    strings (e.g. "3427981").  Matching is literal (``regex=False``).

    The bare ',' entry is what removes the thousands separators of values such
    as "3,427,981": the ', ' entry alone only matches a comma that is followed
    by a space.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string columns 'Weekly Wages EUR',
        'Weekly Wages GBP', 'Annual Wages EUR' and 'Annual Wages GBP'.

    Returns
    -------
    None
    """
    replacements = {
        ', ': '',
        ',': '',                          # thousands separators
        '\u00e2\u201a\u00ac ': '',        # mis-decoded euro sign + plain space
        '\u00e2\u201a\u00ac\u00a0': '',   # mis-decoded euro sign + no-break space
        '\u00a3': '',                     # pound sign
    }

    variables = ['Weekly Wages EUR', 'Weekly Wages GBP', 'Annual Wages EUR', 'Annual Wages GBP']
    for column in variables:
        for old_value, new_value in replacements.items():
            df[column] = df[column].str.replace(old_value, new_value, regex=False)
            
            

# Apply the functions: both rewrite columns of `wages` in place and return
# None (club names in 'Equipo', then the EUR/GBP wage columns).
standard_teams(wages)
standard_money(wages)

In [21]:
wages

Unnamed: 0,Rk,Equipo,# Pl,Weekly Wages EUR,Weekly Wages GBP,Weekly Wages USD,Annual Wages EUR,Annual Wages GBP,Annual Wages USD,% Estimated,Season
0,1,Barcelona,33.0,3427981,2874391,3493495.0,178255000,149468309,181661754.0,1.00,1314
1,2,Real Madrid,34.0,2609769,2188313,2659646.0,135708000,113792296,138301612.0,1.00,1314
2,3,Atlético Madrid,41.0,936096,784925,953987.0,48677000,40816075,49607301.0,1.00,1314
3,4,Valencia,42.0,597654,501138,609076.0,31078000,26059168,31671952.0,1.00,1314
4,5,Sevilla,43.0,484577,406322,493838.0,25198000,21128735,25679578.0,1.00,1314
...,...,...,...,...,...,...,...,...,...,...,...
175,16,Cádiz,89.0,383654,321697,390986.0,19950000,16728242,20331279.0,0.97,2122
176,17,Levante,83.0,375865,315166,383049.0,19545000,16388647,19918540.0,1.00,2122
177,18,Elche,73.0,371442,311458,378541.0,19315000,16195792,19684141.0,0.93,2122
178,19,Osasuna,78.0,325000,272515,331211.0,16900000,14170790,17222987.0,1.00,2122


In [None]:
# notnice

In [None]:
# TODO: merge the market values with the wages

In [20]:
def standard_teams(df):
    """Repair mis-decoded accents and trim club suffixes in four text columns.

    Works in place on 'Home_Team', 'Away_Team', 'PenaltyTaker' and
    'PenaltyConc'.  Each two-character artefact left by decoding UTF-8 text as
    Windows-1252 (e.g. "AtlÃ©tico") is mapped back to the accented letter it
    stands for.  The keys are written as escapes because two of them end in an
    invisible character (no-break space, soft hyphen) that is easily lost when
    the cell is copied or re-saved.  Matching is literal (``regex=False``).

    Note: this definition replaces any earlier ``standard_teams`` for every
    cell executed after it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four string columns named above; they are
        overwritten.

    Returns
    -------
    None
    """
    replacements = {
        '\u00c3\u00a1': '\u00e1',   # a-acute
        '\u00c3\u00a0': '\u00e0',   # a-grave (second char is a no-break space)
        '\u00c3 ': '\u00e0',        # a-grave, no-break space flattened to a plain space
        '\u00c3\u00a9': '\u00e9',   # e-acute
        '\u00c3\u00ad': '\u00ed',   # i-acute (second char is a soft hyphen)
        '\u00c3\u00b3': '\u00f3',   # o-acute
        '\u00c3\u00b1': '\u00f1',   # n-tilde
        'Girona FC': 'Girona',
        'SD Huesca': 'Huesca',
    }

    variables = ['Home_Team', 'Away_Team', 'PenaltyTaker', 'PenaltyConc']
    for column in variables:
        for old_value, new_value in replacements.items():
            df[column] = df[column].str.replace(old_value, new_value, regex=False)

# Apply the function
# NOTE(review): `missed` is not defined in any visible cell of this notebook;
# the recorded output of this cell is "NameError: name 'missed' is not
# defined".  The frame has to be loaded before this call can run.
standard_teams(missed)


NameError: name 'missed' is not defined

In [None]:
def standard_teams(df):
    """Delete three literal tokens from four text columns, in place.

    The tokens ', ', 'â‚¬ ' and '£' are removed wherever they occur in
    'Home_Team', 'Away_Team', 'PenaltyTaker' and 'PenaltyConc'.  Returns None.

    NOTE(review): the token table is the same one ``standard_money`` uses for
    the wage columns, while the name and target columns are about teams and
    players; this looks like an unfinished copy of another cell.  Confirm the
    intent before relying on it.  The definition also replaces any earlier
    ``standard_teams`` once the cell is run.
    """
    strip_tokens = {', ': '', 'â‚¬ ': '', '£': ''}
    targets = ('Home_Team', 'Away_Team', 'PenaltyTaker', 'PenaltyConc')

    for column in targets:
        cleaned = df[column]
        # Remove the tokens one after another, in table order
        for token, substitute in strip_tokens.items():
            cleaned = cleaned.str.replace(token, substitute, regex = True)
        df[column] = cleaned

# Apply the function
# NOTE(review): `missed` is not defined in any visible cell of this notebook
# (an earlier cell that makes the same call recorded a NameError); this cell
# has no execution count and cannot run until the frame is loaded.
standard_teams(missed)


In [None]:
def standard_teams(df):
    """Placeholder cleaner: the replacement table is empty, so nothing changes.

    With no entries to apply, the loop body never runs and ``df`` is not even
    read; the function returns None.  Entries added to ``replacements`` would
    be applied, in order, to 'Home_Team', 'Away_Team', 'PenaltyTaker' and
    'PenaltyConc' as regex substitutions.

    NOTE(review): this definition replaces any earlier ``standard_teams`` once
    the cell is run.
    """
    replacements = {}
    targets = ('Home_Team', 'Away_Team', 'PenaltyTaker', 'PenaltyConc')

    # Expand to (column, old, new) jobs first so an empty table touches nothing
    jobs = [
        (column, old_value, new_value)
        for column in targets
        for old_value, new_value in replacements.items()
    ]
    for column, old_value, new_value in jobs:
        df[column] = df[column].str.replace(old_value, new_value, regex = True)

# Apply the function
# NOTE(review): `missed` is not defined in any visible cell of this notebook
# (an earlier cell that makes the same call recorded a NameError); this cell
# has no execution count and cannot run until the frame is loaded.
standard_teams(missed)
