In [1]:
# Import libraries
import pandas as pd
import os
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Set working directory
# NOTE(review): this is an absolute, machine-specific Windows path. Every relative path
# used below (e.g. 'Datasets/...') is resolved against this directory, so the notebook
# only runs unchanged on a machine where this folder exists.
os.chdir('C:\\Users\\aaron\\Desktop\\GitHub\\Thesis\\Python')

In [2]:
# Summary data
# Season codes; each code selects one workbook named "sum<code>.xlsx".
time = ['9899', '9900', '0001', '0102', '0203', '0304', '0405', '0506',
        '0607', '0708', '0809', '0910', '1011', '1112', '1213', '1314',
        '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122']

# Columns kept for the seasons listed in `new_seasons`
new_season_cols = [
    'index', 'Matchweek', 'Home_Team', 'Away_Team', 'Home_Score', 'Away_Score',
    'Home_Yellow_Cards', 'Home_Red_Cards', 'Away_Yellow_Cards', 'Away_Red_Cards',
    'Team', 'Home_Away', 'Gls', 'PK', 'PKatt', 'CrdY', 'CrdR',
]

# Every other season keeps three extra columns on top of the shared ones
old_season_cols = [*new_season_cols, 'Fls', 'Fld', 'Off']

# Seasons that use the shorter column list
new_seasons = {'1718', '1819', '1920', '2021', '2122'}

# Read the first sheet of each season workbook, keep the wanted columns and tag
# every row with its season code and a "<home> vs <away>" match label.
summ = {}
for season_code in time:
    wanted = old_season_cols
    if season_code in new_seasons:
        wanted = new_season_cols
    raw = pd.read_excel(os.path.join(r'data\FBREF\summary', f"sum{season_code}.xlsx"), sheet_name=0)
    trimmed = raw.rename(columns={'Unnamed: 0': 'index'}).filter(items=wanted)
    summ[season_code] = trimmed.assign(
        Season=season_code,
        Partido=trimmed['Home_Team'] + " vs " + trimmed['Away_Team'],
    )

# Stack all seasons into a single frame with a fresh 0..n-1 index
summary = pd.concat(list(summ.values()), ignore_index=True)

In [3]:
# Possession data

# Columns kept from every possession workbook
posscols = ['Matchweek', 'Home_Team', 'Away_Team', 'Team', 'Home_Away', 'Touches_Touches', 'DefPen_Touches']

# Seasons for which the possession / misc / passing workbooks are read.
# Kept as an ordered list rather than a set: the iteration order of a set of strings
# changes between interpreter sessions, which made the row order of the concatenated
# frames differ from one run to the next.
t2 = ['1718', '1819', '1920', '2021', '2122']

# Read the first sheet of each season workbook, keep the wanted columns and tag
# every row with its season code and a "<home> vs <away>" match label.
poss = {
    t: pd.read_excel(os.path.join(r'data\FBREF\possession', f"poss{t}.xlsx"), sheet_name=0)
    .rename(columns={'Unnamed: 0': 'index'})
    .filter(items=posscols)
    .assign(Season=t, Partido=lambda df: df['Home_Team'] + " vs " + df['Away_Team'])

    for t in t2
}

# Concatenate all dataframes
possession = pd.concat(poss.values(), ignore_index=True)

In [4]:
# Miscellaneous data

# Columns kept from every misc workbook
misccols = ['Matchweek', 'Home_Team', 'Away_Team', 'Team', 'Home_Away', 'PKwon', 'PKcon']

# Read one workbook per season in `t2`, trim it and tag it with season and match label
misc_frames = []
for season_code in t2:
    workbook = os.path.join(r'data\FBREF\misc', f"dfmisc{season_code}.xlsx")
    trimmed = (
        pd.read_excel(workbook, sheet_name=0)
        .rename(columns={'Unnamed: 0': 'index'})
        .filter(items=misccols)
    )
    misc_frames.append(
        trimmed.assign(
            Season=season_code,
            Partido=trimmed['Home_Team'] + " vs " + trimmed['Away_Team'],
        )
    )

# Stack all seasons into a single frame with a fresh index
misc = pd.concat(misc_frames, ignore_index=True)

In [5]:
# Passing data

# Columns kept from every passing workbook
pascols = ['Matchweek', 'Home_Team', 'Away_Team', 'Team', 'Home_Away', 'KP', 'Final_Third', 'PPA']

# Read one workbook per season in `t2`, trim it and tag it with season and match label
passc = {}
for season_code in t2:
    workbook = os.path.join(r'data\FBREF\passing', f"dfpassing{season_code}.xlsx")
    trimmed = (
        pd.read_excel(workbook, sheet_name=0)
        .rename(columns={'Unnamed: 0': 'index'})
        .filter(items=pascols)
    )
    passc[season_code] = trimmed.assign(
        Season=season_code,
        Partido=trimmed['Home_Team'] + " vs " + trimmed['Away_Team'],
    )

# Stack all seasons into a single frame with a fresh index
passing = pd.concat(list(passc.values()), ignore_index=True)

In [6]:
# Merging data
# Key columns shared by the summary, possession, misc and passing tables
mergin_vars = ['Matchweek', 'Season', 'Partido', 'Home_Team', 'Away_Team', 'Team', 'Home_Away']

# Outer-join the three extra tables onto the summary table one after another,
# so rows that exist in only one table are kept.
DataPMatchFBREF = summary
for extra_table in (possession, misc, passing):
    DataPMatchFBREF = DataPMatchFBREF.merge(extra_table, on=mergin_vars, how='outer')

In [7]:
# Define Year variable
# Season codes in chronological order, paired with consecutive years 1998..2021
# (the year assigned to a code is the one in which that season starts).
_season_codes = [
    '9899', '9900', '0001', '0102', '0203', '0304', '0405', '0506',
    '0607', '0708', '0809', '0910', '1011', '1112', '1213', '1314',
    '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122',
]
season_to_year = dict(zip(_season_codes, range(1998, 2022)))

# Season codes missing from the mapping become NaN
DataPMatchFBREF['Year'] = DataPMatchFBREF['Season'].map(season_to_year)

In [8]:
# Preview only: reset_index() returns a new frame, which is displayed but not assigned,
# so DataPMatchFBREF itself is left unchanged by this cell.
DataPMatchFBREF.reset_index()

Unnamed: 0,level_0,index,Matchweek,Home_Team,Away_Team,Home_Score,Away_Score,Home_Yellow_Cards,Home_Red_Cards,Away_Yellow_Cards,...,Season,Partido,Touches_Touches,DefPen_Touches,PKwon,PKcon,KP,Final_Third,PPA,Year
0,0,1.0,La Liga (Matchweek 1),Valencia,Atlético Madrid,1,0,2,0,2,...,9899,Valencia vs Atlético Madrid,,,,,,,,1998
1,1,2.0,La Liga (Matchweek 1),Valencia,Atlético Madrid,1,0,2,0,2,...,9899,Valencia vs Atlético Madrid,,,,,,,,1998
2,2,3.0,La Liga (Matchweek 1),Alavés,Real Betis,0,0,4,0,4,...,9899,Alavés vs Real Betis,,,,,,,,1998
3,3,4.0,La Liga (Matchweek 1),Alavés,Real Betis,0,0,4,0,4,...,9899,Alavés vs Real Betis,,,,,,,,1998
4,4,5.0,La Liga (Matchweek 1),Racing Santander,Barcelona,0,0,3,0,7,...,9899,Racing Santander vs Barcelona,,,,,,,,1998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18235,18235,756.0,La Liga (Matchweek 38),Barcelona,Villarreal,0,2,3,0,1,...,2122,Barcelona vs Villarreal,442.0,,0.0,0.0,3.0,13.0,4.0,2021
18236,18236,757.0,La Liga (Matchweek 38),Sevilla,Athletic Club,1,0,3,0,3,...,2122,Sevilla vs Athletic Club,819.0,,0.0,0.0,11.0,47.0,7.0,2021
18237,18237,758.0,La Liga (Matchweek 38),Sevilla,Athletic Club,1,0,3,0,3,...,2122,Sevilla vs Athletic Club,501.0,,0.0,0.0,5.0,26.0,4.0,2021
18238,18238,759.0,La Liga (Matchweek 38),Real Sociedad,Atlético Madrid,1,2,4,0,2,...,2122,Real Sociedad vs Atlético Madrid,741.0,,0.0,0.0,8.0,33.0,2.0,2021


In [9]:
# Encode Matchweek variables

# Lookup from the text label to its number, e.g. "La Liga (Matchweek 7)" -> 7,
# generated for matchweeks 1 through 38.
jornada_to_matchweek_id = {
    f"La Liga (Matchweek {week})": week
    for week in range(1, 39)
}

# Keep the original text label under 'Jornada' ...
DataPMatchFBREF = DataPMatchFBREF.rename(columns={'Matchweek': 'Jornada'})

# ... and store the numeric matchweek under 'Matchweek' (labels outside the lookup become NaN)
DataPMatchFBREF['Matchweek'] = DataPMatchFBREF['Jornada'].map(jornada_to_matchweek_id)

In [10]:
# Encode Local Variable
# Define the mapping from Home_Away names to numeric IDs
Local = {"Home": 1, "Away": 2}

# Convert the 'Home_Away' label to the numeric 'Local' code (1 = Home, 2 = Away);
# labels that are not in the mapping become NaN
DataPMatchFBREF['Local'] = DataPMatchFBREF['Home_Away'].map(Local)

In [11]:
# Sort and Order Variables
# Ascending sort on every key (the default), so within a match the row with the
# lower 'Local' code comes first.
sort_keys = ['Year', 'Matchweek', 'Partido', 'Local']
DataPMatchFBREF = DataPMatchFBREF.sort_values(by=sort_keys)

# Keep only the columns listed here, in this order
ordered_columns = [
    'Year', 'Matchweek', 'Season', 'Home_Team', 'Away_Team', 'Team', 'Partido',
    'Home_Away', 'Home_Score', 'Home_Yellow_Cards', 'Home_Red_Cards',
    'Away_Score', 'Away_Yellow_Cards', 'Away_Red_Cards',
    'CrdY', 'CrdR', 'Fls', 'Fld', 'Off', 'PKwon', 'PKcon',
    'Touches_Touches', 'DefPen_Touches',
    'Gls', 'PK', 'PKatt', 'KP', 'Final_Third', 'PPA',
]
DataPMatchFBREF = DataPMatchFBREF.loc[:, ordered_columns]

DataPMatchFBREF.head()

Unnamed: 0,Year,Matchweek,Season,Home_Team,Away_Team,Team,Partido,Home_Away,Home_Score,Home_Yellow_Cards,...,PKwon,PKcon,Touches_Touches,DefPen_Touches,Gls,PK,PKatt,KP,Final_Third,PPA
2,1998,1,9899,Alavés,Real Betis,Alavés,Alavés vs Real Betis,Home,0,4,...,,,,,0,0,,,,
3,1998,1,9899,Alavés,Real Betis,Real Betis,Alavés vs Real Betis,Away,0,4,...,,,,,0,0,,,,
6,1998,1,9899,Celta Vigo,Deportivo La Coruña,Celta Vigo,Celta Vigo vs Deportivo La Coruña,Home,0,3,...,,,,,0,0,,,,
7,1998,1,9899,Celta Vigo,Deportivo La Coruña,Deportivo La Coruña,Celta Vigo vs Deportivo La Coruña,Away,0,3,...,,,,,0,0,,,,
10,1998,1,9899,Espanyol,Tenerife,Espanyol,Espanyol vs Tenerife,Home,2,4,...,,,,,2,0,,,,


In [12]:
# Define Local and Away values
# Each statistic listed here gets two new columns: 'Home<stat>' and 'Away<stat>'.
varsaux = ['Gls', 'PK', 'PKatt', 'Fls', 'Fld', 'Off', 'Touches_Touches', 'DefPen_Touches', 
           'PKwon', 'PKcon', 'KP', 'Final_Third', 'PPA']

for vary in varsaux:
    for locvi in ['Home', 'Away']:  # Only two unique values in 'Home_Away'
        var = locvi + vary
        # Keep the statistic on rows of the matching side; rows of the other side get 0
        DataPMatchFBREF[var] = np.where(DataPMatchFBREF['Home_Away'] == locvi, 
                                        DataPMatchFBREF[vary], 0)
        
        if locvi == 'Away':
            # Move every value up by one row, so the value computed on an 'Away' row lands
            # on the row directly above it (the last row becomes NaN).
            # NOTE(review): this presumably relies on the earlier sort placing each match's
            # 'Home' row immediately above its 'Away' row - confirm every match has exactly
            # those two rows, otherwise values attach to the wrong match.
            DataPMatchFBREF[var] = DataPMatchFBREF[var].shift(periods=-1)  # Shift for 'Away' only


In [13]:
# Drop redundant variables and columns
# Only columns are removed here. No `index=` argument is passed: `index=1` would delete
# the row whose index label is 1, it does not mean "axis 1".
redundant_columns = ['CrdY', 'CrdR', 'Fls', 'Fld', 'Off', 'PKwon', 'PKcon', 'Touches_Touches',
                     'DefPen_Touches', 'Gls', 'PK', 'PKatt', 'KP', 'Final_Third', 'PPA']
DataPMatchFBREF = DataPMatchFBREF.drop(columns=redundant_columns)

# Keep the 'Home' row of every match; .copy() makes the result an independent frame
# so the column assignment below does not write into a slice of the previous frame.
DataPMatchFBREF = DataPMatchFBREF.loc[DataPMatchFBREF["Home_Away"] == 'Home'].copy()

# Destring Season
DataPMatchFBREF['Season'] = DataPMatchFBREF['Season'].astype(str)

In [14]:
# Intermediate save of the per-match FBREF table (the row index is written as the first
# column). The same path is written again further down, after the FootballUK merge.
DataPMatchFBREF.to_excel('Datasets/DataPMatchFBREF.xlsx')

In [15]:
# We need to merge with Datasets/FootballUK/FootballUKdata.xlsx
# and check that data is the same, we have the same number of columns tho. With the exception of Matchweek/Jornada Case
# and the local / home_away

# Import data to merge
# Forward slashes are used like the other 'Datasets/...' paths in this notebook: inside a
# normal (non-raw) string, '\F' is an invalid escape sequence, and backslash separators
# only work on Windows.
FootballUKdata = pd.read_excel('Datasets/FootballUK/FootballUKdata.xlsx', converters = {'Season' : str})

# Season must be text on both sides of the merge
FootballUKdata['Season'] = FootballUKdata['Season'].astype(str)

# Merge data
# Outer join keeps matches found in only one source; indicator=True adds a '_merge'
# column saying which side(s) each row came from.
DataPMatchFBREF = pd.merge(DataPMatchFBREF, FootballUKdata, on = ['Season', 'Partido'], 
             validate = 'many_to_many', how = 'outer', indicator = True)

# Adjust columns: both inputs carry 'Year', keep the left-hand one under its plain name
DataPMatchFBREF = DataPMatchFBREF.drop(columns = ['Year_y'])
DataPMatchFBREF = DataPMatchFBREF.rename(columns = {'Year_x' : 'Year'})

In [16]:
# Save output
# Overwrites the file written before the FootballUK merge; the row index is written as
# the first column of the sheet.
DataPMatchFBREF.to_excel('Datasets/DataPMatchFBREF.xlsx')

# MARKET VALUES

In [17]:
# Import data
# One sheet per season inside a single workbook; sheet names are the season codes.
mktime = ['0405', '0506', '0607', '0708', '0809', '0910', '1011', '1112', '1213', 
          '1314', '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122']

# Open the workbook once: passing a list of sheet names returns a dict {sheet name: frame},
# instead of re-opening and re-parsing the same file for every season.
mkt_sheets = pd.read_excel(os.path.join(r'data\Transfermkt', "mktvalue.xlsx"), sheet_name=mktime)

# Harmonise column names and tag every row with its season code
mktval = {
    t: mkt_sheets[t]
    .rename(columns={'Club': 'Equipo', 'Total market value' : 'TotalmarketvalueEUR'})
    .assign(Season=t)

    for t in mktime
}

# Concatenate all dataframes
mktvalueTot = pd.concat(mktval.values(), ignore_index=True)

In [18]:
# Standardize the name of teams
def standard_teams(df):
    """Shorten long club names in ``df['Equipo']`` to the short names used elsewhere.

    The column is rewritten in place and nothing is returned. Each (long, short)
    pair is applied in the order listed, as a substring replacement on every value.
    """
    name_pairs = (
        ('Albacete Balompié', 'Albacete'),
        ('Athletic Bilbao', 'Athletic Club'),
        ('Atlético de Madrid', 'Atlético Madrid'),
        ('CA Osasuna', 'Osasuna'),
        ('Cádiz CF', 'Cádiz'),
        ('CD Leganés', 'Leganés'),
        ('CD Numancia', 'Numancia'),
        ('CD Tenerife', 'Tenerife'),
        ('Celta de Vigo', 'Celta Vigo'),
        ('Córdoba CF', 'Córdoba'),
        ('Deportivo Alavés', 'Alavés'),
        ('Deportivo de La Coruña', 'La Coruña'),
        ('Elche CF', 'Elche'),
        ('FC Barcelona', 'Barcelona'),
        ('Getafe CF', 'Getafe'),
        ('Gimnàstic de Tarragona', 'Gimnàstic'),
        ('Girona FC', 'Girona'),
        ('Granada CF', 'Granada'),
        ('Hércules CF', 'Hércules'),
        ('Levante UD', 'Levante'),
        ('Málaga CF', 'Málaga'),
        ('Racing Santander', 'Racing Sant'),
        ('RCD Espanyol Barcelona', 'Espanyol'),
        ('RCD Mallorca', 'Mallorca'),
        ('Real Betis Balompié', 'Betis'),
        ('Real Murcia CF', 'Real Murcia'),
        ('Real Oviedo', 'Oviedo'),
        ('Real Valladolid', 'Valladolid'),
        ('Real Zaragoza', 'Zaragoza'),
        ('Recreativo Huelva', 'Recreativo'),
        ('SD Eibar', 'Eibar'),
        ('SD Huesca', 'Huesca'),
        ('Sevilla FC', 'Sevilla'),
        ('UD Almería', 'Almería'),
        ('UD Las Palmas', 'Las Palmas'),
        ('Valencia CF', 'Valencia'),
        ('Valladolid CF', 'Valladolid'),
        ('Villarreal CF', 'Villarreal'),
        ('Xerez CD', 'Xerez'),
    )

    for column in ('Equipo',):
        cleaned = df[column]
        # Apply the pairs one after another on the running result
        for long_name, short_name in name_pairs:
            cleaned = cleaned.str.replace(long_name, short_name, regex=True)
        df[column] = cleaned

# Apply the function
# standard_teams rewrites mktvalueTot['Equipo'] in place and returns None
standard_teams(mktvalueTot)

In [19]:
# Define dataset
# (not the last expression of the cell, so this preview is evaluated but not displayed)
mktvalueTot.head()

# Save output
# The row index is written as the first column of the sheet
mktvalueTot.to_excel('Datasets/mktvalueTot.xlsx')

# MARKET VALUES AND WAGES

In [20]:
# One sheet per season inside a single workbook; sheet names are the season codes.
twages = ['1314', '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122']

# Open the workbook once: passing a list of sheet names returns a dict {sheet name: frame},
# instead of re-opening and re-parsing the same file for every season.
wage_sheets = pd.read_excel(os.path.join(r'data\FBREF', "WAGES LaLiga.xlsx"), sheet_name=twages)

# Harmonise the team column name, tag every row with its season code and stack the
# seasons in the order given by `twages`.
wages = pd.concat(
    [wage_sheets[t].rename(columns={'Squad': 'Equipo'}).assign(Season=t) for t in twages],
    ignore_index=True,
)

# Display dataset
wages.head()

Unnamed: 0,Rk,Equipo,# Pl,Weekly Wages EUR,Weekly Wages GBP,Weekly Wages USD,Annual Wages EUR,Annual Wages GBP,Annual Wages USD,% Estimated,Season
0,1,Barcelona,33.0,"â‚¬ 3,427,981","£2,874,391",3493495.0,"â‚¬ 178,255,000","£149,468,309",181661754.0,1.0,1314
1,2,Real Madrid,34.0,"â‚¬ 2,609,769","£2,188,313",2659646.0,"â‚¬ 135,708,000","£113,792,296",138301612.0,1.0,1314
2,3,AtlÃ©tico Madrid,41.0,"â‚¬ 936,096","£784,925",953987.0,"â‚¬ 48,677,000","£40,816,075",49607301.0,1.0,1314
3,4,Valencia,42.0,"â‚¬ 597,654","£501,138",609076.0,"â‚¬ 31,078,000","£26,059,168",31671952.0,1.0,1314
4,5,Sevilla,43.0,"â‚¬ 484,577","£406,322",493838.0,"â‚¬ 25,198,000","£21,128,735",25679578.0,1.0,1314


In [21]:
# Adjust for terms and symbols
def standard_teams(df):
    """Repair mis-decoded accented letters and shorten two club names in ``df['Equipo']``.

    The column is rewritten in place and nothing is returned. Values that are not
    strings (for example missing values) are left untouched.

    Repair happens in two steps for every value:

    1. If the whole text can be re-encoded as cp1252 and decoded as UTF-8, that result
       is used. This undoes the usual "UTF-8 read as cp1252" damage for any accented
       letter, not only the ones listed below. Text that is already correct fails this
       round trip and is kept as it is.
    2. The explicit replacement tables are applied as plain substring replacements.
    """
    # Two-character sequences left behind when a UTF-8 accented letter is decoded as
    # cp1252. Written as escapes because some second characters are invisible
    # (U+00A0 no-break space, U+00AD soft hyphen).
    mojibake = {
        '\u00c3\u00a1': 'á',
        '\u00c3\u00a0': 'à',
        '\u00c3 ': 'à',        # same sequence with the no-break space turned into a plain space
        '\u00c3\u00a9': 'é',
        '\u00c3\u00ad': 'í',
        '\u00c3\u00b3': 'ó',
        '\u00c3\u00b1': 'ñ',
    }
    club_names = {'Girona FC': 'Girona', 'SD Huesca': 'Huesca'}

    def _repair(value):
        # Leave NaN / None / numbers alone
        if not isinstance(value, str):
            return value
        try:
            value = value.encode('cp1252').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not a clean round trip: keep the text and rely on the tables below
            pass
        for table in (mojibake, club_names):
            for old_text, new_text in table.items():
                value = value.replace(old_text, new_text)
        return value

    for column in ['Equipo']:
        df[column] = df[column].map(_repair)

            
def standard_money(df):
    """Strip currency markers and thousands separators from the four wage text columns.

    The columns 'Weekly Wages EUR', 'Weekly Wages GBP', 'Annual Wages EUR' and
    'Annual Wages GBP' are rewritten in place; nothing is returned. Values stay text
    (e.g. '3427981') and missing values stay missing.
    """
    # Everything removed from the text, as one regular expression:
    #   - '\u00e2\u201a\u00ac' : the euro sign as it looks after UTF-8 bytes were read as cp1252
    #   - '\u20ac', '\u00a3'   : a real euro sign and the pound sign
    #   - ','                  : thousands separator, with or without a following space
    #   - whitespace           : the gap between the symbol and the number
    noise = '\u00e2\u201a\u00ac|[\u20ac\u00a3,\\s]'

    money_columns = ['Weekly Wages EUR', 'Weekly Wages GBP', 'Annual Wages EUR', 'Annual Wages GBP']
    for column in money_columns:
        df[column] = df[column].str.replace(noise, '', regex=True)
            
            

# Apply the function
# Both helpers rewrite columns of `wages` in place and return None.
# NOTE(review): `standard_teams` here is the definition from this cell, which replaces
# the earlier function of the same name defined in the market-values section.
standard_teams(wages)
standard_money(wages)

In [22]:
# Join wages (left) with market values (right) on team and season.
# The outer join keeps team-seasons present in only one of the two tables, and
# validate='one_to_one' raises if a (team, season) pair is duplicated on either side.
wage_value_keys = ['Equipo', 'Season']
mktvalueTot = wages.merge(mktvalueTot, on=wage_value_keys, how='outer', validate='one_to_one')
mktvalueTot.columns

Index(['Rk', 'Equipo', '# Pl', 'Weekly Wages EUR', 'Weekly Wages GBP',
       'Weekly Wages USD', 'Annual Wages EUR', 'Annual Wages GBP',
       'Annual Wages USD', '% Estimated', 'Season', 'Squad', 'ø age',
       'Foreigners', 'ø market value', 'TotalmarketvalueEUR'],
      dtype='object')

In [23]:
# Define Year variable

# Season codes in chronological order, paired with consecutive years 2004..2021
_mkt_season_codes = [
    '0405', '0506', '0607', '0708', '0809', '0910',
    '1011', '1112', '1213', '1314', '1415', '1516',
    '1617', '1718', '1819', '1920', '2021', '2122',
]
season_to_year = dict(zip(_mkt_season_codes, range(2004, 2022)))

# Season codes missing from the mapping become NaN
mktvalueTot['Year'] = mktvalueTot['Season'].map(season_to_year)

mktvalueTot.head()

Unnamed: 0,Rk,Equipo,# Pl,Weekly Wages EUR,Weekly Wages GBP,Weekly Wages USD,Annual Wages EUR,Annual Wages GBP,Annual Wages USD,% Estimated,Season,Squad,ø age,Foreigners,ø market value,TotalmarketvalueEUR,Year
0,1.0,Barcelona,33.0,3427981,2874391,3493495.0,178255000,149468309,181661754.0,1.0,1314,29,26.2,10,€20.42m,€592.20m,2013
1,2.0,Real Madrid,34.0,2609769,2188313,2659646.0,135708000,113792296,138301612.0,1.0,1314,32,25.1,14,€19.90m,€636.80m,2013
2,3.0,Atlético Madrid,41.0,936096,784925,953987.0,48677000,40816075,49607301.0,1.0,1314,35,25.0,19,€8.82m,€308.80m,2013
3,4.0,Valencia,42.0,597654,501138,609076.0,31078000,26059168,31671952.0,1.0,1314,42,24.7,21,€4.26m,€178.80m,2013
4,5.0,Sevilla,43.0,484577,406322,493838.0,25198000,21128735,25679578.0,1.0,1314,37,24.5,19,€3.81m,€141.15m,2013


In [24]:
# Drop variables
# Columns removed from the combined table
unused_columns = [
    'Weekly Wages EUR', 'Weekly Wages GBP', 'Weekly Wages USD',
    'Annual Wages GBP', '% Estimated',
    'Squad', 'ø age', 'Foreigners', 'ø market value',
    'Rk', '# Pl',
]
mktvalueTot = mktvalueTot.drop(columns=unused_columns)

In [25]:
# Standardize the names of teams

# La Coruña -> Deportivo La Coruña
# Racing Sant -> Racing Santander

# The patterns skip names that are already in their long form, so running this cell
# again (or feeding it a name that is already expanded) does not produce
# 'Deportivo Deportivo La Coruña' or 'Racing Santanderander'.
mktvalueTot['Equipo'] = (
    mktvalueTot['Equipo']
    # 'La Coruña' not already preceded by 'Deportivo '
    .str.replace(r'(?<!Deportivo )La Coruña', 'Deportivo La Coruña', regex=True)
    # 'Racing Sant' not already followed by 'ander'
    .str.replace(r'Racing Sant(?!ander)', 'Racing Santander', regex=True)
)

In [26]:
# Define home and away variables
# Two copies of the team name, added as 'Home_Team' and 'Away_Team'
mktvalueTot = mktvalueTot.assign(
    Home_Team=mktvalueTot['Equipo'],
    Away_Team=mktvalueTot['Equipo'],
)

In [27]:
# Display the first rows of the combined wages / market value table (mktvalueTot)
mktvalueTot.head()

Unnamed: 0,Equipo,Annual Wages EUR,Annual Wages USD,Season,TotalmarketvalueEUR,Year,Home_Team,Away_Team
0,Barcelona,178255000,181661754.0,1314,€592.20m,2013,Barcelona,Barcelona
1,Real Madrid,135708000,138301612.0,1314,€636.80m,2013,Real Madrid,Real Madrid
2,Atlético Madrid,48677000,49607301.0,1314,€308.80m,2013,Atlético Madrid,Atlético Madrid
3,Valencia,31078000,31671952.0,1314,€178.80m,2013,Valencia,Valencia
4,Sevilla,25198000,25679578.0,1314,€141.15m,2013,Sevilla,Sevilla


# Merge Mkt value and wages data with team stats

In [28]:
# Bring mktvalueTot and merge with SQVS
SQVS = pd.read_excel('Datasets/SQVS.xlsx', converters = {'Season' : str})

# Expand the two short team names to their long form. The patterns skip names that are
# already long, so an existing 'Deportivo La Coruña' or 'Racing Santander' is not
# turned into 'Deportivo Deportivo La Coruña' / 'Racing Santanderander'.
SQVS['Equipo'] = (
    SQVS['Equipo']
    .str.replace(r'(?<!Deportivo )La Coruña', 'Deportivo La Coruña', regex = True)
    .str.replace(r'Racing Sant(?!ander)', 'Racing Santander', regex = True)
)

# Outer join keeps team-seasons present in only one table; validate raises if a
# (Season, Equipo) pair is duplicated on either side.
DATA_PRE = pd.merge(mktvalueTot, SQVS, on = ['Season', 'Equipo'], how = 'outer', 
                    validate = 'one_to_one')

In [29]:
# Define variables
# Keep the untouched text (e.g. '€592.20m') so the unit suffix can still be inspected
market_text = DATA_PRE['TotalmarketvalueEUR']
DATA_PRE['TotMktValueStr'] = market_text

# Adjust Variables: remove the unit suffixes and the currency sign, then destring
market_number = market_text
for token in ('m', 'bn', '€'):
    market_number = market_number.str.replace(token, '')
market_number = market_number.astype(float)

# Adjust values: scale by the unit found in the original text
# ('m' -> x 1,000,000; 'bn' -> x 1,000,000,000; neither -> unchanged)
in_millions = DATA_PRE['TotMktValueStr'].str.contains('m')
in_billions = DATA_PRE['TotMktValueStr'].str.contains('bn')
DATA_PRE['TotalmarketvalueEUR'] = np.where(
    in_millions,
    market_number * 1000000,
    np.where(in_billions, market_number * 1000000000, market_number),
)

In [30]:
# Adjust columns
# Where the left-hand year is missing, take the right-hand one
left_year = DATA_PRE['Year_x']
DATA_PRE['Year_x'] = np.where(left_year.isna(), DATA_PRE['Year_y'], left_year)

# Keep a single 'Year' column
DATA_PRE = (
    DATA_PRE
    .drop(columns=['Year_y'])
    .rename(columns={'Year_x': 'Year'})
)

In [31]:
# Keep variables of interest and order them
data_pre_columns = [
    'Equipo', 'Annual Wages EUR', 'Annual Wages USD',
    'Season', 'Year', 'Home_Team', 'Away_Team', 'RL', 'PJ',
    'TA_contra', 'TR_contra', 'TA2_contra',
    'TA_favor', 'TR_favor', 'TA2_favor',
    'TPint_favor', 'TPint_contra',
    'Fls_cometidasPor', 'Fls_recibidasContra', 'Offside_contra',
    'PG', 'PE', 'PP', 'DG',
    'TotMktValueStr', 'TotalmarketvalueEUR',
]
DATA_PRE = DATA_PRE.loc[:, data_pre_columns]

In [32]:
# Save the team-season table. NOTE(review): unlike the other outputs, this file is
# written to the working directory itself rather than to 'Datasets/' - confirm intended.
DATA_PRE.to_excel('DATA_PRE.xlsx')

# Merge data per match with quality/value data

In [33]:
# Reload the per-match table saved earlier; 'Season' is read as text so codes such as
# '0405' keep their leading zero.
DataPMatchFBREF = pd.read_excel('Datasets/DataPMatchFBREF.xlsx', converters= {'Season' : str})

In [34]:
# Sanity check: list the distinct season codes present after reloading
DataPMatchFBREF['Season'].unique()

array(['9899', '9900', '0001', '0102', '0203', '0304', '0405', '0506',
       '0607', '0708', '0809', '0910', '1011', '1112', '1213', '1314',
       '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122'],
      dtype=object)

In [35]:
# Keep relevant variables
# .copy() makes the selection an independent frame, so the columns added and overwritten
# in the following cells are not written into a slice of the reloaded table.
match_columns = ['Year', 'Matchweek', 'Season', 'Home_Team', 'Away_Team', 
                 'Partido', 'Home_Score', 'Home_Yellow_Cards', 'Away_Yellow_Cards',
                 'Home_Red_Cards', 'Away_Red_Cards', 'Away_Score', 'HomeOff', 'AwayOff', 
                 'HomePKcon', 'AwayPKcon', 'HomePK', 'AwayPK', 
                 'HomePKatt', 'AwayPKatt', 'Date', 'FullTime', 
                 'homefouls', 'awayfouls']
DataPMatchFBREF = DataPMatchFBREF[match_columns].copy()

In [36]:
# Define variables
# Each side's "fouled" count is the other side's fouls, and each side's conceded
# penalties are the other side's penalty attempts. AwayFld and HomeFld are new columns;
# HomePKcon and AwayPKcon overwrite the existing ones.
DataPMatchFBREF = DataPMatchFBREF.assign(
    AwayFld=DataPMatchFBREF['homefouls'],
    HomeFld=DataPMatchFBREF['awayfouls'],
    HomePKcon=DataPMatchFBREF['AwayPKatt'],
    AwayPKcon=DataPMatchFBREF['HomePKatt'],
).rename(columns={'homefouls': 'Home_Fouls', 'awayfouls': 'Away_Fouls'})

In [37]:
# Encode variables
# 'H' -> 'Home', 'A' -> 'Away', any other recorded value -> 'Draw'.
# Two details:
#   * a missing result stays missing instead of being counted as a draw;
#   * labels that are already decoded map to themselves, so running the cell a second
#     time does not turn every match into 'Draw'.
result_labels = {'H': 'Home', 'A': 'Away', 'Home': 'Home', 'Away': 'Away'}
full_time_raw = DataPMatchFBREF['FullTime']
DataPMatchFBREF['FullTime'] = (
    full_time_raw
    .map(result_labels)
    .fillna('Draw')
    .where(full_time_raw.notna())   # put NaN back where the result was missing
)

# Numeric result code: 1 = home win, 2 = draw, 3 = away win (NaN when the result is missing)
FulltoFinal = {
    'Home': 1, 'Away': 3, 'Draw': 2
}

DataPMatchFBREF['FinalResult'] = DataPMatchFBREF['FullTime'].map(FulltoFinal)

In [38]:
# Standardize team name: 'Real Betis' -> 'Betis' in both team columns
for team_column in ['Home_Team', 'Away_Team']:
    DataPMatchFBREF[team_column] = DataPMatchFBREF[team_column].str.replace('Real Betis', 'Betis')

In [39]:
# Attach the home team's market value and wages to every match.
# NOTE(review): the outer join also keeps team-seasons from mktvalueTot that match no
# game, as rows with empty match columns - confirm this is wanted.
Merge1 = DataPMatchFBREF.merge(mktvalueTot, on=['Home_Team', 'Season'], how='outer',
                               validate='many_to_many')

# Copy the team-level columns under a 'Home' prefix
for source_column in ['TotalmarketvalueEUR', 'Annual Wages EUR', 'Equipo']:
    Merge1['Home' + source_column] = Merge1[source_column]

# Remove the unprefixed team-level columns and the right-hand duplicates, then give the
# left-hand 'Year' and 'Away_Team' their plain names back
home_leftovers = ['Away_Team_y', 'Year_y', 'TotalmarketvalueEUR', 'Annual Wages EUR',
                  'Annual Wages USD', 'Equipo']
Merge1 = (
    Merge1
    .drop(columns=home_leftovers)
    .rename(columns={'Year_x': 'Year', 'Away_Team_x': 'Away_Team'})
)

In [40]:
# Attach the away team's market value and wages, joining on away team and season
away_keys = ['Away_Team', 'Season']
Merge1 = Merge1.merge(mktvalueTot, on=away_keys, how='outer', validate='many_to_many')

In [41]:
# Copy the team-level columns under an 'Away' prefix
for source_column in ['TotalmarketvalueEUR', 'Annual Wages EUR', 'Equipo']:
    Merge1['Away' + source_column] = Merge1[source_column]

# Remove the unprefixed team-level columns and the right-hand duplicates, then give the
# left-hand 'Year' and 'Home_Team' their plain names back
away_leftovers = ['Home_Team_y', 'Year_y', 'TotalmarketvalueEUR', 'Annual Wages EUR',
                  'Annual Wages USD', 'Equipo']
Merge1 = (
    Merge1
    .drop(columns=away_leftovers)
    .rename(columns={'Year_x': 'Year', 'Home_Team_x': 'Home_Team'})
)

In [42]:
# Destring total market variables
for side in ['Home', 'Away']:
    value_column = side + 'TotalmarketvalueEUR'
    text_column = value_column + 'str'

    # Keep the untouched text (e.g. '€592.20m') so the unit suffix can still be inspected
    Merge1[text_column] = Merge1[value_column]

    # Remove the currency sign and the unit suffixes, then convert to float
    stripped = Merge1[value_column]
    for token in ('€', 'm', 'bn'):
        stripped = stripped.str.replace(token, '')
    Merge1[value_column] = stripped.astype(float)

    # Scale to euros: x 1,000,000 when the text contains 'm', otherwise x 1,000,000,000
    has_m = Merge1[text_column].str.contains('m')
    Merge1[value_column] = np.where(has_m,
                                    Merge1[value_column] * 1000000,
                                    Merge1[value_column] * 1000000000)

# Drop the helper text columns and give the value columns their final names
Merge1 = (
    Merge1
    .drop(columns=['HomeTotalmarketvalueEURstr', 'AwayTotalmarketvalueEURstr'])
    .rename(columns={'HomeTotalmarketvalueEUR': 'HomeMktValue',
                     'AwayTotalmarketvalueEUR': 'AwayMktValue'})
)

In [43]:
# Apply the 'Real Betis' -> 'Betis' shortening to the match label as well
Merge1['Partido'] = Merge1['Partido'].str.replace('Real Betis', 'Betis')

In [44]:
# List the columns available before choosing the final order
Merge1.columns

Index(['Year', 'Matchweek', 'Season', 'Home_Team', 'Away_Team', 'Partido',
       'Home_Score', 'Home_Yellow_Cards', 'Away_Yellow_Cards',
       'Home_Red_Cards', 'Away_Red_Cards', 'Away_Score', 'HomeOff', 'AwayOff',
       'HomePKcon', 'AwayPKcon', 'HomePK', 'AwayPK', 'HomePKatt', 'AwayPKatt',
       'Date', 'FullTime', 'Home_Fouls', 'Away_Fouls', 'AwayFld', 'HomeFld',
       'FinalResult', 'HomeMktValue', 'HomeAnnual Wages EUR', 'HomeEquipo',
       'AwayMktValue', 'AwayAnnual Wages EUR', 'AwayEquipo'],
      dtype='object')

In [45]:
# Order variables: keep only the columns listed here, in this order
final_columns = [
    'Year', 'Matchweek', 'Season', 'Home_Team', 'Away_Team', 'Partido',
    'Home_Score', 'Home_Yellow_Cards', 'Home_Red_Cards',
    'Away_Score', 'Away_Yellow_Cards', 'Away_Red_Cards',
    'HomeOff', 'AwayOff', 'HomePKcon', 'AwayPKcon',
    'HomePK', 'AwayPK', 'HomePKatt', 'AwayPKatt',
    'Date', 'Home_Fouls', 'Away_Fouls', 'AwayFld', 'HomeFld', 'FinalResult',
    'HomeEquipo', 'HomeAnnual Wages EUR', 'HomeMktValue',
    'AwayEquipo', 'AwayAnnual Wages EUR', 'AwayMktValue',
]
Merge1 = Merge1.loc[:, final_columns]

In [46]:
# Save the final per-match table; index = False leaves the row index out of the sheet
Merge1.to_excel('Datasets/merge1.xlsx', index = False)