In [1]:
# Import libraries
import pandas as pd
import os
import numpy as np

# Set working directory
os.chdir('C:\\Users\\aaron\\Desktop\\GitHub\\Thesis\\Python')

In [2]:
# Import data
PenaltiesConceded = pd.read_excel('Datasets/PenaltiesConceded.xlsx')
Merge1 = pd.read_excel('Datasets/Merge1.xlsx', dtype = {'Season' : str})

# Visualize columns
print(PenaltiesConceded.columns)
print(Merge1.columns)

Index(['Year', 'Matchday', 'Home_Team', 'Away_Team', 'HomeTotPKcon',
       'AwayTotPKcon', 'HomeTotPKatt', 'AwayTotPKatt', 'Partido'],
      dtype='object')
Index(['Year', 'Matchweek', 'Season', 'Home_Team', 'Away_Team', 'Partido',
       'Home_Score', 'Home_Yellow_Cards', 'Home_Red_Cards', 'Away_Score',
       'Away_Yellow_Cards', 'Away_Red_Cards', 'HomeOff', 'AwayOff',
       'HomePKcon', 'AwayPKcon', 'HomePK', 'AwayPK', 'HomePKatt', 'AwayPKatt',
       'Date', 'Home_Fouls', 'Away_Fouls', 'AwayFld', 'HomeFld', 'FinalResult',
       'HomeEquipo', 'HomeAnnual Wages EUR', 'HomeMktValue', 'AwayEquipo',
       'AwayAnnual Wages EUR', 'AwayMktValue'],
      dtype='object')


In [3]:
print(PenaltiesConceded['Year'].unique())
print(Merge1['Season'].unique())

[2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 1999 2000 2001 1998
 2003 2002]
['9899' '9900' '0001' '0102' '0203' '0304' '0405' '0506' '0607' '0708'
 '0809' '0910' '1011' '1112' '1213' '1314' '1415' '1516' '1617' '1718'
 '1819' '1920' '2021' '2122']


In [4]:
# Merge data
FullTeamProbit = pd.merge(PenaltiesConceded, Merge1, on = ['Partido', 'Year'],
                         indicator = True, validate = 'many_to_many', how = 'outer')

FullTeamProbit = FullTeamProbit.drop(columns = ['Home_Team_x', 'Away_Team_x'])
FullTeamProbit = FullTeamProbit.rename(columns = {'Home_Team_y' : 'Home_Team', 'Away_Team_y' : 'Away_Team'})

FullTeamProbit = FullTeamProbit.sort_values(by = ['Year', 'Matchweek'])

In [5]:
# Set penalty conceded and penalty attempted variables to 0 for years before 2014
vars = ['HomePKcon', 'AwayPKcon', 'HomePKatt', 'AwayPKatt']

for var in vars:
    FullTeamProbit[var] = np.where((FullTeamProbit['Year'] < 2014), 
                             0, FullTeamProbit[var])
    
# Replace missing values for total penalties conceded and attempted with 0
vars = ['HomeTotPKcon', 'AwayTotPKcon', 'HomeTotPKatt', 'AwayTotPKatt']

for var in vars:
    FullTeamProbit[var] = np.where((FullTeamProbit[var].isna()),
                                  0, FullTeamProbit[var])

In [6]:
# Generate combined penalty concession and attempt variables for home and away teams
FullTeamProbit['HomePenConc'] = FullTeamProbit['HomeTotPKcon'] + FullTeamProbit['HomePKcon']
FullTeamProbit['AwayPenConc'] = FullTeamProbit['AwayTotPKcon'] + FullTeamProbit['AwayPKcon']
FullTeamProbit['HomePenAtt'] = FullTeamProbit['HomeTotPKatt'] + FullTeamProbit['HomePKatt']
FullTeamProbit['AwayPenAtt'] = FullTeamProbit['AwayTotPKatt'] + FullTeamProbit['AwayPKatt']

In [7]:
FullTeamProbit.columns

Index(['Year', 'Matchday', 'HomeTotPKcon', 'AwayTotPKcon', 'HomeTotPKatt',
       'AwayTotPKatt', 'Partido', 'Matchweek', 'Season', 'Home_Team',
       'Away_Team', 'Home_Score', 'Home_Yellow_Cards', 'Home_Red_Cards',
       'Away_Score', 'Away_Yellow_Cards', 'Away_Red_Cards', 'HomeOff',
       'AwayOff', 'HomePKcon', 'AwayPKcon', 'HomePK', 'AwayPK', 'HomePKatt',
       'AwayPKatt', 'Date', 'Home_Fouls', 'Away_Fouls', 'AwayFld', 'HomeFld',
       'FinalResult', 'HomeEquipo', 'HomeAnnual Wages EUR', 'HomeMktValue',
       'AwayEquipo', 'AwayAnnual Wages EUR', 'AwayMktValue', '_merge',
       'HomePenConc', 'AwayPenConc', 'HomePenAtt', 'AwayPenAtt'],
      dtype='object')

In [8]:
# Keep relevant variables and order
FullTeamProbit = FullTeamProbit[['Home_Team', 'Away_Team', 'Partido', 'Matchweek', 'Season', 
                                 'Home_Score', 'Home_Yellow_Cards', 'Home_Red_Cards', 'Away_Score',
                                 'Away_Yellow_Cards', 'Away_Red_Cards', 'HomePenConc', 'AwayPenConc',
                                 'HomePenAtt', 'AwayPenAtt', 'HomeOff', 'AwayOff', 'Date', 'Home_Fouls',
                                 'Away_Fouls', 'AwayFld', 'HomeFld', 'FinalResult', 'HomeEquipo',
                                 'HomeAnnual Wages EUR', 'HomeMktValue', 'AwayEquipo', 'AwayAnnual Wages EUR', 'AwayMktValue']]

In [9]:
# Import data
MergeFootballLineups = pd.read_excel('Datasets/FootballLineups/MergeFootballLineups.xlsx', 
                                     dtype = {'Season' : str})

In [10]:
# Merge
FullTeamProbit = pd.merge(FullTeamProbit, MergeFootballLineups, on = ['Partido', 'Season'],
                         how = 'outer', validate = 'many_to_many', indicator = True)

In [11]:
# Replace missing values for various game statistics with 0
variables = []
for var in variables:
    FullTeamProbit[var] = np.where( (FullTeamProbit[var].isna()),
                                  0, FullTeamProbit[var])

In [12]:
# Generate total offsides variables by summing individual offsides counts
FullTeamProbit['HomeOffs'] = FullTeamProbit['HomeOff'] + FullTeamProbit['Home_Offsides']
FullTeamProbit['AwayOffs'] = FullTeamProbit['HomeOff'] + FullTeamProbit['Away_Offsides']

In [13]:
locality = ['Home', 'Away']
for local in locality:
    varx = local + '_Fouls_x'
    vary = local + '_Fouls_y'
        
    FullTeamProbit[varx] = np.where( (FullTeamProbit[varx] != FullTeamProbit[vary]) & (~FullTeamProbit[vary].isna()),
                                         FullTeamProbit[vary],
                                         FullTeamProbit[varx])

In [14]:
# Drop vars
FullTeamProbit.drop(columns = ['HomeOff', 'AwayOff', 'Home_Offsides', 'Away_Offsides',
                              'HomeFouls', 'AwayFouls', 'HomeFld', 'AwayFld', 'url', 
                              'Home_Possession', 'Away_Possession',
                              'Home_Fouls_y', 'Away_Fouls_y',
                              'hometeam', 'awayteam'], inplace = True)

In [15]:
# Rename variables for clarity
FullTeamProbit = FullTeamProbit.rename(columns = {'Home_Score' : 'HomeGoals', 'Away_Score' : 'AwayGoals',
                'Home_Yellow_Cards' : 'HomeYellow', 'Away_Yellow_Cards' : 'AwayYellow',
                'Home_Red_Cards' : 'HomeRed', 'Away_Red_Cards' : 'AwayRed',
                'Home_Fouls_x' : 'HomeFouls', 'Away_Fouls_x' : 'AwayFouls',
                'Home_Fld' : 'HomeFld', 'Away_Fld' : 'AwayFld',
                'HomeOffs' : 'HomeOffsides', 'AwayOffs' : 'AwayOffsides',
                'Home_Team' : 'hometeam', 'Away_Team' : 'awayteam'})

In [16]:
# Order

In [17]:
# Define variables and rename
FullTeamProbit['Jornada'] = FullTeamProbit['Matchweek']

In [18]:
# Define Year variable
season_to_year = {
    '9899': 1998, '9900': 1999, '0001': 2000, '0102': 2001, '0203': 2002, 
    '0304': 2003, '0405': 2004, '0506': 2005, '0607': 2006, '0708': 2007, 
    '0809': 2008, '0910': 2009, '1011': 2010, '1112': 2011, '1213': 2012, 
    '1314': 2013, '1415': 2014, '1516': 2015, '1617': 2016, '1718': 2017, 
    '1819': 2018, '1920': 2019, '2021': 2020, '2122': 2021
}

FullTeamProbit['Year'] = FullTeamProbit['Season'].map(season_to_year)

In [19]:
# Look for duplicated obs
FullTeamProbit['Aux'] = FullTeamProbit['Partido'] + FullTeamProbit['Season']

# Find duplicated obs
FullTeamProbit['Dupy'] = np.where((FullTeamProbit['Aux'].duplicated(keep = False)) == True,
                                 1, 0)
# Drop duplicated values
FullTeamProbit['Aux2'] = np.where( (FullTeamProbit['Dupy'] == 1) & (FullTeamProbit['HomeFld'].isna()), 1, 0 )

FullTeamProbit = FullTeamProbit[(FullTeamProbit['Aux2'] == 0)]

FullTeamProbit['Dupy'] = np.where((FullTeamProbit['Aux'].duplicated()) == True,
                                 1, 0)

FullTeamProbit = FullTeamProbit[(FullTeamProbit['Dupy'] == 0)]

# Drop aux variables
FullTeamProbit = FullTeamProbit.drop(columns = ['Aux', 'Dupy', 'Aux2', '_merge'])

In [20]:
# Merge data
Attendance9821R = pd.read_excel('Datasets/TRANSFERMKT/Attendance9821R.xlsx')
ODDS = pd.read_excel('Datasets/odds.xlsx')

# MERGE
FullTeamProbit = pd.merge(FullTeamProbit, Attendance9821R, on = ['Year', 'Jornada', 'hometeam'],
                          validate = 'many_to_many')

FullTeamProbit = pd.merge(FullTeamProbit, ODDS, on = ['Year', 'Partido'],
                         validate = 'many_to_many')

In [21]:
# Save data
FullTeamProbit.to_excel('Datasets/FullTeamProbit.xlsx')

In [22]:
# Keep relevant variables, reorder and sort
FullTeamProbit = FullTeamProbit[['Date_x', 'Matchweek', 'Year', 'hometeam', 'awayteam', 
                    'Partido', 'HomeGoals', 'AwayGoals', 'HomeFouls', 'AwayFouls',
                    'HomeFld', 'AwayFld', 'HomeYellow', 'AwayYellow', 'HomeRed', 'AwayRed',
                    'HomeOffsides', 'AwayOffsides', 'HomePenConc', 'AwayPenConc', 
                    'HomePenAtt', 'AwayPenAtt', 'FinalResult', 'HomeEquipo', 
                    'HomeAnnual Wages EUR', 'HomeMktValue', 'AwayEquipo', 
                    'AwayAnnual Wages EUR', 'AwayMktValue', 'Attendance', 'IWH', 'IWD', 'IWA']]
# Reset index

In [23]:
# Rename for clarity
FullTeamProbit = FullTeamProbit.rename(columns = {'Attendance' : 'AttendanceTransfMkt',
                'hometeam' : 'HomeTeam', 'awayteam' : 'AwayTeam', 'Date_x' : 'Date'})

In [24]:
# Save data
FullTeamProbit.to_excel('Datasets/FullTeamProbit.xlsx')

# 

In [25]:
# Import data
Final1 = pd.read_excel('Datasets/FullTeamProbit.xlsx')

In [26]:
# Duplicate obs / Reshape
#Final1 = pd.concat([Final1, Final1], ignore_index = True)
#Final1['dupindicator'] = Final1.duplicated(keep='first').astype(int)

# Sort data
# this might not even be needed at the end lol

In [28]:
# Reshape the dataset from wide to long format
Final1 = pd.wide_to_long(Final1, stubnames=['Home', 'Away'], i=['Date', 'Partido'], j='team_type', sep='')
#Final1 = Final1.reset_index()

# theres something wrong with this

In [30]:
Final1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AwayTeam,HomePenConc,HomeGoals,HomePenAtt,AwayFld,HomeAnnual Wages EUR,Matchweek,HomeFld,IWD,HomeMktValue,...,FinalResult,HomeRed,AwayMktValue,AwayOffsides,HomeFouls,AwayFouls,AwayRed,AwayEquipo,Home,Away
Date,Partido,team_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1


In [27]:
Final1.to_excel('Datasets/Final1.xlsx')

In [29]:
xxxx# Rename columns to reflect the new long format
Final1.rename(columns={'HomeTeam': 'Team', 'HomeGoals': 'Goals', 'HomeFouls': 'Fouls',
                        'HomeYellow': 'Yellow', 'HomeRed': 'Red', 'HomeFld': 'Fld',
                        'HomeOffsides': 'Offsides', 'HomePenAtt': 'PenAtt', 'HomePenConc': 'PenConc',
                        'HomeAnnualWagesEUR': 'AnnualWagesEUR', 'HomeMktValue': 'MktValue',
                        'AwayTeam': 'Team', 'AwayGoals': 'Goals', 'AwayFouls': 'Fouls',
                        'AwayYellow': 'Yellow', 'AwayRed': 'Red', 'AwayFld': 'Fld',
                        'AwayOffsides': 'Offsides', 'AwayPenAtt': 'PenAtt', 'AwayPenConc': 'PenConc',
                        'AwayAnnualWagesEUR': 'AnnualWagesEUR', 'AwayMktValue': 'MktValue'}, inplace=True)

# Create the 'Local' indicator (1 for Home, 0 for Away)
Final1['Local'] = (Final1['team_type'] == 'Home').astype(int)

# Create the 'Win' variable
Final1['Win'] = ((Final1['FinalResult'] == 1) & (Final1['Local'] == 1)) | \
                 ((Final1['FinalResult'] == 3) & (Final1['Local'] == 0))
Final1['Win'] = Final1['Win'].astype(int)

# Sort data to ensure correct pairing of home and away teams
Final1.sort_values(by=['date', 'Partido', 'Local'], inplace=True)

# Calculate differences in market value and goals
Final1['DiffMktValue'] = Final1.groupby(['date', 'Partido'])['MktValue'].diff(periods=1)
Final1['DiffMktValue'] = Final1.groupby(['date', 'Partido'])['DiffMktValue'].transform(lambda x: x.iloc[0] if pd.isnull(x.iloc[0]) else -x)

Final1['GoalDiff'] = Final1.groupby(['date', 'Partido'])['Goals'].diff(periods=1)
Final1['GoalDiff'] = Final1.groupby(['date', 'Partido'])['GoalDiff'].transform(lambda x: x.iloc[0] if pd.isnull(x.iloc[0]) else -x)

# Create "Favor" and "Contra" variables for cards
Final1['YellowFavor'] = Final1.groupby(['date', 'Partido'])['Yellow'].transform(lambda x: x.iloc[0] if Final1['Local'].iloc[x.index].iloc[0] == 0 else x.iloc[1])
Final1['YellowContra'] = Final1.groupby(['date', 'Partido'])['Yellow'].transform(lambda x: x.iloc[0] if Final1['Local'].iloc[x.index].iloc[0] == 1 else x.iloc[1])

Final1['RedFavor'] = Final1.groupby(['date', 'Partido'])['Red'].transform(lambda x: x.iloc[0] if Final1['Local'].iloc[x.index].iloc[0] == 0 else x.iloc[1])
Final1['RedContra'] = Final1.groupby(['date', 'Partido'])['Red'].transform(lambda x: x.iloc[0] if Final1['Local'].iloc[x.index].iloc[0] == 1 else x.iloc[1])

# Display the first few rows of the transformed dataframe
print(Final1.head())

NameError: name 'xxxx' is not defined

In [None]:
# Handle missing data for Fouls, Fld, and Offsides by setting them to missing if the year is before 2005
vars = ['Fouls', 'Fld', 'Offsides']

for var in vars:
    Final1[var] = np.where((Final1['Year'] < 2005), float['nan'], Final1[var])

In [None]:
# Generate variable for difference in bookmaker probabilities (iwh - iwa) based on Local status
Final1['DifBookMkr'] = np.where( (DifBookMkr['Local'] == 1), 
                                Final1['iwh'] - Final1['iwa'],
                                Final1['iwa'] - Final1['iwh'])

In [None]:
# Using the Buraimo methodology, generate variables for bookmaker probabilities adjusted by epsilon
Final1['SumBooker'] = Final1['iwa'] + Final1['iwd'] + Final1['iwh']
Final1['epsilon'] = Final1['SumBooker']-1

Final1['iwhe'] = Final1['iwh'] / Final1['epsilon']
Final1['iwde'] = Final1['iwd'] / Final1['epsilon']
Final1['iwae'] = Final1['iwa'] / Final1['epsilon']

In [None]:
# Generate difference in adjusted bookmaker probabilities based on Local status
Final1['DifBookMaker'] = np.where( (DifBookMkr['Local'] == 1), 
                                Final1['iwhe'] - Final1['iwae'],
                                Final1['iwae'] - Final1['iwhe'])

In [None]:
# Generate the square of the quality proxy variables
variables = ['DifMktValue', 'DifBookMkr', 'DifBookMaker']

for var in variables:
    aux = 'Sq' + var
    
    Final1[aux] = Final1[var]^2

In [None]:
# Generate a variable to indicate if any payments were made during the years 2001-2017
Final1['TdePagos'] = np.where( (Final1['Year'] > 2000) & (Final1['Year'] < 2018),
                             1, 0)

# GENERATE TREATMENT VARIABLES

In [None]:
# Variable indicating if the team is Barcelona (Equipo == 6)
Final1['Barca'] = np.where((Final1['Equipo'] == 6), 1, 0)

# Variable indicating if the team is either Barcelona or Real Madrid (Equipo == 6 or 32)
Final1['BarcaRM'] = np.where((Final1['Equipo'] == 6) | (Final1['Equipo'] == 32),
                             1, 0)

# Generate variable indicating whether a club paid the CTA (1 for Barcelona during payment years)
Final1['Bribe'] = np.where( (Final1['Barca'] == 1) & ((2000 < Final1['Year'] < 2018)), 
                          1, 0)

In [None]:
Final1.to_excel('Datasets/Final1.xlsx')

# Attendance

In [None]:
# Once performed this once, there's no need to perform this again as it is time consuming
dataframes = {}

for i in range(1, 101):
    # Define the path file and file path
    filename = f'FTBLNP{i}.csv'
    filepath = os.path.join(r'Data\FootballLineups\Attendance', filename)

    # Import data
    AttendanceFTBLNP = pd.read_csv(filepath, encoding='ISO-8859-1')
    
    dataframes[filename] = AttendanceFTBLNP
    
#print(dataframes.keys())

In [None]:
# Concat dictionary entries in one dataframe
AttendanceFTBLNP = pd.concat(dataframes.values(), ignore_index = True)
AttendanceFTBLNP.reset_index(inplace = True)
AttendanceFTBLNP = AttendanceFTBLNP.drop(columns = 'Unnamed: 0')

In [None]:
AuxAttendance = AttendanceFTBLNP['Text'].str.split(' - ', expand = True)
AuxAttendance.reset_index(inplace = True)

In [None]:
AttendanceFTBLNP = pd.merge(AttendanceFTBLNP, AuxAttendance, on = 'index',
                           validate = 'one_to_one')

In [None]:
AttendanceFTBLNP = AttendanceFTBLNP[AttendanceFTBLNP[0] != 'character(0)']

AttendanceFTBLNP[1] = AttendanceFTBLNP[1].str.replace(' Matchday', 'La Liga (Matchweek')

In [None]:
rango = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

for i in rango:
    aux1 = i + ' '
    auxi = i + ')'
    auxs = ' ' + i + ' '
    
    AttendanceFTBLNP[1] = AttendanceFTBLNP[1].str.replace(aux1, auxi)
    AttendanceFTBLNP[0] = AttendanceFTBLNP[0].str.replace(auxs, '')

In [None]:
AttendanceFTBLNP[['hometeam', 'awayteam']] = AttendanceFTBLNP[0].str.split(':', n = 1, expand = True)

In [None]:
AttendanceFTBLNP['awayteam'] = AttendanceFTBLNP['awayteam'].str.replace('La Liga', '')
AttendanceFTBLNP['awayteam'] = AttendanceFTBLNP['awayteam'].str.replace('/', '')

for i in range(2001, 2023, 1):
    AttendanceFTBLNP['awayteam'] = AttendanceFTBLNP['awayteam'].str.replace(str(i), '')

AttendanceFTBLNP['awayteam'] = AttendanceFTBLNP['awayteam'].str.replace('  ', '')

In [None]:
AttendanceFTBLNP.to_excel('Datasets/AUXXX.xlsx')

In [None]:
# Generate Year data
AttendanceFTBLNP['Year'] = AttendanceFTBLNP[0].str[-10: -6]
AttendanceFTBLNP['Year'] = AttendanceFTBLNP['Year'].astype(int)

In [None]:
# Keep relevant variables
AttendanceFTBLNP = AttendanceFTBLNP[['index', 'URL', 0, 1, 3, 'Year', 'hometeam', 'awayteam']] # missign text1

In [None]:
# Generate Attendance data
AttendanceFTBLNP[3] = np.where(
    (AttendanceFTBLNP[3].str.contains('Attendance')) | (AttendanceFTBLNP[3].str.contains('Referee')),
    AttendanceFTBLNP[3], ''               
)

AttendanceFTBLNP[3] = AttendanceFTBLNP[3].str.replace('Attendance: ', '')
AttendanceFTBLNP[3] = AttendanceFTBLNP[3].str.replace('closed doors', '0')
AttendanceFTBLNP[3] = AttendanceFTBLNP[3].str.replace('[a-zA-Z]', '')

In [None]:
# Split Attendance by : and make adjustments
AttendanceFTBLNP[['AttendanceFTBLNP', '31']] = AttendanceFTBLNP[3].str.split(':', n = 1, expand = True)
AttendanceFTBLNP['AttendanceFTBLNP'] = AttendanceFTBLNP['AttendanceFTBLNP'].replace('', np.nan)
AttendanceFTBLNP['AttendanceFTBLNP'] = AttendanceFTBLNP['AttendanceFTBLNP'].astype(float)

# Keep relevant variables
AttendanceFTBLNP = AttendanceFTBLNP[['index', 'URL', 0, 1, 'AttendanceFTBLNP', 'Year', 'hometeam', 'awayteam']]

In [None]:
# Clean the name of teams
vars = ['hometeam', 'awayteam']
def standard_teams(df):
    replacements = {
        'Ã³' : 'ó', 'Ã±' : 'ñ', 'Ã\xa0' : 'à', 'Ã©' : 'é',
        'Ã\xad' : 'í'
    }
    for var in vars:
        for old_value, new_value in replacements.items():
            df[var] = df[var].str.replace(old_value, new_value, regex = True)

# Apply the function
standard_teams(AttendanceFTBLNP)

In [None]:
# Standarize the names of the teams
def standard_teams(df):
    replacements = {
        ' de Tarragona' : '', 'Hercules' : 'Hércules', 
        'Malaga' : 'Málaga', 'de Santander' : ' Santander',
        'Real Z' : 'Z', ' de Huelva' : '',
        ' CD' : '', 'Cadiz' : 'Cádiz',
        'Atletico Madrid' : 'Atlético Madrid', ' CF' : '',
        'Deportivo Alaves' : 'Alavés', ' de Barcelona' : '',
        'Real Betis' : 'Betis'
    }
    
    for var in vars:
        for old_value, new_value in replacements.items():
            df[var] = df[var].str.replace(old_value, new_value, regex = True)

# Apply the function
standard_teams(AttendanceFTBLNP)

In [None]:
# Encode data

# Define the mapping from matchweek names to numeric IDs
jornada_to_matchweek_id = {
    "La Liga (Matchweek 1)": 1, "La Liga (Matchweek 2)": 2,
    "La Liga (Matchweek 3)": 3, "La Liga (Matchweek 4)": 4,
    "La Liga (Matchweek 5)": 5, "La Liga (Matchweek 6)": 6,
    "La Liga (Matchweek 7)": 7, "La Liga (Matchweek 8)": 8,
    "La Liga (Matchweek 9)": 9, "La Liga (Matchweek 10)": 10,
    "La Liga (Matchweek 11)": 11, "La Liga (Matchweek 12)": 12,
    "La Liga (Matchweek 13)": 13, "La Liga (Matchweek 14)": 14,
    "La Liga (Matchweek 15)": 15, "La Liga (Matchweek 16)": 16,
    "La Liga (Matchweek 17)": 17, "La Liga (Matchweek 18)": 18,
    "La Liga (Matchweek 19)": 19, "La Liga (Matchweek 20)": 20,
    "La Liga (Matchweek 21)": 21, "La Liga (Matchweek 22)": 22,
    "La Liga (Matchweek 23)": 23, "La Liga (Matchweek 24)": 24,
    "La Liga (Matchweek 25)": 25, "La Liga (Matchweek 26)": 26,
    "La Liga (Matchweek 27)": 27, "La Liga (Matchweek 28)": 28,
    "La Liga (Matchweek 29)": 29, "La Liga (Matchweek 30)": 30,
    "La Liga (Matchweek 31)": 31, "La Liga (Matchweek 32)": 32,
    "La Liga (Matchweek 33)": 33, "La Liga (Matchweek 34)": 34,
    "La Liga (Matchweek 35)": 35, "La Liga (Matchweek 36)": 36,
    "La Liga (Matchweek 37)": 37, "La Liga (Matchweek 38)": 38
}

AttendanceFTBLNP.rename(columns = {1 : 'Jornada'}, inplace = True)

# Convert 'Jornada' string to numeric 'Matchweek'
AttendanceFTBLNP['Matchweek'] = AttendanceFTBLNP['Jornada'].map(jornada_to_matchweek_id)

In [None]:
# We will need to expand the data and duplicate obs
AttendanceFTBLNP = pd.concat([AttendanceFTBLNP, AttendanceFTBLNP], ignore_index = True)
AttendanceFTBLNP['dupindicator'] = AttendanceFTBLNP.duplicated(keep='first').astype(int)

In [None]:
AttendanceFTBLNP['hometeam'] = np.where(AttendanceFTBLNP['dupindicator'] == 1,
                                        '', AttendanceFTBLNP['hometeam'])

AttendanceFTBLNP['awayteam'] = np.where(AttendanceFTBLNP['dupindicator'] == 0,
                                        '', AttendanceFTBLNP['awayteam'])

AttendanceFTBLNP[0] = np.where( (AttendanceFTBLNP['hometeam'] == ''),
                               AttendanceFTBLNP['awayteam'],
                               AttendanceFTBLNP['hometeam'])

In [None]:
# Encode teams based on variable '0'
#AttendanceFTBLNP

In [None]:
# Adjust attendance variable
AttendanceFTBLNP['AttendanceFTBLNP'] = np.where( (AttendanceFTBLNP['Year'] == 2020) & (AttendanceFTBLNP['Matchweek'] < 37),
                             0, AttendanceFTBLNP['AttendanceFTBLNP'])

AttendanceFTBLNP['AttendanceFTBLNP'] = np.where( (AttendanceFTBLNP['Year'] == 2020) & ( AttendanceFTBLNP['AttendanceFTBLNP'].isna() ),
                             0, AttendanceFTBLNP['AttendanceFTBLNP'])

In [None]:
# Keep and order variables
AttendanceFTBLNP = AttendanceFTBLNP[AttendanceFTBLNP['Year', 'Matchweek', 'Equipo', 'AttendanceFTBLNP']]

In [None]:
# Save data
AttendanceFTBLNP.reset_index()
AttendanceFTBLNP.to_excel('Datasets/AUXXX.xlsx')

In [None]:
# Merge
FinalCorr = pd.merge(AttendanceFTBLNP, Final, on = ['Year', 'Matchweek', 'Equipo'],
                           validate = 'one_to_one', indicator = True)

In [None]:
# Import data to merge

# Merge
FinalCorr = pd.merge(FinalCorr, , on = ['Year', 'Matchweek', 'Equipo'],
                    validate = 'one_to_one', indicator = True)

In [None]:
# Minor adjustments
FinalCorr['YellowContra'] = np.where(( FinalCorr[] == 'both' ),
                                     FinalCorr['YellowContra'] - FinalCorr['SecYCrdHOME'],
                                      FinalCorr['YellowContra'] )

FinalCorr['YellowContra'] = np.where(( FinalCorr[] == 'both' ),
                                     FinalCorr['YellowContra'] - FinalCorr['SecYCrdAWAY'],
                                      FinalCorr['YellowContra'] )

# Might need to readjust

In [None]:
# Drop aux variables

In [None]:
# Order variables

In [None]:
# Replace data

In [None]:
# Drop variables

In [None]:
# Save
FinalCorr.reset_index()
FinalCorr.to_excel('Datasets/FinalCorr.xlsx')