In [1]:
# Import libraries
import pandas as pd
import os
import numpy as np

# Set working directory
os.chdir('C:\\Users\\aaron\\Desktop\\GitHub\\Thesis\\Python')

In [None]:
# Import data
PenaltiesConceded = pd.read_excel('Datasets/PenaltiesConceded.xlsx')
Merge1 = pd.read_excel('Datasets/Merge1.xlsx', dtype = {'Season' : str})

# Visualize columns
print(PenaltiesConceded.columns)
print(Merge1.columns)

In [None]:
print(PenaltiesConceded['Year'].unique())
print(Merge1['Season'].unique())

In [None]:
# Merge data
FullTeamProbit = pd.merge(PenaltiesConceded, Merge1, on = ['Partido', 'Year'],
                         indicator = True, validate = 'many_to_many', how = 'outer')

FullTeamProbit = FullTeamProbit.drop(columns = ['Home_Team_x', 'Away_Team_x'])
FullTeamProbit = FullTeamProbit.rename(columns = {'Home_Team_y' : 'Home_Team', 'Away_Team_y' : 'Away_Team'})

FullTeamProbit = FullTeamProbit.sort_values(by = ['Year', 'Matchweek'])

In [None]:
# Set penalty conceded and penalty attempted variables to 0 for years before 2014
vars = ['HomePKcon', 'AwayPKcon', 'HomePKatt', 'AwayPKatt']

for var in vars:
    FullTeamProbit[var] = np.where((FullTeamProbit['Year'] < 2014), 
                             0, FullTeamProbit[var])
    
# Replace missing values for total penalties conceded and attempted with 0
vars = ['HomeTotPKcon', 'AwayTotPKcon', 'HomeTotPKatt', 'AwayTotPKatt']

for var in vars:
    FullTeamProbit[var] = np.where((FullTeamProbit[var].isna()),
                                  0, FullTeamProbit[var])

In [None]:
# Generate combined penalty concession and attempt variables for home and away teams
FullTeamProbit['HomePenConc'] = FullTeamProbit['HomeTotPKcon'] + FullTeamProbit['HomePKcon']
FullTeamProbit['AwayPenConc'] = FullTeamProbit['AwayTotPKcon'] + FullTeamProbit['AwayPKcon']
FullTeamProbit['HomePenAtt'] = FullTeamProbit['HomeTotPKatt'] + FullTeamProbit['HomePKatt']
FullTeamProbit['AwayPenAtt'] = FullTeamProbit['AwayTotPKatt'] + FullTeamProbit['AwayPKatt']

In [None]:
FullTeamProbit.columns

In [None]:
# Keep relevant variables and order
FullTeamProbit = FullTeamProbit[['Home_Team', 'Away_Team', 'Partido', 'Matchweek', 'Season', 
                                 'Home_Score', 'Home_Yellow_Cards', 'Home_Red_Cards', 'Away_Score',
                                 'Away_Yellow_Cards', 'Away_Red_Cards', 'HomePenConc', 'AwayPenConc',
                                 'HomePenAtt', 'AwayPenAtt', 'HomeOff', 'AwayOff', 'Date', 'Home_Fouls',
                                 'Away_Fouls', 'AwayFld', 'HomeFld', 'FinalResult', 'HomeEquipo',
                                 'HomeAnnual Wages EUR', 'HomeMktValue', 'AwayEquipo', 'AwayAnnual Wages EUR', 'AwayMktValue']]

In [None]:
# Import data
MergeFootballLineups = pd.read_excel('Datasets/FootballLineups/MergeFootballLineups.xlsx', 
                                     dtype = {'Season' : str})

In [None]:
# Merge
FullTeamProbit = pd.merge(FullTeamProbit, MergeFootballLineups, on = ['Partido', 'Season'],
                         how = 'outer', validate = 'many_to_many', indicator = True)

In [None]:
FullTeamProbit.columns

In [None]:
# Replace missing values for various game statistics with 0
variables = []
for var in variables:
    FullTeamProbit[var] = np.where( (FullTeamProbit[var].isna()),
                                  0, FullTeamProbit[var])

In [None]:
# Generate total offsides variables by summing individual offsides counts
FullTeamProbit['HomeOffs'] = FullTeamProbit['Home_Off'] + FullTeamProbit['HomeOffsides']
FullTeamProbit['AwayOffs'] = FullTeamProbit['Away_Off'] + FullTeamProbit['AwayOffsides']

In [None]:
# Drop vars

In [None]:
# Rename variables for clarity
FullTeamProbit = FullTeamProbit.rename(columns = {'Home_Score' : 'HomeGoals', 'Away_Score' : 'AwayGoals',
                'Home_Yellow_Cards' : 'HomeYellow', 'Away_Yellow_Cards' : 'AwayYellow',
                'Home_Red_Cards' : 'HomeRed', 'Away_Red_Cards' : 'AwayRed',
                'Home_Fouls' : 'HomeFouls', 'Away_Fouls' : 'AwayFouls',
                'Home_Fld' : 'HomeFld', 'Away_Fld' : 'AwayFld',
                'HomeOffs' : 'HomeOffsides', 'AwayOffs' : 'AwayOffsides'})

In [None]:
# Order

In [None]:
# Define variables and rename
FullTeamProbit['Jornada'] = FullTeamProbit['Matchweek']

FullTeamProbit = FullTeamProbit.rename(columns = {'Home_Team' : 'hometeam', 
                                                  'Away_Team' : 'awayteam'})

In [None]:
# Define Year variable
season_to_year = {
    '9899': 1998, '9900': 1999, '0001': 2000, '0102': 2001, '0203': 2002, 
    '0304': 2003, '0405': 2004, '0506': 2005, '0607': 2006, '0708': 2007, 
    '0809': 2008, '0910': 2009, '1011': 2010, '1112': 2011, '1213': 2012, 
    '1314': 2013, '1415': 2014, '1516': 2015, '1617': 2016, '1718': 2017, 
    '1819': 2018, '1920': 2019, '2021': 2020, '2122': 2021
}

FullTeamProbit['Year'] = FullTeamProbit['Season'].map(season_to_year)

In [None]:
#before dropping it, display the obs
#FullTeamProbit = FullTeamProbit[(~FullTeamProbit['hometeam'].isna())]

In [None]:
# Save data
FullTeamProbit.to_excel('FullTeamProbit.xlsx')

In [None]:
# Merge data
Attendance9821R = pd.read_excel('Datasets/TRANSFERMKT/Attendance9821R.xlsx')
ODDS = pd.read_excel('Datasets/odds.xlsx')

###MERGE

In [None]:
# Keep relevant variables after merging and reorder and sort


In [None]:
# Rename variables for clarity
FullTeamProbit = FullTeamProbit.rename(columns = {'hometeam' : 'HomeTeam', 
                                    'awayteam' : 'AwayTeam', 'Attendance' : 'AttendanceTransfMkt'})

In [None]:
# Save data
FullTeamProbit.to_excel('Datasets/FullTeamProbit.xlsx')

# 

In [None]:
# Import data
Final1 = pd.read_excel('Datasets/FullTeamProbit.xlsx')

In [None]:
# Duplicate obs / Reshape

In [None]:
# Handle missing data for Fouls, Fld, and Offsides by setting them to missing if the year is before 2005
vars = ['Fouls', 'Fld', 'Offsides']

for var in vars:
    Final1[var] = np.where((Final1['Year'] < 2005), float['nan'], Final1[var])

In [None]:
# Generate variable for difference in bookmaker probabilities (iwh - iwa) based on Local status
Final1['DifBookMkr'] = np.where( (DifBookMkr['Local'] == 1), 
                                Final1['iwh'] - Final1['iwa'],
                                Final1['iwa'] - Final1['iwh'])

In [None]:
# Using the Buraimo methodology, generate variables for bookmaker probabilities adjusted by epsilon
Final1['SumBooker'] = Final1['iwa'] + Final1['iwd'] + Final1['iwh']
Final1['epsilon'] = Final1['SumBooker']-1

Final1['iwhe'] = Final1['iwh'] / Final1['epsilon']
Final1['iwde'] = Final1['iwd'] / Final1['epsilon']
Final1['iwae'] = Final1['iwa'] / Final1['epsilon']

In [None]:
# Generate difference in adjusted bookmaker probabilities based on Local status
Final1['DifBookMaker'] = np.where( (DifBookMkr['Local'] == 1), 
                                Final1['iwhe'] - Final1['iwae'],
                                Final1['iwae'] - Final1['iwhe'])

In [None]:
# Generate the square of the quality proxy variables
variables = ['DifMktValue', 'DifBookMkr', 'DifBookMaker']

for var in variables:
    aux = 'Sq' + var
    
    Final1[aux] = Final1[var]^2

In [None]:
# Generate a variable to indicate if any payments were made during the years 2001-2017
Final1['TdePagos'] = np.where( (Final1['Year'] > 2000) & (Final1['Year'] < 2018),
                             1, 0)

# GENERATE TREATMENT VARIABLES

In [None]:
# Variable indicating if the team is Barcelona (Equipo == 6)
Final1['Barca'] = np.where((Final1['Equipo'] == 6), 1, 0)

# Variable indicating if the team is either Barcelona or Real Madrid (Equipo == 6 or 32)
Final1['BarcaRM'] = np.where((Final1['Equipo'] == 6) | (Final1['Equipo'] == 32),
                             1, 0)

# Generate variable indicating whether a club paid the CTA (1 for Barcelona during payment years)
Final1['Bribe'] = np.where( (Final1['Barca'] == 1) & ((2000 < Final1['Year'] < 2018)), 
                          1, 0)

In [None]:
Final1.to_excel('Datasets/Final1.xlsx')

# Attendance

In [25]:
# Once performed this once, there's no need to perform this again as it is time consuming

dataframes = {}

for i in range(1, 101):
    # Define the path file and file path
    filename = f'FTBLNP{i}.csv'
    filepath = os.path.join(r'Data\FootballLineups\Attendance', filename)

    # Import data
    AttendanceFTBLNP = pd.read_csv(filepath, encoding='ISO-8859-1')
    
    dataframes[filename] = AttendanceFTBLNP
    
#print(dataframes.keys())

In [26]:
# Concat dictionary entries in one dataframe
AttendanceFTBLNP = pd.concat(dataframes.values(), ignore_index = True)
AttendanceFTBLNP.reset_index(inplace = True)
AttendanceFTBLNP = AttendanceFTBLNP.drop(columns = 'Unnamed: 0')
len(AttendanceFTBLNP)

8007

In [28]:
AuxAttendance = AttendanceFTBLNP['Text'].str.split(' - ', expand = True)
AuxAttendance.reset_index(inplace = True)

In [30]:
AttendanceFTBLNP = pd.merge(AttendanceFTBLNP, AuxAttendance, on = 'index',
                           validate = 'one_to_one')

In [32]:
AttendanceFTBLNP = AttendanceFTBLNP[AttendanceFTBLNP[0] != 'character(0)']

AttendanceFTBLNP[1] = AttendanceFTBLNP[1].str.replace(' Matchday', 'La Liga (Matchday')

In [33]:
rango = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

for i in rango:
    aux1 = i + ' '
    auxi = i + ')'
    auxs = ' ' + i + ' '
    
    AttendanceFTBLNP[1] = AttendanceFTBLNP[1].str.replace(aux1, auxi)
    AttendanceFTBLNP[0] = AttendanceFTBLNP[0].str.replace(auxs, '')

In [43]:
# Generate Year data
AttendanceFTBLNP['Year'] = AttendanceFTBLNP[0].str[-10: -6]
AttendanceFTBLNP['Year'].astype(int)

0       2021
1       2021
2       2021
3       2021
4       2021
        ... 
8002    2019
8003    2019
8004    2019
8005    2019
8006    2019
Name: Year, Length: 7980, dtype: int32

In [45]:
# Generate Attendance data
AuxAttendance[3] = np.where( (~AuxAttendance[3].str.contains('Attendance')) | 
                              (~AuxAttendance[3].str.contains('Referee')), 
                               '', AuxAttendance[3] )

#CHANGE CONDITIONS, NOT WORKING

AuxAttendance[3] = AuxAttendance[3].str.replace('Attendance: ', '')

#JUST KEEP DIGITS
AuxAttendance[3] = AuxAttendance[3].str.replace(+t, '')

AuxAttendance[3].astype(int)

# RENAME

TypeError: bad operand type for unary ~: 'NoneType'

In [74]:
# Clean and define team variable 
AttendanceFTBLNP[0] = AttendanceFTBLNP[0].str.replace('La Liga ', '')
AttendanceFTBLNP[0] = AttendanceFTBLNP[0].str.replace(r'\d', '')
AttendanceFTBLNP[0] = AttendanceFTBLNP[0].str.replace('/', '')

# Teams
Teams = AttendanceFTBLNP[0].str.split(':', expand = True)
Teams.reset_index()

  AttendanceFTBLNP[0] = AttendanceFTBLNP[0].str.replace(r'\d', '')


Unnamed: 0,index,0,1
0,0,Barcelona,Villarreal
1,1,Real Sociedad,Atletico Madrid
2,2,Sevilla,Athletic Club
3,3,Deportivo Alaves,Cadiz
4,4,Granada CF,Espanyol de Barcelona
...,...,...,...
7975,8002,Getafe,Valencia
7976,8003,Levante,LeganÃ©s
7977,8004,Deportivo Alaves,Eibar
7978,8005,Barcelona,Levante


In [78]:
# Standarize the names of the teams
Teams[0].unique()


'Ã³' : 'ó',
'Ã±' : 'ñ',
'Ã\xa0' : 'à',
'Ã©' : 'é'


array(['Barcelona', 'Real Sociedad', 'Sevilla', 'Deportivo Alaves',
       'Granada CF', 'Osasuna', 'Elche CF', 'Valencia', 'Real Madrid',
       'Rayo Vallecano', 'Athletic Club', 'Atletico Madrid', 'Real Betis',
       'Celta Vigo', 'Mallorca', 'Getafe', 'Cadiz', 'Levante',
       'Villarreal', 'Espanyol de Barcelona', 'Eibar', 'Huesca',
       'Real Valladolid', 'LeganÃ©s', 'Girona', 'Malaga', 'Las Palmas',
       'Deportivo La CoruÃ±a', 'Sporting GijÃ³n', 'Real Madrid  ',
       'AlmerÃ\xada', 'CÃ³rdoba CF', 'Real Zaragoza',
       'Racing de Santander', 'Hercules', 'Xerez CD', 'Tenerife',
       'Numancia', 'Recreativo de Huelva', 'Real Murcia',
       'GimnÃ\xa0stic de Tarragona', 'Albacete'], dtype=object)

In [77]:
# Merge data to include the teams
AttendanceFTBLNP = pd.merge(AttendanceFTBLNP, Teams, on = 'index',
                           validate = 'one_to_one', indicator = True)

KeyError: 'index'

In [None]:
# We will need to expand the data and duplicate obs

In [None]:
# Encode data

In [None]:
# Adjust attendance variable
AttendanceFTBLNP[] = np.where( (AttendanceFTBLNP['Year'] == 2020) & (AttendanceFTBLNP['Matchweek'] < 37),
                             0, AttendanceFTBLNP[])

AttendanceFTBLNP[] = np.where( (AttendanceFTBLNP['Year'] == 2020) & ( AttendanceFTBLNP[].isna() ),
                             0, AttendanceFTBLNP[])

In [None]:
# Keep and order variables
AttendanceFTBLNP = AttendanceFTBLNP[AttendanceFTBLNP['Year', 'Matchweek', 'Equipo', AttendanceFTBLNPxxx]]

In [None]:
# Merge with other datasets

In [None]:
# Minor adjustments
FinalCorr['YellowContra'] = np.where(( FinalCorr[] == 'both' ),
                                     FinalCorr['YellowContra'] - FinalCorr['SecYCrdHOME'],
                                      FinalCorr['YellowContra'] )

FinalCorr['YellowContra'] = np.where(( FinalCorr[] == 'both' ),
                                     FinalCorr['YellowContra'] - FinalCorr['SecYCrdAWAY'],
                                      FinalCorr['YellowContra'] )

In [None]:
FinalCorr.reset_index()

In [60]:
AttendanceFTBLNP.to_excel('Datasets/AttendanceFTBLNP.xlsx')