In [2]:
# Import libraries
import pandas as pd
import os
import numpy as np

# Set working directory
os.chdir('C:\\Users\\aaron\\Desktop\\GitHub\\Thesis\\Python')

In [2]:
# Import data
PenaltiesConceded = pd.read_excel('Datasets/PenaltiesConceded.xlsx')
Merge1 = pd.read_excel('Datasets/Merge1.xlsx', dtype = {'Season' : str})

# Visualize columns
print(PenaltiesConceded.columns)
print(Merge1.columns)

Index(['Year', 'Matchday', 'Home_Team', 'Away_Team', 'HomeTotPKcon',
       'AwayTotPKcon', 'HomeTotPKatt', 'AwayTotPKatt', 'Partido'],
      dtype='object')
Index(['Year', 'Matchweek', 'Season', 'Home_Team', 'Away_Team', 'Partido',
       'Home_Score', 'Home_Yellow_Cards', 'Home_Red_Cards', 'Away_Score',
       'Away_Yellow_Cards', 'Away_Red_Cards', 'HomeOff', 'AwayOff',
       'HomePKcon', 'AwayPKcon', 'HomePK', 'AwayPK', 'HomePKatt', 'AwayPKatt',
       'Date', 'Home_Fouls', 'Away_Fouls', 'AwayFld', 'HomeFld', 'FinalResult',
       'HomeEquipo', 'HomeAnnual Wages EUR', 'HomeMktValue', 'AwayEquipo',
       'AwayAnnual Wages EUR', 'AwayMktValue'],
      dtype='object')


In [3]:
print(PenaltiesConceded['Year'].unique())
print(Merge1['Season'].unique())

[2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 1999 2000 2001 1998
 2003 2002]
['9899' '9900' '0001' '0102' '0203' '0304' '0405' '0506' '0607' '0708'
 '0809' '0910' '1011' '1112' '1213' '1314' '1415' '1516' '1617' '1718'
 '1819' '1920' '2021' '2122']


In [4]:
# Merge data
FullTeamProbit = pd.merge(PenaltiesConceded, Merge1, on = ['Partido', 'Year'],
                         indicator = True, validate = 'many_to_many', how = 'outer')

FullTeamProbit = FullTeamProbit.drop(columns = ['Home_Team_x', 'Away_Team_x'])
FullTeamProbit = FullTeamProbit.rename(columns = {'Home_Team_y' : 'Home_Team', 'Away_Team_y' : 'Away_Team'})

FullTeamProbit = FullTeamProbit.sort_values(by = ['Year', 'Matchweek'])

In [5]:
# Set penalty conceded and penalty attempted variables to 0 for years before 2014
vars = ['HomePKcon', 'AwayPKcon', 'HomePKatt', 'AwayPKatt']

for var in vars:
    FullTeamProbit[var] = np.where((FullTeamProbit['Year'] < 2014), 
                             0, FullTeamProbit[var])
    
# Replace missing values for total penalties conceded and attempted with 0
vars = ['HomeTotPKcon', 'AwayTotPKcon', 'HomeTotPKatt', 'AwayTotPKatt']

for var in vars:
    FullTeamProbit[var] = np.where((FullTeamProbit[var].isna()),
                                  0, FullTeamProbit[var])

In [6]:
# Generate combined penalty concession and attempt variables for home and away teams
FullTeamProbit['HomePenConc'] = FullTeamProbit['HomeTotPKcon'] + FullTeamProbit['HomePKcon']
FullTeamProbit['AwayPenConc'] = FullTeamProbit['AwayTotPKcon'] + FullTeamProbit['AwayPKcon']
FullTeamProbit['HomePenAtt'] = FullTeamProbit['HomeTotPKatt'] + FullTeamProbit['HomePKatt']
FullTeamProbit['AwayPenAtt'] = FullTeamProbit['AwayTotPKatt'] + FullTeamProbit['AwayPKatt']

In [7]:
FullTeamProbit.columns

Index(['Year', 'Matchday', 'HomeTotPKcon', 'AwayTotPKcon', 'HomeTotPKatt',
       'AwayTotPKatt', 'Partido', 'Matchweek', 'Season', 'Home_Team',
       'Away_Team', 'Home_Score', 'Home_Yellow_Cards', 'Home_Red_Cards',
       'Away_Score', 'Away_Yellow_Cards', 'Away_Red_Cards', 'HomeOff',
       'AwayOff', 'HomePKcon', 'AwayPKcon', 'HomePK', 'AwayPK', 'HomePKatt',
       'AwayPKatt', 'Date', 'Home_Fouls', 'Away_Fouls', 'AwayFld', 'HomeFld',
       'FinalResult', 'HomeEquipo', 'HomeAnnual Wages EUR', 'HomeMktValue',
       'AwayEquipo', 'AwayAnnual Wages EUR', 'AwayMktValue', '_merge',
       'HomePenConc', 'AwayPenConc', 'HomePenAtt', 'AwayPenAtt'],
      dtype='object')

In [8]:
# Keep relevant variables and order
FullTeamProbit = FullTeamProbit[['Home_Team', 'Away_Team', 'Partido', 'Matchweek', 'Season', 
                                 'Home_Score', 'Home_Yellow_Cards', 'Home_Red_Cards', 'Away_Score',
                                 'Away_Yellow_Cards', 'Away_Red_Cards', 'HomePenConc', 'AwayPenConc',
                                 'HomePenAtt', 'AwayPenAtt', 'HomeOff', 'AwayOff', 'Date', 'Home_Fouls',
                                 'Away_Fouls', 'AwayFld', 'HomeFld', 'FinalResult', 'HomeEquipo',
                                 'HomeAnnual Wages EUR', 'HomeMktValue', 'AwayEquipo', 'AwayAnnual Wages EUR', 'AwayMktValue']]

In [9]:
# Import data
MergeFootballLineups = pd.read_excel('Datasets/FootballLineups/MergeFootballLineups.xlsx', 
                                     dtype = {'Season' : str})

In [10]:
# Merge
FullTeamProbit = pd.merge(FullTeamProbit, MergeFootballLineups, on = ['Partido', 'Season'],
                         how = 'outer', validate = 'many_to_many', indicator = True)

In [13]:
FullTeamProbit.columns

Index(['Home_Team', 'Away_Team', 'Partido', 'Matchweek', 'Season',
       'Home_Score', 'Home_Yellow_Cards', 'Home_Red_Cards', 'Away_Score',
       'Away_Yellow_Cards', 'Away_Red_Cards', 'HomePenConc', 'AwayPenConc',
       'HomePenAtt', 'AwayPenAtt', 'HomeOff', 'AwayOff', 'Date',
       'Home_Fouls_x', 'Away_Fouls_x', 'AwayFld', 'HomeFld', 'FinalResult',
       'HomeEquipo', 'HomeAnnual Wages EUR', 'HomeMktValue', 'AwayEquipo',
       'AwayAnnual Wages EUR', 'AwayMktValue', 'url', 'hometeam', 'awayteam',
       'Home_Fouls_y', 'Home_Offsides', 'Home_Possession', 'Away_Fouls_y',
       'Away_Offsides', 'Away_Possession', 'Home_Fld', 'Away_Fld', 'HomeFouls',
       'AwayFouls', '_merge'],
      dtype='object')

In [None]:
# Replace missing values for various game statistics with 0
variables = []
for var in variables:
    FullTeamProbit[var] = np.where( (FullTeamProbit[var].isna()),
                                  0, FullTeamProbit[var])

In [None]:
# Generate total offsides variables by summing individual offsides counts
FullTeamProbit['HomeOffs'] = FullTeamProbit['Home_Off'] + FullTeamProbit['HomeOffsides']
FullTeamProbit['AwayOffs'] = FullTeamProbit['Away_Off'] + FullTeamProbit['AwayOffsides']

In [None]:
# Drop vars

In [None]:
# Rename variables for clarity
FullTeamProbit = FullTeamProbit.rename(columns = {'Home_Score' : 'HomeGoals', 'Away_Score' : 'AwayGoals',
                'Home_Yellow_Cards' : 'HomeYellow', 'Away_Yellow_Cards' : 'AwayYellow',
                'Home_Red_Cards' : 'HomeRed', 'Away_Red_Cards' : 'AwayRed',
                'Home_Fouls' : 'HomeFouls', 'Away_Fouls' : 'AwayFouls',
                'Home_Fld' : 'HomeFld', 'Away_Fld' : 'AwayFld',
                'HomeOffs' : 'HomeOffsides', 'AwayOffs' : 'AwayOffsides'})

In [None]:
# Order

In [None]:
# Define variables and rename
FullTeamProbit['Jornada'] = FullTeamProbit['Matchweek']

FullTeamProbit = FullTeamProbit.rename(columns = {'Home_Team' : 'hometeam', 
                                                  'Away_Team' : 'awayteam'})

In [None]:
# Define Year variable
season_to_year = {
    '9899': 1998, '9900': 1999, '0001': 2000, '0102': 2001, '0203': 2002, 
    '0304': 2003, '0405': 2004, '0506': 2005, '0607': 2006, '0708': 2007, 
    '0809': 2008, '0910': 2009, '1011': 2010, '1112': 2011, '1213': 2012, 
    '1314': 2013, '1415': 2014, '1516': 2015, '1617': 2016, '1718': 2017, 
    '1819': 2018, '1920': 2019, '2021': 2020, '2122': 2021
}

FullTeamProbit['Year'] = FullTeamProbit['Season'].map(season_to_year)

In [None]:
#before dropping it, display the obs
#FullTeamProbit = FullTeamProbit[(~FullTeamProbit['hometeam'].isna())]

In [11]:
# Save data
FullTeamProbit.to_excel('FullTeamProbit.xlsx')

In [None]:
# Merge data
Attendance9821R = pd.read_excel('Datasets/TRANSFERMKT/Attendance9821R.xlsx')
ODDS = pd.read_excel('Datasets/odds.xlsx')

###MERGE

In [None]:
# Keep relevant variables after merging and reorder and sort


In [None]:
# Rename variables for clarity
FullTeamProbit = FullTeamProbit.rename(columns = {'hometeam' : 'HomeTeam', 
                                    'awayteam' : 'AwayTeam', 'Attendance' : 'AttendanceTransfMkt'})

In [None]:
# Save data
FullTeamProbit.to_excel('Datasets/FullTeamProbit.xlsx')

# 

In [None]:
# Import data
Final1 = pd.read_excel('Datasets/FullTeamProbit.xlsx')

In [None]:
# Duplicate obs / Reshape

In [None]:
# Handle missing data for Fouls, Fld, and Offsides by setting them to missing if the year is before 2005
vars = ['Fouls', 'Fld', 'Offsides']

for var in vars:
    Final1[var] = np.where((Final1['Year'] < 2005), float['nan'], Final1[var])

In [None]:
# Generate variable for difference in bookmaker probabilities (iwh - iwa) based on Local status
Final1['DifBookMkr'] = np.where( (DifBookMkr['Local'] == 1), 
                                Final1['iwh'] - Final1['iwa'],
                                Final1['iwa'] - Final1['iwh'])

In [None]:
# Using the Buraimo methodology, generate variables for bookmaker probabilities adjusted by epsilon
Final1['SumBooker'] = Final1['iwa'] + Final1['iwd'] + Final1['iwh']
Final1['epsilon'] = Final1['SumBooker']-1

Final1['iwhe'] = Final1['iwh'] / Final1['epsilon']
Final1['iwde'] = Final1['iwd'] / Final1['epsilon']
Final1['iwae'] = Final1['iwa'] / Final1['epsilon']

In [None]:
# Generate difference in adjusted bookmaker probabilities based on Local status
Final1['DifBookMaker'] = np.where( (DifBookMkr['Local'] == 1), 
                                Final1['iwhe'] - Final1['iwae'],
                                Final1['iwae'] - Final1['iwhe'])

In [1]:
# Generate the square of the quality proxy variables
variables = ['DifMktValue', 'DifBookMkr', 'DifBookMaker']

for var in variables:
    aux = 'Sq' + var
    
    Final1[aux] = Final1[var]^2

SqDifMktValue
SqDifBookMkr
SqDifBookMaker


In [None]:
# Generate a variable to indicate if any payments were made during the years 2001-2017
Final1['TdePagos'] = np.where( (Final1['Year'] > 2000) & (Final1['Year'] < 2018),
                             1, 0)

# GENERATE TREATMENT VARIABLES

In [None]:
# Variable indicating if the team is Barcelona (Equipo == 6)
Final1['Barca'] = np.where((Final1['Equipo'] == 6), 1, 0)

# Variable indicating if the team is either Barcelona or Real Madrid (Equipo == 6 or 32)
Final1['BarcaRM'] = np.where((Final1['Equipo'] == 6) | (Final1['Equipo'] == 32),
                             1, 0)

# Generate variable indicating whether a club paid the CTA (1 for Barcelona during payment years)
Final1['Bribe'] = np.where( (Final1['Barca'] == 1) & ((2000 < Final1['Year'] < 2018)), 
                          1, 0)

In [None]:
Final1.to_excel('Datasets/Final1.xlsx')

# Attendance

In [79]:
# Once performed this once, there's no need to perform this again as it is time consuming

dataframes = {}

for i in range(1, 101):
    # Define the path file and file path
    filename = f'FTBLNP{i}.csv'
    filepath = os.path.join(r'Data\FootballLineups\Attendance', filename)

    # Import data
    AttendanceFTBLNP = pd.read_csv(filepath, encoding='ISO-8859-1')
    
    dataframes[filename] = AttendanceFTBLNP
    
#print(dataframes.keys())

In [80]:
# Concat dictionary entries in one dataframe
AttendanceFTBLNP = pd.concat(dataframes.values(), ignore_index = True)

len(AttendanceFTBLNP)

8007

In [81]:
AuxAttendance = AttendanceFTBLNP['Text'].str.split(' - ', expand = True)

check how you can split it and adding the new columns to the og dataset

In [82]:
AuxAttendance = AuxAttendance[AuxAttendance[0] != 'character(0)']

AuxAttendance[1] = AuxAttendance[1].str.replace(' Matchday', 'La Liga (Matchday')

In [83]:
rango = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

for i in rango:
    aux1 = i + ' '
    auxi = i + ')'
    auxs = ' ' + i + ' '
    AuxAttendance[1] = AuxAttendance[1].str.replace(aux1, auxi)
    AuxAttendance[0] = AuxAttendance[0].str.replace(auxs, '')

In [None]:
# Generate Yeat data

In [84]:
# Generate Attendance data
AuxAttendance[3] = np.where( (~AuxAttendance[3].str.contains('Attendance')) | 
                              (~AuxAttendance[3].str.contains('Referee')), 
                               '', AuxAttendance[3] )

CHANGE CONDITIONS, NOT WORKING

AuxAttendance[3] = AuxAttendance[3].str.replace('Attendance: ', '')

#JUST KEEP DIGITS
AuxAttendance[3] = AuxAttendance[3].str.replace(+t, '')

AuxAttendance[3].astype(int)

# RENAME

TypeError: bad operand type for unary ~: 'NoneType'

In [78]:
AuxAttendance.to_excel('Datasets/AUXXX.xlsx')

In [29]:
AttendanceFTBLNP.to_excel('Datasets/AttendanceFTBLNP.xlsx')

PermissionError: [Errno 13] Permission denied: 'Datasets/AttendanceFTBLNP.xlsx'