In [1]:
# Import libraries
import pandas as pd
import os
import numpy as np

# Set working directory
os.chdir('C:\\Users\\aaron\\Desktop\\GitHub\\Thesis\\Python')

In [2]:
# Import data, just one year
time = ['9899', '9900', '0001', '0102', '0203', '0304', '0405', '0506', 
        '0607', '0708', '0809', '0910', '1011', '1112', '1213', '1314', 
        '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122']

stat = ['STANDARD', 'PORTERIA', 'EXTRA', 'REGULAR']

locvis = ['SQ', 'VS']

# Loop to create empty dictionaries dynamically
for s in stat:
    for lv in locvis:
        # Dynamically generate the variable name and assign an empty dictionary
        dict_name = f"{s.lower()}_{lv}"
        globals()[dict_name] = {}

# Iterate over the list of years, stats and home status
for styp in stat:
    for i in time:
        for local in locvis:
            
            # Define the filepath for the file to be imported
            filename = f'{styp} {local} {i}.xlsx'
            typecat = f'{styp}'            
            filepath = os.path.join(r'Data\FBREF', typecat, filename)  # Use os.path.join to construct the path
            
            #print(filepath)
            
            # Omit the case PORTERIA & 9899 as that data doesn't exist
            if i == '9899' and styp == 'PORTERIA':
                continue
            
            ########################
            if styp == 'STANDARD':
                stdd = pd.read_excel(filepath, header = 1)
                stdd = stdd[['Equipo', 'PJ', 'TP', 'TPint', 'TA', 'TR']]
                stdd = stdd.copy()
                stdd.loc[:, 'Season'] = f'{i}'
                
                
                if local == 'SQ':
                    standard_SQ[filename] = stdd
                    standard_SQ[filename].rename(columns = {'TP' : 'PENEXEC_favor', 'TPint' : 'TPint_favor', 
                                               'TA' : 'TA_contra', 'TR' : 'TR_contra'}, 
                                    inplace = True)
                else:
                    standard_VS[filename] = stdd
                    standard_VS[filename].rename(columns = {'TP' : 'PENEXEC_contra', 'TPint' : 'TPint_contra',
                                               'TA' : 'TA_favor', 'TR' : 'TR_favor'}, 
                                    inplace = True)
            
            ########################
            if styp == 'PORTERIA':
                pdd = pd.read_excel(filepath, header = 1)
                pdd = pdd[['Equipo', 'TPint', 'PD', 'Salvadas', 'PC']]
                pdd = pdd.copy()
                pdd.loc[:, 'Season'] = f'{i}'
                     
                if local == 'SQ':
                    porteria_SQ[filename] = pdd
                    porteria_SQ[filename].rename(columns = {'TPint' : 'TPint_contra', 'PD' : 'PENEXEC_contra', 
                                               'Salvadas' : 'PDet_contra', 'PC' : 'PFail_contra'}, 
                                    inplace = True)
                else:
                    porteria_VS[filename] = pdd
                    porteria_VS[filename].rename(columns = {'TPint' : 'TPint_favor', 'PD' : 'PENEXEC_favor', 
                                               'Salvadas' : 'PDet_favor', 'PC' : 'PFail_favor'}, 
                                    inplace = True)
                
            ########################
            if styp == 'EXTRA':
                edd = pd.read_excel(filepath, header = 1)
                edd = edd[['Equipo', 'TA', 'TR', '2a amarilla', 'Fls', 'FR', 'PA']]
                edd = edd.copy()
                edd.loc[:, 'Season'] = f'{i}'

                if local == 'SQ':
                    extra_SQ[filename] = edd
                    extra_SQ[filename].rename(columns = {'TA' : 'TA_contra', 'TR' : 'TR_contra', 
                                            '2a amarilla' : 'TA2_contra', 'Fls' : 'Fls_cometidasPor',
                                            'FR' : 'Fls_recibidasContra', 'PA' : 'Offside_contra'}, 
                                 inplace = True)
                else:
                    extra_VS[filename] = edd
                    extra_VS[filename].rename(columns = {'TA' : 'TA_favor', 'TR' : 'TR_favor', 
                                            '2a amarilla' : 'TA2_favor', 'Fls' : 'Fls_cometidasContra',
                                            'FR' : 'Fls_recibidasPor', 'PA' : 'Offside_favor'}, 
                                 inplace = True)
                    
            ########################
            if styp == 'REGULAR':
                rdd = pd.read_excel(filepath)
                rdd = rdd[['RL', 'Equipo', 'PJ', 'PG', 'PE', 'PP', 'DG']]
                rdd = rdd.copy()
                rdd.loc[:, 'Season'] = f'{i}'
                
                if local == 'SQ':
                    regular_SQ[filename] = rdd
                else:
                    regular_VS[filename] = rdd
                
                

In [3]:
# Loop to Concatenate all the keys in the dictionaries in just one dictionary
for s in stat:
    for lv in locvis:
        # Generate the dictionary name dynamically
        dict_name = f"{s.lower()}_{lv}"
        
        # Retrieve the actual dictionary from globals()
        data_dict = globals()[dict_name]
        
        # Concatenate the values of the dictionary (assuming the dictionary holds DataFrames)
        globals()[dict_name] = pd.concat(data_dict.values(), ignore_index=True)

In [4]:
# Function to clean the names of the teams
def clean_equipo_column(df):
    replacements = {
        'Ã¡': 'á', 'Ã ': 'à', 'Ã©': 'é', 'Ã­': 'í', 'Ã³': 'ó', 
        'Ã±': 'ñ', 'GimnÃ stic': 'Gimnàstic',
        'Deportivo La Coruña': 'La Coruña', 'Real Betis': 'Betis',
        'vs. ': ''
    }
    
    for old_value, new_value in replacements.items():
        df['Equipo'] = df['Equipo'].str.replace(old_value, new_value, regex = True)

# Apply the function to the dataframes
dataframes = [standard_SQ, porteria_SQ, extra_SQ, regular_SQ, standard_VS, porteria_VS, extra_VS, regular_VS]
for df in dataframes:
    clean_equipo_column(df)

In [5]:
def merge_stats(dataframes):
    # Start with the first DataFrame in the list
    merged_df = dataframes[0]
    
    # Merge with the remaining DataFrames
    for df in dataframes[1:]:
        merged_df = pd.merge(merged_df, df, on=['Equipo', 'Season'], how = "outer")
    
    return merged_df

# List of DataFrames to merge for SQ and VS
sq_dataframes = [standard_SQ, porteria_SQ, extra_SQ, regular_SQ]
vs_dataframes = [standard_VS, porteria_VS, extra_VS, regular_VS]

# Apply the merge function
stats_SQ = merge_stats(sq_dataframes)
stats_VS = merge_stats(vs_dataframes)

In [6]:
print(stats_SQ)

              Equipo  PJ_x  PENEXEC_favor  TPint_favor  TA_contra_x  \
0             Alavés    38            NaN          NaN          NaN   
1      Athletic Club    38            NaN          NaN          NaN   
2    Atlético Madrid    38            NaN          NaN          NaN   
3          Barcelona    38            NaN          NaN          NaN   
4         Celta Vigo    38            NaN          NaN          NaN   
..               ...   ...            ...          ...          ...   
475      Real Madrid    38            8.0         12.0         76.0   
476    Real Sociedad    38            9.0         10.0         77.0   
477          Sevilla    38            2.0          3.0         97.0   
478         Valencia    38            8.0          9.0        134.0   
479       Villarreal    38            5.0          6.0         81.0   

     TR_contra_x Season  TPint_contra  PENEXEC_contra  PDet_contra  ...  \
0            NaN   9899           NaN             NaN          NaN  ... 

In [7]:
stats_SQ.to_excel('Datasets\STATS_SQ.xlsx')
stats_VS.to_excel('Datasets\STATS_VS.xlsx')

# Now we are gonna use Transfermkt data

In [8]:
# Import data
time = ['9899', '9900', '0001', '0102', '0203', '0304', '0405', '0506', 
        '0607', '0708', '0809', '0910', '1011', '1112', '1213', '1314', '1415']

ta = {}

for t in time:            
    ta[t] = pd.read_excel('Data\Transfermkt\TRANSFERMKT DATA.xlsx', sheet_name = t)
    ta[t]['Season'] = t

ta = pd.concat(ta.values(), ignore_index = True)
ta.drop(['Fouls', 'Points', 'Defence rate', 'Matches', '2TA+TR', 'TA2_favor'], axis = 1, inplace = True)

In [9]:
ta.rename(columns = {'Club' : 'Equipo', 'TA' : 'TA_contra', 'TR' : 'TR_contra',
                    '2TA' : 'TA2_contra', 'Successful Conceded Penalties' : 'PENEXEC_contra',
                    'Conceded penalties' : 'TPint_contra', 'Penalties received' : 'TPint_favor',
                    'Penalties receiver Scored' : 'PENEXEC_favor', 'CaughtOffside' : 'Offside_contra',
                    'Fouled' : 'Fls_recibidasContra'}, inplace = True)

ta['TA_contra'] = ta['TA_contra'] + ta['TA2_contra']

In [10]:
def standard_teams(df):
    replacements = {
        'Albacete Balompié' : 'Albacete', 'Athletic Bilbao' : 'Athletic Club', 
        'Atlético de Madrid' : 'Atlético Madrid', 'CA Osasuna' : 'Osasuna', 
        'Cádiz CF' : 'Cádiz', 'CD Numancia' : 'Numancia', 'CD Tenerife' : 'Tenerife', 
        'Celta de Vigo' : 'Celta Vigo', 'CF Extremadura (- 2010)' : 'Extremadura', 
        'Córdoba CF' : 'Córdoba', 'Deportivo Alavés' : 'Alavés', 
        'Deportivo de La Coruña' : 'La Coruña', 'Elche CF' : 'Elche', 
        'FC Barcelona' : 'Barcelona', 'Getafe CF' : 'Getafe', 
        'Gimnàstic de Tarragona' : 'Gimnàstic', 'Granada CF' : 'Granada', 
        'Hércules CF' : 'Hércules', 'Levante UD' : 'Levante', 'Málaga CF' : 'Málaga', 
        'Racing Santander' : 'Racing Sant', 'RCD Espanyol Barcelona' : 'Espanyol', 
        'RCD Mallorca' : 'Mallorca', 'Real Betis Balompié' : 'Betis', 'Real Murcia CF' : 
        'Real Murcia', 'Real Oviedo' : 'Oviedo', 'Real Valladolid' : 'Valladolid', 
        'Real Zaragoza' : 'Zaragoza', 'Recreativo Huelva' : 'Recreativo', 
        'SD Eibar' : 'Eibar', 'Sevilla FC' : 'Sevilla', 'UD Almería' : 'Almería', 
        'UD Las Palmas' : 'Las Palmas', 'UD Salamanca (- 2013)' : 'Salamanca', 
        'Valencia CF' : 'Valencia', 'Valladolid CF' : 'Valladolid', 'Villarreal CF' : 
        'Villarreal', 'Xerez CD' : 'Xerez'
    }
    
    for old_value, new_value in replacements.items():
        df['Equipo'] = df['Equipo'].str.replace(old_value, new_value, regex = True)

# Apply the function
standard_teams(ta)

ta['Equipo'].unique()
#EXTREMADURA
#SALAMANCA

array(['Mallorca', 'CF Extremadura (- 2010)', 'Barcelona', 'La Coruña',
       'Real Madrid', 'Oviedo', 'Valencia', 'Racing Sant', 'Zaragoza',
       'Betis', 'Valladolid', 'Athletic Club', 'UD Salamanca (- 2013)',
       'Celta Vigo', 'Espanyol', 'Atlético Madrid', 'Villarreal',
       'Alavés', 'Real Sociedad', 'Tenerife', 'Rayo Vallecano',
       'Numancia', 'Málaga', 'Sevilla', 'Osasuna', 'Las Palmas',
       'Recreativo', 'Albacete', 'Real Murcia', 'Levante', 'Getafe',
       'Cádiz', 'Gimnàstic', 'Almería', 'Sporting Gijón', 'Xerez',
       'Hércules', 'Granada', 'Elche', 'Eibar', 'Córdoba'], dtype=object)

In [11]:
ta.drop(['TA_favor', 'TR_favor'], axis = 1, inplace = True)

In [12]:
ta

Unnamed: 0,#,Equipo,TA_contra,TA2_contra,TR_contra,TPint_favor,PENEXEC_favor,Penalties received Missed,PENEXEC_contra,TPint_contra,Missed Conceded penalties,Season,Offside_contra,Fls_recibidasContra,Rating
0,1,Mallorca,82,2,0,6.0,6.0,0.0,3.0,3.0,0.0,9899,,,
1,2,CF Extremadura (- 2010),75,1,2,2.0,2.0,0.0,2.0,4.0,2.0,9899,,,
2,3,Barcelona,79,6,1,8.0,7.0,1.0,3.0,3.0,0.0,9899,,,
3,4,La Coruña,85,3,2,4.0,3.0,1.0,4.0,4.0,0.0,9899,,,
4,5,Real Madrid,95,3,1,9.0,9.0,0.0,3.0,4.0,1.0,9899,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,16,Valencia,103,5,4,,,,4.0,8.0,4.0,1415,67.0,613.0,6.99
336,17,Espanyol,117,4,2,,,,0.0,1.0,1.0,1415,69.0,535.0,6.73
337,18,Celta Vigo,117,1,4,,,,5.0,7.0,2.0,1415,110.0,495.0,6.82
338,19,Málaga,109,3,5,,,,3.0,6.0,3.0,1415,125.0,509.0,6.70


# Merging Data

In [13]:
#We need to merge STATS_SQ with ta
SQF = pd.merge(stats_SQ, ta, on = ['Equipo', 'Season'], how = "outer")
SQF
## WE NEED TO CORRECT

Unnamed: 0,Equipo,PJ_x,PENEXEC_favor_x,TPint_favor_x,TA_contra_x,TR_contra_x,Season,TPint_contra_x,PENEXEC_contra_x,PDet_contra,...,TR_contra,TPint_favor_y,PENEXEC_favor_y,Penalties received Missed,PENEXEC_contra_y,TPint_contra_y,Missed Conceded penalties,Offside_contra_y,Fls_recibidasContra_y,Rating
0,Alavés,38.0,,,,,9899,,,,...,4.0,7.0,6.0,1.0,1.0,1.0,0.0,,,
1,Athletic Club,38.0,,,,,9899,,,,...,3.0,7.0,5.0,2.0,4.0,4.0,0.0,,,
2,Atlético Madrid,38.0,,,,,9899,,,,...,3.0,9.0,7.0,2.0,5.0,5.0,0.0,,,
3,Barcelona,38.0,,,,,9899,,,,...,1.0,8.0,7.0,1.0,3.0,3.0,0.0,,,
4,Celta Vigo,38.0,,,,,9899,,,,...,2.0,6.0,6.0,0.0,4.0,4.0,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,Sevilla,38.0,2.0,3.0,97.0,4.0,2122,4.0,2.0,90.0,...,,,,,,,,,,
478,Valencia,38.0,8.0,9.0,134.0,8.0,2122,8.0,7.0,112.0,...,,,,,,,,,,
479,Villarreal,38.0,5.0,6.0,81.0,1.0,2122,5.0,4.0,91.0,...,,,,,,,,,,
480,CF Extremadura (- 2010),,,,,,9899,,,,...,2.0,2.0,2.0,0.0,2.0,4.0,2.0,,,


In [14]:
SQF.describe()

Unnamed: 0,PJ_x,PENEXEC_favor_x,TPint_favor_x,TA_contra_x,TR_contra_x,TPint_contra_x,PENEXEC_contra_x,PDet_contra,PFail_contra,TA_contra_y,...,TR_contra,TPint_favor_y,PENEXEC_favor_y,Penalties received Missed,PENEXEC_contra_y,TPint_contra_y,Missed Conceded penalties,Offside_contra_y,Fls_recibidasContra_y,Rating
count,480.0,460.0,460.0,460.0,460.0,140.0,140.0,460.0,140.0,460.0,...,340.0,20.0,20.0,20.0,339.0,339.0,339.0,120.0,120.0,120.0
mean,38.0,4.445652,5.715217,90.619565,5.465217,6.314286,4.8,125.584783,0.35,90.619565,...,2.897059,5.45,4.95,0.5,4.315634,5.20649,0.890855,101.458333,519.341667,6.847167
std,0.0,2.320526,2.684456,31.750814,3.118727,2.800529,2.45887,27.981619,0.561665,31.750814,...,1.742576,2.089447,1.959457,0.688247,2.140914,2.265999,1.021893,21.402464,53.19956,0.155678
min,38.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,0.0,0.0,...,0.0,2.0,1.0,0.0,0.0,1.0,0.0,58.0,391.0,6.59
25%,38.0,3.0,4.0,84.0,3.0,4.0,3.0,106.0,0.0,84.0,...,2.0,4.0,4.0,0.0,3.0,4.0,0.0,84.0,484.75,6.75
50%,38.0,4.0,6.0,98.0,6.0,6.0,5.0,123.0,0.0,98.0,...,3.0,5.5,5.0,0.0,4.0,5.0,1.0,100.0,515.5,6.815
75%,38.0,6.0,7.0,109.0,8.0,8.0,6.0,140.25,1.0,109.0,...,4.0,7.0,6.0,1.0,6.0,7.0,1.0,115.5,553.5,6.9
max,38.0,12.0,19.0,150.0,15.0,16.0,15.0,222.0,2.0,150.0,...,10.0,9.0,9.0,2.0,13.0,14.0,4.0,159.0,675.0,7.3


# VS DATA

In [15]:
stats_VS

Unnamed: 0,Equipo,PJ_x,PENEXEC_contra,TPint_contra,TA_favor_x,TR_favor_x,Season,TPint_favor,PENEXEC_favor,PDet_favor,...,TA2_favor,Fls_cometidasContra,Fls_recibidasPor,Offside_favor,RL,PJ_y,PG,PE,PP,DG
0,Alavés,38,,,,,9899,,,,...,,,,,16,19,10,4,5,
1,Athletic Club,38,,,,,9899,,,,...,,,,,8,19,10,5,4,
2,Atlético Madrid,38,,,,,9899,,,,...,,,,,13,19,8,7,4,
3,Barcelona,38,,,,,9899,,,,...,,,,,1,19,13,3,3,
4,Celta Vigo,38,,,,,9899,,,,...,,,,,5,19,12,6,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,Real Madrid,38,3.0,5.0,82.0,3.0,2122,12.0,8.0,171.0,...,1.0,501.0,370.0,30.0,1,19,13,5,1,31.0
476,Real Sociedad,38,5.0,6.0,95.0,2.0,2122,10.0,9.0,112.0,...,2.0,493.0,465.0,86.0,6,19,10,5,4,7.0
477,Sevilla,38,2.0,4.0,98.0,4.0,2122,3.0,2.0,90.0,...,2.0,497.0,441.0,53.0,4,19,12,6,1,19.0
478,Valencia,38,7.0,8.0,133.0,7.0,2122,9.0,8.0,92.0,...,4.0,603.0,616.0,97.0,9,19,6,8,5,2.0
