In [1]:
# Import libraries
import pandas as pd
import os
import numpy as np

# Set working directory
os.chdir('C:\\Users\\aaron\\Desktop\\GitHub\\Thesis\\Python')

In [2]:
# Import data per year
time = ['9899', '9900', '0001', '0102', '0203', '0304', '0405', '0506', 
        '0607', '0708', '0809', '0910', '1011', '1112', '1213', '1314', 
        '1415', '1516', '1617', '1718', '1819', '1920', '2021', '2122']

stat = ['STANDARD', 'PORTERIA', 'EXTRA', 'REGULAR']

locvis = ['SQ', 'VS']

# Loop to create empty dictionaries dynamically
for s in stat:
    for lv in locvis:
        # Dynamically generate the variable name and assign an empty dictionary
        dict_name = f"{s.lower()}_{lv}"
        globals()[dict_name] = {}

# Iterate over the list of years, stats and home status
for styp in stat:
    for i in time:
        for local in locvis:
            
            # Define the filepath for the file to be imported
            filename = f'{styp} {local} {i}.xlsx'
            typecat = f'{styp}'            
            filepath = os.path.join(r'Data\FBREF', typecat, filename)  # Use os.path.join to construct the path
            
            #print(filepath)
            
            # Omit the case PORTERIA & 9899 as that data doesn't exist
            if i == '9899' and styp == 'PORTERIA':
                continue
            
            ########################
            if styp == 'STANDARD':
                stdd = pd.read_excel(filepath, header = 1)
                stdd = stdd[['Equipo', 'PJ', 'TP', 'TPint', 'TA', 'TR']]
                stdd = stdd.copy()
                stdd.loc[:, 'Season'] = f'{i}'
                
                
                if local == 'SQ':
                    standard_SQ[filename] = stdd
                    standard_SQ[filename].rename(columns = {'TP' : 'PENEXEC_favor', 'TPint' : 'TPint_favor', 
                                               'TA' : 'TA_contra', 'TR' : 'TR_contra'}, 
                                    inplace = True)
                else:
                    standard_VS[filename] = stdd
                    standard_VS[filename].rename(columns = {'TP' : 'PENEXEC_contra', 'TPint' : 'TPint_contra',
                                               'TA' : 'TA_favor', 'TR' : 'TR_favor'}, 
                                    inplace = True)
            
            ########################
            if styp == 'PORTERIA':
                pdd = pd.read_excel(filepath, header = 1)
                pdd = pdd[['Equipo', 'TPint', 'PD', 'Salvadas', 'PC']]
                pdd = pdd.copy()
                pdd.loc[:, 'Season'] = f'{i}'
                     
                if local == 'SQ':
                    porteria_SQ[filename] = pdd
                    porteria_SQ[filename].rename(columns = {'TPint' : 'TPint_contra', 'PD' : 'PENEXEC_contra', 
                                               'Salvadas' : 'PDet_contra', 'PC' : 'PFail_contra'}, 
                                    inplace = True)
                else:
                    porteria_VS[filename] = pdd
                    porteria_VS[filename].rename(columns = {'TPint' : 'TPint_favor', 'PD' : 'PENEXEC_favor', 
                                               'Salvadas' : 'PDet_favor', 'PC' : 'PFail_favor'}, 
                                    inplace = True)
                
            ########################
            if styp == 'EXTRA':
                edd = pd.read_excel(filepath, header = 1)
                edd = edd[['Equipo', 'TA', 'TR', '2a amarilla', 'Fls', 'FR', 'PA']]
                edd = edd.copy()
                edd.loc[:, 'Season'] = f'{i}'

                if local == 'SQ':
                    extra_SQ[filename] = edd
                    extra_SQ[filename].rename(columns = {'TA' : 'TA_contra', 'TR' : 'TR_contra', 
                                            '2a amarilla' : 'TA2_contra', 'Fls' : 'Fls_cometidasPor',
                                            'FR' : 'Fls_recibidasContra', 'PA' : 'Offside_contra'}, 
                                 inplace = True)
                else:
                    extra_VS[filename] = edd
                    extra_VS[filename].rename(columns = {'TA' : 'TA_favor', 'TR' : 'TR_favor', 
                                            '2a amarilla' : 'TA2_favor', 'Fls' : 'Fls_cometidasContra',
                                            'FR' : 'Fls_recibidasPor', 'PA' : 'Offside_favor'}, 
                                 inplace = True)
                    
            ########################
            if styp == 'REGULAR':
                rdd = pd.read_excel(filepath)
                rdd = rdd[['RL', 'Equipo', 'PJ', 'PG', 'PE', 'PP', 'DG']]
                rdd = rdd.copy()
                rdd.loc[:, 'Season'] = f'{i}'
                
                if local == 'SQ':
                    regular_SQ[filename] = rdd
                else:
                    regular_VS[filename] = rdd
                
                

In [3]:
# Loop to Concatenate all the keys in the dictionaries in just one dictionary
for s in stat:
    for lv in locvis:
        # Generate the dictionary name dynamically
        dict_name = f"{s.lower()}_{lv}"
        
        # Retrieve the actual dictionary from globals()
        data_dict = globals()[dict_name]
        
        # Concatenate the values of the dictionary (assuming the dictionary holds DataFrames)
        globals()[dict_name] = pd.concat(data_dict.values(), ignore_index=True)

In [4]:
# Function to clean the names of the teams
def clean_equipo_column(df):
    replacements = {
        'Ã¡': 'á', 'Ã ': 'à', 'Ã©': 'é', 'Ã­': 'í', 'Ã³': 'ó', 
        'Ã±': 'ñ', 'GimnÃ stic': 'Gimnàstic',
        'Deportivo La Coruña': 'La Coruña', 'Real Betis': 'Betis',
        'vs. ': ''
    }
    
    for old_value, new_value in replacements.items():
        df['Equipo'] = df['Equipo'].str.replace(old_value, new_value, regex = True)

# Apply the function to the dataframes
dataframes = [standard_SQ, porteria_SQ, extra_SQ, regular_SQ, standard_VS, porteria_VS, extra_VS, regular_VS]
for df in dataframes:
    clean_equipo_column(df)

In [5]:
def merge_stats(dataframes):
    # Start with the first DataFrame in the list
    merged_df = dataframes[0]
    
    # Merge with the remaining DataFrames
    for df in dataframes[1:]:
        merged_df = pd.merge(merged_df, df, on=['Equipo', 'Season'], how = "outer")
    
    return merged_df

# List of DataFrames to merge for SQ and VS
sq_dataframes = [standard_SQ, porteria_SQ, extra_SQ, regular_SQ]
vs_dataframes = [standard_VS, porteria_VS, extra_VS, regular_VS]

# Apply the merge function
stats_SQ = merge_stats(sq_dataframes)
stats_VS = merge_stats(vs_dataframes)

In [6]:
print(stats_SQ)

              Equipo  PJ_x  PENEXEC_favor  TPint_favor  TA_contra_x  \
0             Alavés    38            NaN          NaN          NaN   
1      Athletic Club    38            NaN          NaN          NaN   
2    Atlético Madrid    38            NaN          NaN          NaN   
3          Barcelona    38            NaN          NaN          NaN   
4         Celta Vigo    38            NaN          NaN          NaN   
..               ...   ...            ...          ...          ...   
475      Real Madrid    38            8.0         12.0         76.0   
476    Real Sociedad    38            9.0         10.0         77.0   
477          Sevilla    38            2.0          3.0         97.0   
478         Valencia    38            8.0          9.0        134.0   
479       Villarreal    38            5.0          6.0         81.0   

     TR_contra_x Season  TPint_contra  PENEXEC_contra  PDet_contra  ...  \
0            NaN   9899           NaN             NaN          NaN  ... 

In [7]:
# Generate Excel output
stats_SQ.to_excel('Datasets\STATS_SQ.xlsx')
stats_VS.to_excel('Datasets\STATS_VS.xlsx')

# Now we are gonna use Transfermkt data

In [8]:
# Import data
time = ['9899', '9900', '0001', '0102', '0203', '0304', '0405', '0506', 
        '0607', '0708', '0809', '0910', '1011', '1112', '1213', '1314', '1415']

ta = {}

for t in time:            
    ta[t] = pd.read_excel('Data\Transfermkt\TRANSFERMKT DATA.xlsx', sheet_name = t)
    ta[t]['Season'] = t

ta = pd.concat(ta.values(), ignore_index = True)
ta.drop(['Fouls', 'Points', 'Defence rate', 'Matches', '2TA+TR', 'TA2_favor'], axis = 1, inplace = True)

In [9]:
# Rename variables
ta.rename(columns = {'Club' : 'Equipo', 'TA' : 'TA_contra', 'TR' : 'TR_contra',
                    '2TA' : 'TA2_contra', 'Successful Conceded Penalties' : 'PENEXEC_contra',
                    'Conceded penalties' : 'TPint_contra', 'Penalties received' : 'TPint_favor',
                    'Penalties receiver Scored' : 'PENEXEC_favor', 'CaughtOffside' : 'Offside_contra',
                    'Fouled' : 'Fls_recibidasContra'}, inplace = True)

# Define Yellow Cards against variable
ta['TA_contra'] = ta['TA_contra'] + ta['TA2_contra']

In [10]:
# Standarize the name of the teams
def standard_teams(df):
    replacements = {
        'Albacete Balompié' : 'Albacete', 'Athletic Bilbao' : 'Athletic Club', 
        'Atlético de Madrid' : 'Atlético Madrid', 'CA Osasuna' : 'Osasuna', 
        'Cádiz CF' : 'Cádiz', 'CD Numancia' : 'Numancia', 'CD Tenerife' : 'Tenerife', 
        'Celta de Vigo' : 'Celta Vigo', 'CF Extremadura (- 2010)' : 'Extremadura', 
        'Córdoba CF' : 'Córdoba', 'Deportivo Alavés' : 'Alavés', 
        'Deportivo de La Coruña' : 'La Coruña', 'Elche CF' : 'Elche', 
        'FC Barcelona' : 'Barcelona', 'Getafe CF' : 'Getafe', 
        'Gimnàstic de Tarragona' : 'Gimnàstic', 'Granada CF' : 'Granada', 
        'Hércules CF' : 'Hércules', 'Levante UD' : 'Levante', 'Málaga CF' : 'Málaga', 
        'Racing Santander' : 'Racing Sant', 'RCD Espanyol Barcelona' : 'Espanyol', 
        'RCD Mallorca' : 'Mallorca', 'Real Betis Balompié' : 'Betis', 'Real Murcia CF' : 
        'Real Murcia', 'Real Oviedo' : 'Oviedo', 'Real Valladolid' : 'Valladolid', 
        'Real Zaragoza' : 'Zaragoza', 'Recreativo Huelva' : 'Recreativo', 
        'SD Eibar' : 'Eibar', 'Sevilla FC' : 'Sevilla', 'UD Almería' : 'Almería', 
        'UD Las Palmas' : 'Las Palmas', 'UD Salamanca (- 2013)' : 'Salamanca', 
        'Valencia CF' : 'Valencia', 'Valladolid CF' : 'Valladolid', 'Villarreal CF' : 
        'Villarreal', 'Xerez CD' : 'Xerez', 
        r'CF Extremadura\s*\(-\s*2010\)' : 'Extremadura',
        r'UD Salamanca\s*\(-\s*2013\)' : 'Salamanca',
    }
    
    for old_value, new_value in replacements.items():
        df['Equipo'] = df['Equipo'].str.replace(old_value, new_value, regex = True)

# Apply the function
standard_teams(ta)

# Check for correct team names
ta['Equipo'].unique()

array(['Mallorca', 'Extremadura', 'Barcelona', 'La Coruña', 'Real Madrid',
       'Oviedo', 'Valencia', 'Racing Sant', 'Zaragoza', 'Betis',
       'Valladolid', 'Athletic Club', 'Salamanca', 'Celta Vigo',
       'Espanyol', 'Atlético Madrid', 'Villarreal', 'Alavés',
       'Real Sociedad', 'Tenerife', 'Rayo Vallecano', 'Numancia',
       'Málaga', 'Sevilla', 'Osasuna', 'Las Palmas', 'Recreativo',
       'Albacete', 'Real Murcia', 'Levante', 'Getafe', 'Cádiz',
       'Gimnàstic', 'Almería', 'Sporting Gijón', 'Xerez', 'Hércules',
       'Granada', 'Elche', 'Eibar', 'Córdoba'], dtype=object)

In [11]:
# Drop variables
ta.drop(['TA_favor', 'TR_favor'], axis = 1, inplace = True)

szn_transfermkt_9814 = ta
szn_transfermkt_9814

Unnamed: 0,#,Equipo,TA_contra,TA2_contra,TR_contra,TPint_favor,PENEXEC_favor,Penalties received Missed,PENEXEC_contra,TPint_contra,Missed Conceded penalties,Season,Offside_contra,Fls_recibidasContra,Rating
0,1,Mallorca,82,2,0,6.0,6.0,0.0,3.0,3.0,0.0,9899,,,
1,2,Extremadura,75,1,2,2.0,2.0,0.0,2.0,4.0,2.0,9899,,,
2,3,Barcelona,79,6,1,8.0,7.0,1.0,3.0,3.0,0.0,9899,,,
3,4,La Coruña,85,3,2,4.0,3.0,1.0,4.0,4.0,0.0,9899,,,
4,5,Real Madrid,95,3,1,9.0,9.0,0.0,3.0,4.0,1.0,9899,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,16,Valencia,103,5,4,,,,4.0,8.0,4.0,1415,67.0,613.0,6.99
336,17,Espanyol,117,4,2,,,,0.0,1.0,1.0,1415,69.0,535.0,6.73
337,18,Celta Vigo,117,1,4,,,,5.0,7.0,2.0,1415,110.0,495.0,6.82
338,19,Málaga,109,3,5,,,,3.0,6.0,3.0,1415,125.0,509.0,6.70


# Merging Data

In [12]:
#We need to merge STATS_SQ with ta
SQF = pd.merge(stats_SQ, szn_transfermkt_9814, on = ['Equipo', 'Season'], how = "outer")
SQF

Unnamed: 0,Equipo,PJ_x,PENEXEC_favor_x,TPint_favor_x,TA_contra_x,TR_contra_x,Season,TPint_contra_x,PENEXEC_contra_x,PDet_contra,...,TR_contra,TPint_favor_y,PENEXEC_favor_y,Penalties received Missed,PENEXEC_contra_y,TPint_contra_y,Missed Conceded penalties,Offside_contra_y,Fls_recibidasContra_y,Rating
0,Alavés,38,,,,,9899,,,,...,4.0,7.0,6.0,1.0,1.0,1.0,0.0,,,
1,Athletic Club,38,,,,,9899,,,,...,3.0,7.0,5.0,2.0,4.0,4.0,0.0,,,
2,Atlético Madrid,38,,,,,9899,,,,...,3.0,9.0,7.0,2.0,5.0,5.0,0.0,,,
3,Barcelona,38,,,,,9899,,,,...,1.0,8.0,7.0,1.0,3.0,3.0,0.0,,,
4,Celta Vigo,38,,,,,9899,,,,...,2.0,6.0,6.0,0.0,4.0,4.0,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,Real Madrid,38,8.0,12.0,76.0,0.0,2122,5.0,3.0,97.0,...,,,,,,,,,,
476,Real Sociedad,38,9.0,10.0,77.0,4.0,2122,6.0,5.0,89.0,...,,,,,,,,,,
477,Sevilla,38,2.0,3.0,97.0,4.0,2122,4.0,2.0,90.0,...,,,,,,,,,,
478,Valencia,38,8.0,9.0,134.0,8.0,2122,8.0,7.0,112.0,...,,,,,,,,,,


In [13]:
# The following is to performm a check on which variables will need a further adjustment

# List of prefixes for the column groups we want to process
column_prefixes = ['PJ', 'PENEXEC_favor', 'TPint_favor', 'TA_contra', 'TR_contra', 
                   'TPint_contra', 'PENEXEC_contra', 'TA2_contra', 'Fls_recibidasContra',
                  'Offside_contra']

# Loop through each prefix
for prefix in column_prefixes:
    col_x = f'{prefix}_x'
    col_y = f'{prefix}_y'
    col_a = f'{prefix}_a'

    # Apply the np.where condition
    SQF[col_a] = np.where(SQF[col_x] == SQF[col_y], 1, 0)

In [14]:
# With this check... we realise that if the mean = min = max then var_x == var _y
# and we wont need to do further changes. Otherwise, we will.
SQF[['PJ_a', 'PENEXEC_favor_a','TPint_favor_a', 
     'TA_contra_a', 'TR_contra_a', 'TPint_contra_a', 
     'PENEXEC_contra_a', 'TA2_contra_a', 'Fls_recibidasContra_a',
    'Offside_contra_a']].describe()

# We realize that we won't need to do a check only for the PJ variable.

Unnamed: 0,PJ_a,PENEXEC_favor_a,TPint_favor_a,TA_contra_a,TR_contra_a,TPint_contra_a,PENEXEC_contra_a,TA2_contra_a,Fls_recibidasContra_a,Offside_contra_a
count,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0
mean,1.0,0.0,0.0,0.958333,0.958333,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.200035,0.200035,0.0,0.0,0.0,0.0,0.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# List of prefixes for the column groups we want to process
column_prefixes = ['PENEXEC_favor', 'TPint_favor', 'TA_contra', 'TR_contra', 
                   'TPint_contra', 'PENEXEC_contra', 'TA2_contra', 'Fls_recibidasContra',
                  'Offside_contra']

# Loop through each prefix
for prefix in column_prefixes:
    col_x = f'{prefix}_x'
    col_y = f'{prefix}_y'
    col_a = f'{prefix}_a'

    # Replace values on variable_x when they have a NA
    SQF[col_x] = np.where((SQF[col_x] != SQF[col_y]) & (SQF[col_x].isna()), SQF[col_y], SQF[col_x])
    
    # Drop auxiliary variables
    SQF.drop([col_a], axis = 1, inplace = True)
    SQF.drop([col_y], axis = 1, inplace = True)
    
    # Rename variables
    SQF.rename(columns = {col_x : prefix}, inplace = True)

# Drop and rename variables
SQF.drop(['PJ_y', 'PJ_a'], axis = 1, inplace = True)
SQF.rename(columns = {'PJ_x' : 'PJ'}, inplace = True)

# Define variables with NaN value
SQF[['TA_favor', 'TR_favor', 'TA2_favor']] = float('nan')

In [16]:
# Drop variables
SQF.drop(['PENEXEC_favor', 'PENEXEC_contra', 'Missed Conceded penalties',
         'Rating', 'Penalties received Missed', 'PDet_contra', 'PFail_contra', '#'],
        axis = 1, inplace = True)

In [17]:
# Check
SQF.to_excel('Datasets\STATS_updatedmerge.xlsx')
SQF.columns

Index(['Equipo', 'PJ', 'TPint_favor', 'TA_contra', 'TR_contra', 'Season',
       'TPint_contra', 'TA2_contra', 'Fls_cometidasPor', 'Fls_recibidasContra',
       'Offside_contra', 'RL', 'PG', 'PE', 'PP', 'DG', 'TA_contra',
       'TR_contra', 'TA_favor', 'TR_favor', 'TA2_favor'],
      dtype='object')

In [18]:
SQF

Unnamed: 0,Equipo,PJ,TPint_favor,TA_contra,TR_contra,Season,TPint_contra,TA2_contra,Fls_cometidasPor,Fls_recibidasContra,...,RL,PG,PE,PP,DG,TA_contra.1,TR_contra.1,TA_favor,TR_favor,TA2_favor
0,Alavés,38,7.0,,,9899,1.0,5.0,,,...,16,11,7,20,-27,116.0,4.0,,,
1,Athletic Club,38,7.0,,,9899,4.0,6.0,,,...,8,17,9,12,6,93.0,3.0,,,
2,Atlético Madrid,38,9.0,,,9899,5.0,7.0,,,...,13,12,10,16,4,103.0,3.0,,,
3,Barcelona,38,8.0,,,9899,3.0,6.0,,,...,1,24,7,7,44,79.0,1.0,,,
4,Celta Vigo,38,6.0,,,9899,4.0,2.0,,,...,5,17,13,8,28,107.0,2.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,Real Madrid,38,12.0,76.0,0.0,2122,5.0,0.0,394.0,474.0,...,1,26,8,4,49,,,,,
476,Real Sociedad,38,10.0,77.0,4.0,2122,6.0,1.0,491.0,465.0,...,6,17,11,10,3,,,,,
477,Sevilla,38,3.0,97.0,4.0,2122,4.0,2.0,462.0,462.0,...,4,18,16,4,23,,,,,
478,Valencia,38,9.0,134.0,8.0,2122,8.0,6.0,641.0,586.0,...,9,11,15,12,-5,,,,,


In [19]:
#xxx
#SQF[column_prefixes].isna().describe()
#Lets compare this with the stata data

# VS DATA

In [20]:
stats_VS.columns
#We need to make the same adjustments here than before

Index(['Equipo', 'PJ_x', 'PENEXEC_contra', 'TPint_contra', 'TA_favor_x',
       'TR_favor_x', 'Season', 'TPint_favor', 'PENEXEC_favor', 'PDet_favor',
       'PFail_favor', 'TA_favor_y', 'TR_favor_y', 'TA2_favor',
       'Fls_cometidasContra', 'Fls_recibidasPor', 'Offside_favor', 'RL',
       'PJ_y', 'PG', 'PE', 'PP', 'DG'],
      dtype='object')

In [21]:
# Keep relevant columns
stats_VS = stats_VS[['Equipo', 'Season', 'TA_favor_x', 'TR_favor_x', 'TA2_favor',
          'RL', 'PG', 'PE', 'PP', 'DG']]

stats_VS.rename(columns = {'TA_favor_x' : 'TA_favor', 'TR_favor_x' : 'TR_favor'}, inplace = True)
stats_VS.columns

Index(['Equipo', 'Season', 'TA_favor', 'TR_favor', 'TA2_favor', 'RL', 'PG',
       'PE', 'PP', 'DG'],
      dtype='object')

In [22]:
# Import the Cards and Fauls data
FAVOR_CARDSNFOULS = pd.read_excel('Datasets\FAVOR_CARDSNFOULS.xlsx', dtype = {'Season' : str})
FAVOR_CARDSNFOULS

Unnamed: 0,Equipo,Season,TA_favor,TR_favor,Fls_recibidasContra
0,Alavés,0506,93,3,809
1,Athletic Club,0506,101,9,671
2,Atlético Madrid,0506,103,6,733
3,Barcelona,0506,114,10,801
4,Betis,0506,105,6,723
...,...,...,...,...,...
155,Real Sociedad,1213,127,12,613
156,Sevilla,1213,119,8,537
157,Valencia,1213,103,12,538
158,Valladolid,1213,68,7,421


In [23]:
# Merge VS data with the Fouls and Cards data
VSF = pd.merge(stats_VS, FAVOR_CARDSNFOULS, on = ['Equipo', 'Season'], how = "outer")
VSF.columns

Index(['Equipo', 'Season', 'TA_favor_x', 'TR_favor_x', 'TA2_favor', 'RL', 'PG',
       'PE', 'PP', 'DG', 'TA_favor_y', 'TR_favor_y', 'Fls_recibidasContra'],
      dtype='object')

In [24]:
# Check
VSF['TA_favor_a'] = np.where(VSF['TA_favor_x'] == VSF['TA_favor_y'], 1, 0)
VSF['TR_favor_a'] = np.where(VSF['TR_favor_x'] == VSF['TR_favor_y'], 1, 0)

VSF[['TA_favor_a', 'TR_favor_a']].describe()
# We realize that we need to make the adjustment for noth variables

Unnamed: 0,TA_favor_a,TR_favor_a
count,480.0,480.0
mean,0.0,0.0
std,0.0,0.0
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,0.0,0.0


In [25]:
# Adjust values where there's NaN and drop aux variables
VSF['TA_favor_x'] = np.where((VSF['TA_favor_x'] != VSF['TA_favor_y']) & (VSF['TA_favor_x'].isna()), 
                             VSF['TA_favor_y'], VSF['TA_favor_x'])

VSF['TR_favor_x'] = np.where((VSF['TR_favor_x'] != VSF['TR_favor_y']) & (VSF['TR_favor_x'].isna()), 
                             VSF['TR_favor_y'], VSF['TR_favor_x'])

VSF.drop(['TA_favor_y', 'TR_favor_y', 'TA_favor_a', 'TR_favor_a'], axis = 1, inplace = True)
VSF.rename(columns = {'TA_favor_x' : 'TA_favor', 'TR_favor_x' : 'TR_favor'}, inplace = True)

VSF.columns

Index(['Equipo', 'Season', 'TA_favor', 'TR_favor', 'TA2_favor', 'RL', 'PG',
       'PE', 'PP', 'DG', 'Fls_recibidasContra'],
      dtype='object')

In [26]:
VSF.describe()

Unnamed: 0,TA_favor,TR_favor,TA2_favor,RL,PG,PE,PP,DG,Fls_recibidasContra
count,320.0,320.0,160.0,480.0,480.0,480.0,480.0,160.0,160.0
mean,98.28125,5.746875,2.475,10.5,9.008333,4.81875,5.172917,7.06875,623.05
std,14.618523,3.003914,1.704415,5.772297,3.236653,2.055204,2.504546,14.447529,82.360521
min,58.0,0.0,0.0,1.0,1.0,0.0,0.0,-23.0,421.0
25%,88.0,4.0,1.0,5.75,7.0,3.0,4.0,-2.0,562.75
50%,98.0,6.0,2.0,10.5,9.0,5.0,5.0,4.0,614.0
75%,109.0,7.25,4.0,15.25,11.0,6.0,7.0,13.0,689.25
max,139.0,19.0,9.0,20.0,18.0,13.0,13.0,54.0,809.0


# Merging both datasets SQF & VSF

In [27]:
# Only update columns from SQF that exist in both dataframes
common_cols = SQF.columns.intersection(VSF.columns)

# Update values in df with non-null values from sqf
SQF.update(VSF[common_cols])

SQVS = SQF

In [28]:
SQVS.columns

Index(['Equipo', 'PJ', 'TPint_favor', 'TA_contra', 'TR_contra', 'Season',
       'TPint_contra', 'TA2_contra', 'Fls_cometidasPor', 'Fls_recibidasContra',
       'Offside_contra', 'RL', 'PG', 'PE', 'PP', 'DG', 'TA_contra',
       'TR_contra', 'TA_favor', 'TR_favor', 'TA2_favor'],
      dtype='object')

In [29]:
SQVS['Year'] = np.where((SQVS['Season'] == '9899'), 1998,
                np.where((SQVS['Season'] == '9900'), 1999,
                np.where((SQVS['Season'] == '0001'), 2000,
                np.where((SQVS['Season'] == '0102'), 2001,
                np.where((SQVS['Season'] == '0203'), 2002,
                np.where((SQVS['Season'] == '0304'), 2003,
                np.where((SQVS['Season'] == '0405'), 2004,
                np.where((SQVS['Season'] == '0506'), 2005,
                np.where((SQVS['Season'] == '0607'), 2006,
                np.where((SQVS['Season'] == '0708'), 2007,
                np.where((SQVS['Season'] == '0809'), 2008,
                np.where((SQVS['Season'] == '0910'), 2009,
                np.where((SQVS['Season'] == '1011'), 2010,
                np.where((SQVS['Season'] == '1112'), 2011,
                np.where((SQVS['Season'] == '1213'), 2012,
                np.where((SQVS['Season'] == '1314'), 2013,
                np.where((SQVS['Season'] == '1415'), 2014,
                np.where((SQVS['Season'] == '1516'), 2015,
                np.where((SQVS['Season'] == '1617'), 2016,
                np.where((SQVS['Season'] == '1718'), 2017,
                np.where((SQVS['Season'] == '1819'), 2018,
                np.where((SQVS['Season'] == '1920'), 2019,
                np.where((SQVS['Season'] == '2021'), 2020,
                np.where((SQVS['Season'] == '2122'), 2021, SQVS['Season']))))))))))))))))))))))))

In [30]:
# Order columns
SQVS = SQVS.loc[:, ['RL', 'Equipo', 'Season', 'PJ', 
                    'TA_contra', 'TR_contra', 'TA2_contra', 
                    'TA_favor', 'TR_favor', 'TA2_favor',
                   'TPint_favor', 'TPint_contra', 'Fls_cometidasPor', 'Fls_recibidasContra',
                   'Offside_contra', 'PG', 'PE', 'PP', 'PG', 'DG', 'Year']]

SQVS.columns

Index(['RL', 'Equipo', 'Season', 'PJ', 'TA_contra', 'TA_contra', 'TR_contra',
       'TR_contra', 'TA2_contra', 'TA_favor', 'TR_favor', 'TA2_favor',
       'TPint_favor', 'TPint_contra', 'Fls_cometidasPor',
       'Fls_recibidasContra', 'Offside_contra', 'PG', 'PE', 'PP', 'PG', 'DG',
       'Year'],
      dtype='object')

In [31]:
# Adjust the repeated columns
SQVS.iloc[:, 4] = np.where(SQVS.iloc[:, 4].isna(), 
                           SQVS.iloc[:, 5], SQVS.iloc[:, 4])

SQVS.iloc[:, 6] = np.where(SQVS.iloc[:, 6].isna(), 
                           SQVS.iloc[:, 7], SQVS.iloc[:, 6])

print(SQVS.iloc[:, 4:8].isna().describe())

print(SQVS.columns)

       TA_contra TA_contra TR_contra TR_contra
count        480       480       480       480
unique         1         2         1         2
top        False     False     False     False
freq         480       340       480       340
Index(['RL', 'Equipo', 'Season', 'PJ', 'TA_contra', 'TA_contra', 'TR_contra',
       'TR_contra', 'TA2_contra', 'TA_favor', 'TR_favor', 'TA2_favor',
       'TPint_favor', 'TPint_contra', 'Fls_cometidasPor',
       'Fls_recibidasContra', 'Offside_contra', 'PG', 'PE', 'PP', 'PG', 'DG',
       'Year'],
      dtype='object')


In [32]:
# Save output
SQVS.to_excel('Datasets\SQVS.xlsx', index = False)

In [33]:
SQVS = pd.read_excel('Datasets\sqvs.xlsx')

# Drop repeated columns that are now redundant
SQVS.drop(['TA_contra.1', 'TR_contra.1', 'PG.1'], axis = 1, inplace = True)
SQVS.columns

Index(['RL', 'Equipo', 'Season', 'PJ', 'TA_contra', 'TR_contra', 'TA2_contra',
       'TA_favor', 'TR_favor', 'TA2_favor', 'TPint_favor', 'TPint_contra',
       'Fls_cometidasPor', 'Fls_recibidasContra', 'Offside_contra', 'PG', 'PE',
       'PP', 'DG', 'Year'],
      dtype='object')

In [34]:
# Minor adjustments
SQVS['Season'] = SQVS['Season'].astype(str)

SQVS['Season'] = np.where((SQVS['Season'] == '1'), '0001',
                np.where((SQVS['Season'] == '102'), '0102',
                np.where((SQVS['Season'] == '203'), '0203',
                np.where((SQVS['Season'] == '304'), '0304',
                np.where((SQVS['Season'] == '405'), '0405',
                np.where((SQVS['Season'] == '506'), '0506',
                np.where((SQVS['Season'] == '607'), '0607',
                np.where((SQVS['Season'] == '708'), '0708',
                np.where((SQVS['Season'] == '809'), '0809',
                np.where((SQVS['Season'] == '910'), '0910',
                SQVS['Season']))))))))))

SQVS['Equipo'] = SQVS['Equipo'].str.replace('La Coruña', 'Deportivo La Coruña', regex = True)
SQVS['Equipo'] = SQVS['Equipo'].str.replace('Racing Sant', 'Racing Santander')

# Save again
SQVS = pd.read_excel('Datasets\SQVS.xlsx')

In [35]:
SQVS['Equipo'].unique()

array(['Alavés', 'Athletic Club', 'Atlético Madrid', 'Barcelona',
       'Celta Vigo', 'La Coruña', 'Espanyol', 'Extremadura', 'Mallorca',
       'Oviedo', 'Racing Sant', 'Betis', 'Real Madrid', 'Real Sociedad',
       'Salamanca', 'Tenerife', 'Valencia', 'Valladolid', 'Villarreal',
       'Zaragoza', 'Málaga', 'Numancia', 'Rayo Vallecano', 'Sevilla',
       'Las Palmas', 'Osasuna', 'Recreativo', 'Albacete', 'Real Murcia',
       'Getafe', 'Levante', 'Cádiz', 'Gimnàstic', 'Almería',
       'Sporting Gijón', 'Xerez', 'Hércules', 'Granada', 'Elche',
       'Córdoba', 'Eibar', 'Leganés', 'Girona', 'Huesca'], dtype=object)