## Second round of processing ##

Here we will do another round of processing only this time we will take each of the values of the players, so we have for each attribute:
mean, min, max, sd.

Later we will split our modeling work into 2 ways:
1. Emphasis on explainability, using an RMD file
2. Emphasis on prediciton, using a jupyter notebook file

In [38]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

Since we already processed for each season seperatly the ratings, there will be no problems unifying them

In [39]:
def third_largest(row):
    sorted_row = sorted(row, reverse=True)
    return sorted_row[2] if len(sorted_row) >= 3 else None

def third_smallest(row):
    sorted_row = sorted(row)
    return sorted_row[2] if len(sorted_row) >= 3 else None

def get_data_cols(df, att, prefix):
    '''
    Gets an attribute, and adds columns to show mean, max, min and sd (ignoring zeros) for the first num_players players.

    Parameters:
    df : The dataframe
    att: which attribute (e.g., Weight)
    prefix: HomePlayer or AwayPlayer
    '''

    player_weight_cols = [col for col in df.columns if col.startswith(f"{prefix}") and col.endswith(f"_{att}")]
    att = att.replace('(', ' ')
    if not player_weight_cols:
        print('no col with', att)
        return df

    # Select only the columns corresponding to the first num_players players
    player_weight_cols_subset = player_weight_cols[:11]  # Selecting the first 11 players

    # Replace zeros with NaN
    df[player_weight_cols_subset] = df[player_weight_cols_subset].replace(0, np.nan)


    if(att == 'Overall'):
        df[f"{prefix}_{att}_max"] = df[player_weight_cols_subset].max(axis=1, skipna=True)
        df[f"{prefix}_{att}_min"] = df[player_weight_cols_subset].min(axis=1, skipna=True)
        df[f"{prefix}_{att}_sd"] = df[player_weight_cols_subset].std(axis=1, skipna=True)
        df[f"{prefix}_{att}_mean"] = df[player_weight_cols_subset].mean(axis=1, skipna=True)
        df[f"{prefix}_{att}_mean_ln"] = np.log(df[player_weight_cols_subset]).mean(axis=1, skipna=True)
        df[f"{prefix}_{att}_mean_sqrt"] = np.sqrt(df[player_weight_cols_subset]).mean(axis=1, skipna=True)
        df[f"{prefix}_3rd_best"] = df[player_weight_cols_subset].apply(third_largest, axis=1)
        df[f"{prefix}_3rd_worst"] = df[player_weight_cols_subset].apply(third_smallest, axis=1)


    # For bench players (i >= 12)
    bench_weight_cols = player_weight_cols[11:]  # Selecting players from index 12 and onwards

    # Replace zeros with NaN for bench players
    df[bench_weight_cols] = df[bench_weight_cols].replace(0, np.nan)

    if(' ' in att): # 2 worded attributes so instead of dribbling total we will have dribbling
        df[f"{prefix}_bench_{att.split()[0]}_mean"] = df[bench_weight_cols].mean(axis=1, skipna=True)
        df[f"{prefix}_{att.split()[0]}_mean"] = df[player_weight_cols_subset].mean(axis=1, skipna=True)

    else:
        df[f"{prefix}_bench_{att}_mean"] = df[bench_weight_cols].mean(axis=1, skipna=True)
        df[f"{prefix}_{att}_mean"] = df[player_weight_cols_subset].mean(axis=1, skipna=True)

    if(att == 'Overall'):
        df[f"{prefix}_bench_{att}_max"] = df[bench_weight_cols].max(axis=1, skipna=True)
        df[f"{prefix}_bench_{att}_min"] = df[bench_weight_cols].min(axis=1, skipna=True)
        df[f"{prefix}_bench_{att}_sd"] = df[bench_weight_cols].std(axis=1, skipna=True)
        df[f"{prefix}_bench_{att}_mean"] = df[player_weight_cols_subset].mean(axis=1, skipna=True)

    return df

In [40]:
df = pd.read_csv("Club_Capicities.csv")

In [42]:
def pipeline(train_arr, test_arr):
    train_df = pd.read_csv(train_arr[0])
    test_df = pd.read_csv(test_arr[0])
    print('started loading train and test')
    for i in range (1, len(train_arr)):
        train_df = pd.concat([train_df, pd.read_csv(train_arr[i])], ignore_index=True, axis=0)
    print("Finished loading train")
    for i in range (1, len(test_arr)):
        test_df = pd.concat([test_df, pd.read_csv(test_arr[i])], ignore_index=True, axis=0)
    print("Finished Loading test")
    att_list = ['Overall', 'Age', 'Height(in cm)', 'Weight(in kg)']
    numerical_cols = []
    statistic_list = ['mean', 'min', 'max', 'sd', 'median', '3rd_best', '3rd_worst']
    #statistic_list = ['mean' ,'sd']
    cat_cols=['home_team_name', 'away_team_name', 'home_score', 'away_score', 'home_formation', 'away_fromation']
    #binary_cols = ['Home_Adv_Team', 'Strong_Away'] ## From ar
    

    for att in att_list:
        get_data_cols(train_df, att, prefix='HomePlayer')
        get_data_cols(train_df, att, prefix='AwayPlayer')
        get_data_cols(test_df, att, prefix='HomePlayer')
        get_data_cols(test_df, att, prefix='AwayPlayer')
    
    Club_Capacities = pd.read_csv("Club_Capicities.csv")
    # Create a dictionary-like structure for capacities
    capacities_dict = dict(zip(Club_Capacities['Club'], Club_Capacities['capacity']))
    for index, row in train_df.iterrows():
        home_team_name = train_df.at[index, 'home_team_name']
        train_df.at[index, 'capacity'] = capacities_dict[home_team_name]
        test_df.at[index, 'capacity'] = capacities_dict[home_team_name]
    print('finished canonizing data')
    for col in train_df.columns:
        for s in statistic_list:
            if s in col:
                numerical_cols.append(col)

    cat_cols=['home_team_name', 'away_team_name', 'home_score', 'away_score', 'home_formation', 
              'away_fromation', 'home_GD_prior', 
              'home_Points_prior', 'home_GD_form', 'home_Points_form', 'home_GD_form_pw', 'home_Points_form_pw',
                'away_GD_prior', 'away_Points_prior', 'away_GD_form', 'away_GD_form_pw', 'away_Points_form_pw',
                'away_Points_form','Matchweek', 'home_points_to_championship',
                'home_points_to_ucl','home_points_to_rel','away_points_to_championship',
                'away_points_to_ucl','away_points_to_rel', 'home_match_importance', 'away_match_importance', 'B365A', 'B365D', 'B365A']

    train_df[numerical_cols + cat_cols].to_csv("MatchDB/train.csv")
    test_df[numerical_cols + cat_cols].to_csv("MatchDB/test.csv")

In [43]:
pipeline(['MatchDB/epl1516_proccessed_with_form.csv',
           'MatchDB/epl2122_proccessed_with_form.csv',
          'MatchDB/epl1819_proccessed_with_form.csv', 
          'MatchDB/epl1617_proccessed_with_form.csv',
          'MatchDB/epl1718_proccessed_with_form.csv',
            'MatchDB/epl1415_proccessed_with_form.csv',
            'MatchDB/epl1920_proccessed_with_form.csv',
              'MatchDB/epl2021_proccessed_with_form.csv'],
              [ 'MatchDB/epl2223_proccessed_with_form.csv'])

started loading train and test
Finished loading train
Finished Loading test
finished canonizing data


In [44]:
train = pd.read_csv("MatchDB/train.csv")

In [45]:
pd.read_csv("MatchDB/test.csv").shape

(3040, 64)

In [46]:
train[train.isna().any(axis=1)]

Unnamed: 0.1,Unnamed: 0,HomePlayer_Overall_max,HomePlayer_Overall_min,HomePlayer_Overall_sd,HomePlayer_Overall_mean,HomePlayer_Overall_mean_ln,HomePlayer_Overall_mean_sqrt,HomePlayer_3rd_best,HomePlayer_3rd_worst,HomePlayer_bench_Overall_mean,HomePlayer_bench_Overall_max,HomePlayer_bench_Overall_min,HomePlayer_bench_Overall_sd,AwayPlayer_Overall_max,AwayPlayer_Overall_min,AwayPlayer_Overall_sd,AwayPlayer_Overall_mean,AwayPlayer_Overall_mean_ln,AwayPlayer_Overall_mean_sqrt,AwayPlayer_3rd_best,AwayPlayer_3rd_worst,AwayPlayer_bench_Overall_mean,AwayPlayer_bench_Overall_max,AwayPlayer_bench_Overall_min,AwayPlayer_bench_Overall_sd,HomePlayer_bench_Age_mean,HomePlayer_Age_mean,AwayPlayer_bench_Age_mean,AwayPlayer_Age_mean,HomePlayer_bench_Height_mean,HomePlayer_Height_mean,AwayPlayer_bench_Height_mean,AwayPlayer_Height_mean,HomePlayer_bench_Weight_mean,HomePlayer_Weight_mean,AwayPlayer_bench_Weight_mean,AwayPlayer_Weight_mean,home_team_name,away_team_name,home_score,away_score,home_formation,away_fromation,home_GD_prior,home_Points_prior,home_GD_form,home_Points_form,home_GD_form_pw,home_Points_form_pw,away_GD_prior,away_Points_prior,away_GD_form,away_GD_form_pw,away_Points_form_pw,away_Points_form,Matchweek,home_points_to_championship,home_points_to_ucl,home_points_to_rel,away_points_to_championship,away_points_to_ucl,away_points_to_rel,home_match_importance,away_match_importance
533,533,83,67,4.803408,78.545455,4.361879,8.858662,82,77,78.545455,77.0,57.0,7.433034,79,74,1.439697,76.545455,4.337723,8.748673,77,75,76.545455,77.0,60.0,5.301991,26.111111,25.636364,21.000000,25.818182,183.888889,182.454545,179.000000,184.818182,78.333333,73.636364,72.000000,77.545455,Everton,Leeds United,3,0,4-4-2,,-13.0,19.0,-5.0,0.0,-0.714286,0.000000,-16.0,23.0,2.0,0.285714,1.000000,7.0,25,58.642857,38.642857,16.642857,58.357143,38.357143,16.357143,0.0,0.000000
641,641,90,74,4.490394,80.818182,4.390829,8.986792,83,79,80.818182,82.0,72.0,3.345810,81,66,3.935849,75.090909,4.317403,8.662721,77,74,75.090909,78.0,59.0,6.759767,24.000000,26.090909,21.222222,25.363636,183.777778,182.636364,182.555556,181.727273,78.444444,78.545455,73.777778,75.454545,Tottenham Hotspur,Leeds United,2,1,3-4-2-1,,-7.0,16.0,-2.0,7.0,-0.285714,1.000000,-7.0,11.0,1.0,0.142857,1.142857,8.0,12,25.407407,21.407407,9.407407,25.592593,21.592593,9.592593,0.0,0.000000
661,661,77,67,3.162278,74.000000,4.303209,8.600499,76,71,74.000000,76.0,67.0,2.877113,81,69,3.101319,76.272727,4.333543,8.731752,78,75,76.272727,75.0,60.0,5.099020,23.333333,25.181818,21.555556,25.000000,181.777778,183.272727,182.000000,182.545455,72.000000,76.000000,74.111111,75.090909,Norwich City,Leeds United,1,2,4-4-2,,-21.0,2.0,-11.0,2.0,-1.571429,0.285714,-8.0,7.0,-1.0,-0.142857,0.714286,5.0,10,21.931034,16.931034,6.931034,21.758621,16.758621,6.758621,0.0,0.000000
688,688,78,62,4.468069,73.818182,4.299804,8.587960,76,73,73.818182,77.0,66.0,3.674235,79,69,2.766685,75.363636,4.321697,8.679869,77,74,75.363636,75.0,60.0,4.594683,27.888889,24.090909,21.111111,25.363636,179.222222,181.818182,182.000000,182.000000,73.555556,73.090909,74.000000,75.272727,Southampton,Leeds United,1,0,4-4-2,,-5.0,4.0,-3.0,3.0,-0.428571,0.428571,-7.0,6.0,-3.0,-0.428571,0.714286,5.0,8,15.870968,13.870968,3.870968,15.806452,13.806452,3.806452,0.0,0.000000
730,730,83,75,2.572583,77.727273,4.352716,8.815223,79,76,77.727273,76.0,61.0,4.415880,81,74,2.000000,77.000000,4.343501,8.774295,78,75,77.000000,73.0,59.0,4.769696,29.777778,28.545455,21.111111,26.090909,183.777778,184.272727,178.555556,185.363636,78.555556,77.454545,70.888889,75.272727,Burnley,Leeds United,1,1,4-4-2,,-3.0,0.0,-3.0,-3.0,-1.000000,-1.000000,-4.0,1.0,-4.0,-1.333333,0.333333,1.0,3,6.000000,6.000000,0.000000,5.972222,5.972222,-0.027778,0.0,-0.027778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2677,2677,79,61,4.698162,74.545455,4.309402,8.629755,77,75,74.545455,80.0,59.0,8.724168,82,61,5.007267,73.454545,4.294431,8.565845,76,73,73.454545,75.0,62.0,4.884784,23.444444,24.636364,23.333333,26.454545,179.555556,182.909091,178.222222,182.000000,74.222222,74.727273,70.222222,75.090909,Southampton,Leeds United,0,2,4-4-2,,-16.0,43.0,1.0,7.0,0.142857,1.000000,4.0,53.0,4.0,0.571429,1.142857,8.0,37,61.500000,42.500000,16.500000,56.500000,37.500000,11.500000,0.0,0.000000
2684,2684,80,69,3.376389,76.000000,4.329812,8.715804,78,73,76.000000,77.0,65.0,4.496030,76,61,5.211875,71.181818,4.262654,8.431565,75,69,71.181818,82.0,62.0,6.207075,29.750000,27.090909,25.000000,24.636364,183.375000,182.636364,178.333333,182.818182,80.750000,75.818182,72.111111,75.000000,Burnley,Leeds United,0,4,4-4-2,,-14.0,39.0,2.0,6.0,0.285714,0.857143,0.0,50.0,1.0,0.142857,1.142857,8.0,36,67.000000,51.000000,24.000000,63.333333,47.333333,20.333333,0.0,0.000000
2792,2792,76,61,4.965334,70.636364,4.255178,8.399638,74,69,70.636364,75.0,55.0,7.921490,79,61,4.941476,75.272727,4.318925,8.671367,78,75,75.272727,80.0,59.0,7.483315,24.666667,24.363636,23.444444,25.636364,179.777778,184.454545,179.444444,182.909091,74.000000,76.181818,72.333333,77.727273,Leeds United,Southampton,3,0,,4-4-2,-3.0,32.0,1.0,9.0,0.142857,1.285714,-9.0,30.0,-14.0,-2.000000,0.142857,1.0,25,53.714286,39.714286,22.714286,53.857143,39.857143,22.857143,0.0,0.000000
2893,2893,82,62,4.948829,73.090909,4.289549,8.544759,76,71,73.090909,75.0,59.0,6.023104,82,62,5.363174,75.818182,4.325871,8.702106,79,74,75.818182,77.0,57.0,6.691620,24.111111,25.454545,26.222222,27.727273,177.444444,183.636364,186.000000,183.181818,72.000000,75.181818,78.888889,78.727273,Leeds United,Burnley,1,0,,4-4-2,-6.0,17.0,-3.0,6.0,-0.428571,0.857143,-11.0,13.0,-3.0,-0.428571,1.142857,8.0,15,30.291667,25.291667,11.291667,30.458333,25.458333,11.458333,0.0,0.000000
