## Second round of processing ##

Here we will do another round of processing only this time we will take each of the values of the players, so we have for each attribute:
mean, min, max, sd.

Later we will split our modeling work into 2 ways:
1. Emphasis on explainability, using an RMD file
2. Emphasis on prediciton, using a jupyter notebook file

In [5]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

Since we already processed for each season seperatly the ratings, there will be no problems unifying them

In [6]:
def third_largest(row):
    sorted_row = sorted(row, reverse=True)
    return sorted_row[2] if len(sorted_row) >= 3 else None

def third_smallest(row):
    sorted_row = sorted(row)
    return sorted_row[2] if len(sorted_row) >= 3 else None

def get_data_cols(df, att, prefix):
    '''
    Gets an attribute, and adds columns to show mean, max, min and sd (ignoring zeros) for the first num_players players.

    Parameters:
    df : The dataframe
    att: which attribute (e.g., Weight)
    prefix: HomePlayer or AwayPlayer
    '''

    player_weight_cols = [col for col in df.columns if col.startswith(f"{prefix}") and col.endswith(f"_{att}")]
    att = att.replace('(', ' ')
    if not player_weight_cols:
        print('no col with', att)
        return df

    # Select only the columns corresponding to the first num_players players
    player_weight_cols_subset = player_weight_cols[:11]  # Selecting the first 11 players

    # Replace zeros with NaN
    df[player_weight_cols_subset] = df[player_weight_cols_subset].replace(0, np.nan)


    if(att == 'Overall'):
        df[f"{prefix}_{att}_max"] = df[player_weight_cols_subset].max(axis=1, skipna=True)
        df[f"{prefix}_{att}_min"] = df[player_weight_cols_subset].min(axis=1, skipna=True)
        df[f"{prefix}_{att}_sd"] = df[player_weight_cols_subset].std(axis=1, skipna=True)
        df[f"{prefix}_{att}_mean"] = df[player_weight_cols_subset].mean(axis=1, skipna=True)
        df[f"{prefix}_{att}_mean_ln"] = np.log(df[player_weight_cols_subset]).mean(axis=1, skipna=True)
        df[f"{prefix}_{att}_mean_sqrt"] = np.sqrt(df[player_weight_cols_subset]).mean(axis=1, skipna=True)
        df[f"{prefix}_3rd_best"] = df[player_weight_cols_subset].apply(third_largest, axis=1)
        df[f"{prefix}_3rd_worst"] = df[player_weight_cols_subset].apply(third_smallest, axis=1)


    # For bench players (i >= 12)
    bench_weight_cols = player_weight_cols[11:]  # Selecting players from index 12 and onwards

    # Replace zeros with NaN for bench players
    df[bench_weight_cols] = df[bench_weight_cols].replace(0, np.nan)

    if(' ' in att): # 2 worded attributes so instead of dribbling total we will have dribbling
        df[f"{prefix}_bench_{att.split()[0]}_mean"] = df[bench_weight_cols].mean(axis=1, skipna=True)
        df[f"{prefix}_{att.split()[0]}_mean"] = df[player_weight_cols_subset].mean(axis=1, skipna=True)

    else:
        df[f"{prefix}_bench_{att}_mean"] = df[bench_weight_cols].mean(axis=1, skipna=True)
        df[f"{prefix}_{att}_mean"] = df[player_weight_cols_subset].mean(axis=1, skipna=True)

    if(att == 'Overall'):
        df[f"{prefix}_bench_{att}_max"] = df[bench_weight_cols].max(axis=1, skipna=True)
        df[f"{prefix}_bench_{att}_min"] = df[bench_weight_cols].min(axis=1, skipna=True)
        df[f"{prefix}_bench_{att}_sd"] = df[bench_weight_cols].std(axis=1, skipna=True)
        df[f"{prefix}_bench_{att}_mean"] = df[player_weight_cols_subset].mean(axis=1, skipna=True)

    return df

In [7]:
def pipeline(train_arr, test_arr):
    train_df = pd.read_csv(train_arr[0])
    test_df = pd.read_csv(test_arr[0])
    print('started loading train and test')
    for i in range (1, len(train_arr)):
        train_df = pd.concat([train_df, pd.read_csv(train_arr[i])], ignore_index=True, axis=0)
    print("Finished loading train")
    for i in range (1, len(test_arr)):
        test_df = pd.concat([test_df, pd.read_csv(test_arr[i])], ignore_index=True, axis=0)
    print("Finished Loading test")
    att_list = ['Overall']
    numerical_cols = []
    statistic_list = ['mean', 'min', 'max', 'sd', 'median', '3rd_best', '3rd_worst']
    #statistic_list = ['mean' ,'sd']
    #binary_cols = ['Home_Adv_Team', 'Strong_Away'] ## From ar
    

    for att in att_list:
        get_data_cols(train_df, att, prefix='HomePlayer')
        get_data_cols(train_df, att, prefix='AwayPlayer')
        get_data_cols(test_df, att, prefix='HomePlayer')
        get_data_cols(test_df, att, prefix='AwayPlayer')
    

    for col in train_df.columns:
        for s in statistic_list:
            if s in col:
                numerical_cols.append(col)

    cat_cols=['home_team_name', 'away_team_name', 'home_score', 'away_score', 'home_GD_prior', 
              'home_Points_prior', 'home_GD_form', 'home_Points_form',
                'away_GD_prior', 'away_Points_prior', 'away_GD_form', 'league',
                'away_Points_form','Matchweek', 'B365A', 'B365D', 'B365H']

    train_df[numerical_cols + cat_cols].to_csv("train.csv")
    test_df[numerical_cols + cat_cols].to_csv("test.csv")

In [8]:
train_years = ['1415','1516','1617', '1718','1819','1920','2021','2122']
test_years = ['2223']
train = []
test = []
for trainy in train_years:
    train.append('season_data/season'+trainy+'_proccessed.csv')
for testy in test_years:
    test.append('season_data/season'+testy+'_proccessed.csv')
pipeline(train,
              test)

started loading train and test
Finished loading train
Finished Loading test


KeyError: "['leagueaway_Points_form'] not in index"