# Prediction #

After a first check on the data, using ordinal regressions and linear regressions, we can add new variables we've seen can help with prediction and use them with more complex models such as tree based or mlp models.

In [75]:
import pandas as pd
import numpy as np
from sklearn import feature_selection
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from sklearn import datasets, ensemble
from sklearn import metrics
from sklearn.metrics import mean_squared_error,roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression,LinearRegression, PoissonRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, IsolationForest
from xgboost import XGBClassifier, XGBRegressor
import pickle
from difflib import SequenceMatcher
import seaborn as sns

In [76]:
def third_largest(row):
    sorted_row = sorted(row, reverse=True)
    return sorted_row[2] if len(sorted_row) >= 3 else None

def third_smallest(row):
    sorted_row = sorted(row)
    return sorted_row[2] if len(sorted_row) >= 3 else None

def get_data_cols(df, att, prefix):
    '''
    Gets an attribute, and adds columns to show mean, max, min and sd (ignoring zeros) for the first num_players players.

    Parameters:
    df : The dataframe
    att: which attribute (e.g., Weight)
    prefix: HomePlayer or AwayPlayer
    '''

    player_weight_cols = [col for col in df.columns if col.startswith(f"{prefix}") and col.endswith(f"_{att}")]
    att = att.replace('(', ' ')
    if not player_weight_cols:
        print('no col with', att)
        return df

    # Select only the columns corresponding to the first num_players players
    player_weight_cols_subset = player_weight_cols[:11]  # Selecting the first 11 players

    # Replace zeros with NaN
    df[player_weight_cols_subset] = df[player_weight_cols_subset].replace(0, np.nan)


    if(att == 'Overall'):
        df[f"{prefix}_{att}_max"] = df[player_weight_cols_subset].max(axis=1, skipna=True)
        df[f"{prefix}_{att}_min"] = df[player_weight_cols_subset].min(axis=1, skipna=True)
        df[f"{prefix}_{att}_sd"] = df[player_weight_cols_subset].std(axis=1, skipna=True)
        df[f"{prefix}_{att}_mean"] = df[player_weight_cols_subset].mean(axis=1, skipna=True)
        df[f"{prefix}_{att}_mean_ln"] = np.log(df[player_weight_cols_subset]).mean(axis=1, skipna=True)
        df[f"{prefix}_{att}_mean_sqrt"] = np.sqrt(df[player_weight_cols_subset]).mean(axis=1, skipna=True)
        df[f"{prefix}_3rd_best"] = df[player_weight_cols_subset].apply(third_largest, axis=1)
        df[f"{prefix}_3rd_worst"] = df[player_weight_cols_subset].apply(third_smallest, axis=1)


    # For bench players (i >= 12)
    bench_weight_cols = player_weight_cols[11:]  # Selecting players from index 12 and onwards

    # Replace zeros with NaN for bench players
    df[bench_weight_cols] = df[bench_weight_cols].replace(0, np.nan)

    if(' ' in att): # 2 worded attributes so instead of dribbling total we will have dribbling
        df[f"{prefix}_bench_{att.split()[0]}_mean"] = df[bench_weight_cols].mean(axis=1, skipna=True)
        df[f"{prefix}_{att.split()[0]}_mean"] = df[player_weight_cols_subset].mean(axis=1, skipna=True)

    else:
        df[f"{prefix}_bench_{att}_mean"] = df[bench_weight_cols].mean(axis=1, skipna=True)
        df[f"{prefix}_{att}_mean"] = df[player_weight_cols_subset].mean(axis=1, skipna=True)

    if(att == 'Overall'):
        df[f"{prefix}_bench_{att}_max"] = df[bench_weight_cols].max(axis=1, skipna=True)
        df[f"{prefix}_bench_{att}_min"] = df[bench_weight_cols].min(axis=1, skipna=True)
        df[f"{prefix}_bench_{att}_sd"] = df[bench_weight_cols].std(axis=1, skipna=True)
        df[f"{prefix}_bench_{att}_mean"] = df[player_weight_cols_subset].mean(axis=1, skipna=True)

    return df
def find_most_similar_name(target_name, names_list, threshold=0.25):
    similarities = [(other_name, SequenceMatcher(None, target_name, other_name).ratio()) for other_name in names_list]
    if (len(similarities) == 0):
        return None, None
    most_similar_name, similarity_score = max(similarities, key=lambda x: x[1])
    
    if similarity_score >= threshold:
        #print('found player '+most_similar_name+" with a score of "+str(similarity_score))
        return most_similar_name, similarity_score
    else:
        print("didn't find", target_name)
        return None, None
def find_player(player_name, club_name,  df, attributes):
    #first we filter by club name
    temp = df[df['Club Name'] == club_name]

    #Now we find the player
    sim_name, full_score = find_most_similar_name(player_name, temp['Full Name'])
    sim_nickname, short_score = find_most_similar_name(player_name, temp['Known As'])
    if sim_name: # there is a full name
        ## if there is a nickname we have to check
        if sim_nickname and full_score > short_score:
            return temp[temp['Full Name'] == sim_name].iloc[0][attributes]
        elif sim_nickname:
            return temp[temp['Known As'] == sim_nickname].iloc[0][attributes]
        return temp[temp['Full Name'] == sim_name].iloc[0][attributes]
    if sim_nickname: # if there is a nickname but no full name
        return temp[temp['Known As'] == sim_nickname].iloc[0][attributes]
    return None


def ratings_col(df, att_df):
    '''
    puts for every player their 'Overall', 'Age', 'Height(in cm)', 'Weight(in kg)'
    for each player i we will have the column
    HomePlayeri (if the player is on the home team)
    AwayPlayeri (if the player is on the away team)
    and we will use the home_team_name or away_team_name and the full name as the key (we will use find_player(player_name, club_name,  df, attributes))
    '''

    attributes = [ 'Overall', 'Age', 'Height(in cm)', 'Weight(in kg)']
    # Iterate through each player column
    for i in range(1, 21):

        home_col = f'HomePlayer{i}'
        away_col = f'AwayPlayer{i}'
        if home_col in df.columns or away_col in df.columns:
            # Add columns for home team players
            for att in attributes:
                df[home_col + "_" + att] = df.apply(
                    lambda row: 0 if pd.isna(row[home_col]) else
                    (find_player(row[home_col], row['home_team_name'], att_df, attributes)[att]
                    if find_player(row[home_col], row['home_team_name'], att_df, attributes) is not None else None),
                    axis=1
                )

            for att in attributes:
                df[away_col + "_" + att] = df.apply(
                    lambda row: 0 if pd.isna(row[away_col]) else
                    (find_player(row[away_col], row['away_team_name'], att_df, attributes)[att]
                    if find_player(row[away_col], row['away_team_name'], att_df, attributes) is not None else None),
                    axis=1
                )
def replace_nas(df):
    '''
    gets df, goes to the Age column and replaces None with 18, and replaces Weight(in kg) and Height(in cm) with the mean, and overall with the min value
    '''

    # 'Dribbling Total', 'Pace Total', 'Defending Total', 'Shooting Total', 'Physicality Total', 'Passing Total' our new fatures
    attributes = ['Overall','Height(in cm)', 'Weight(in kg)']
    # First, deal with the None values
    for i in range(1,21):
        home_col = f'HomePlayer{i}'
        away_col = f'AwayPlayer{i}'
        if home_col in df.columns:
            # Age assumption
            df[home_col+"_Age"].fillna(18, inplace=True)

            for att  in attributes: 
                non_zero_col = df[home_col+'_'+att].replace(0, None)
                df[home_col+'_'+att].fillna(non_zero_col.min())
        if away_col in df.columns:
            # Age assumption
            df[away_col+"_Age"].fillna(18, inplace=True)

            for att  in attributes: 
                non_zero_col = df[away_col+'_'+att].replace(0, None)
                df[away_col+'_'+att].fillna(non_zero_col.min())
            # df[home_col+"_Age"].fillna(18, inplace=True)
            
            # # dealing with some ratings

            # df[home_col+"_Total"].fillna(18, inplace=True)
            # df[home_col+"_Dribbling Total"].fillna(df[home_col+"_Dribbling Total"].replace(0,None).min(), inplace=True)
            # df[home_col+"_Defending Total"].fillna(df[home_col+"_Defending Total"].replace(0,None).min(), inplace=True)
            # df[home_col+"_Shooting Total"].fillna(df[home_col+"_Shooting Total"].replace(0,None).min(), inplace=True)
            # df[home_col+"_Physicality Total"].fillna(df[home_col+"_Physicality Total"].replace(0,None).min(), inplace=True)
            # df[home_col+"_Pace Total"].fillna(df[home_col+"_Pace Total"].replace(0,None).min(), inplace=True)

            # # Build assumption (ignoring zeros)
            # non_zero_weights = df[home_col+"_Weight(in kg)"].replace(0, None)
            # non_zero_heights = df[home_col+"_Height(in cm)"].replace(0, None)
            # non_zero_overall = df[home_col+"_Overall"].replace(0, None)
            # df[home_col+"_Weight"].fillna(non_zero_weights.mean(), inplace=True)
            # df[home_col+"_Height"].fillna(non_zero_heights.mean(), inplace=True)
            
            
            # # Ratings assumption
            # df[home_col+"_Overall"].fillna(non_zero_overall.min(), inplace=True)
            
            # # And for away
            # # Age assumption
            # df[away_col+"_Age"].fillna(18, inplace=True)

            # df[away_col+"_Total"].fillna(18, inplace=True)
            # df[away_col+"_Dribbling Total"].fillna(df[away_col+"_Dribbling Total"].replace(0,None).min(), inplace=True)
            # df[away_col+"_Defending Total"].fillna(df[away_col+"_Defending Total"].replace(0,None).min(), inplace=True)
            # df[away_col+"_Shooting Total"].fillna(df[away_col+"_Shooting Total"].replace(0,None).min(), inplace=True)
            # df[away_col+"_Physicality Total"].fillna(df[away_col+"_Physicality Total"].replace(0,None).min(), inplace=True)
            # df[away_col+"_Pace Total"].fillna(df[away_col+"_Pace Total"].replace(0,None).min(), inplace=True)

            # # Build assumption (ignoring zeros)
            # non_zero_weights_away = df[away_col+"_Weight"].replace(0, None)
            # non_zero_heights_away = df[away_col+"_Height"].replace(0, None)
            # non_zero_overall_away = df[away_col+"_Overall"].replace(0, None)
            # df[away_col+"_Weight"].fillna(non_zero_weights_away.mean(), inplace=True)
            # df[away_col+"_Height"].fillna(non_zero_heights_away.mean(), inplace=True)
            
            # # Ratings assumption
            # df[away_col+"_Overall"].fillna(non_zero_overall_away.min(), inplace=True)
            

    # Dealing with zero values
    for i in range(1, 21):
        home_col = f'HomePlayer{i}'
        away_col = f'AwayPlayer{i}'
        if home_col in df.columns:
            for att in attributes:
                df[home_col+"_"+att].replace(0, None, inplace=True)
        if away_col in df.columns:
            for att in attributes:
                df[away_col+"_"+att].replace(0, None, inplace=True)
     
def set_season_cols(columns, season):
    for c in season.columns:
        for att in ['Overall', 'Age', 'Height(in cm)', 'Weight(in kg)']:
            if att in c:
                columns.append(c)
    


def pipeline(train_arr, fifa):
    train_df = pd.read_csv(train_arr, encoding='latin1')
    ratings_df = pd.read_csv(fifa,encoding='latin1')
    ratings_df.rename(columns={'long_name' : 'Full Name', 'height_cm':'Height(in cm)',
                          'weight_kg' : 'Weight(in kg)',
                          'age' : 'Age','club_name' :"Club Name",
                          'overall' : 'Overall', 'dribbling' : 'Dribbling Total',
                          'pace' : 'Pace Total', 'defending' : "Defending Total" ,
                          "shooting" : 'Shooting Total', 'physic' : 'Physicality Total',
                          'passing' : "Passing Total", 'short_name' : 'Known As'},inplace=True)
    #first we replace the names with the numbers
    # fix_html_col(ratings_df, 'away_fromation')
    # fix_html_col(ratings_df, 'home_formation')

    # Define a dictionary
    names_fix = {'Man Utd': "Manchester United",
             "Man City": 'Manchester City',
             "West Ham": "West Ham United",
             "Nott'm Forest": "Nottingham Forest",
             'Spurs': "Tottenham Hotspur",
             'Wolves': "Wolverhampton Wanderers",
             "Brighton and Hove Albion": "Brighton & Hove Albion",
             "Bournemouth": "AFC Bournemouth",
             'Newcastle': 'Newcastle United',
             'Leicester': 'Leicester City',
             'Leeds': "Leeds United",
             'Huddersfield' : 'Huddersfield Town',
             'Swansea' : 'Swansea City',
             'Cardiff': 'Cardiff City',
             'Norwich' : 'Norwich City',
             'Stoke' : 'Stoke City',
             'West Brom' : 'West Bromwich Albion',
             'Hull' : 'Hull City',
             'QPR' : 'Queens Park Rangers',
             'Sheffield Utd' : 'Sheffield United'}
    
 
    for key, val in names_fix.items():
        train_df['home_team_name'].replace(key, val, inplace=True)
        train_df['away_team_name'].replace(key, val, inplace=True)
    print("Finished Loading test")
    # print(train_df.columns)
    att_list = ['Overall']
    numerical_cols = []
    statistic_list = ['mean', 'min', 'max', 'sd', 'median', '3rd_best', '3rd_worst']
    #statistic_list = ['mean' ,'sd']
    cat_cols=['home_team_name', 'away_team_name', 'home_formation', 'away_fromation']

    
    att_list = ['Overall', 'Age', 'Height(in cm)', 'Weight(in kg)']
    for att in att_list:
        if att not in ratings_df.columns:
            print('fix',att)
    
    # Sanity Check
    ratings_df = ratings_df[['Full Name', 'Club Name', 'Known As']+ att_list]
    ratings_col(train_df, ratings_df)

    replace_nas(train_df)
    train_df['Matchweek'] = train_df['Matchweek'].str.split(' ').str[1].astype(int)

    final_cols = ['home_score', 'home_team_name', 'away_score', 'away_team_name', 'away_fromation', 'home_formation', 'home_GD_prior', 'away_GD_prior', 'home_Points_prior', 'away_Points_prior', 'Matchweek']
    set_season_cols(final_cols, train_df)

    
    #binary_cols = ['Home_Adv_Team', 'Strong_Away'] ## From ar
    

    for att in att_list:
        get_data_cols(train_df, att, prefix='HomePlayer')
        get_data_cols(train_df, att, prefix='AwayPlayer')
    
    for col in train_df.columns:
        for s in statistic_list:
            if s in col:
                numerical_cols.append(col)

    cat_cols=['Unnamed: 0', 'home_team_name', 'away_team_name', 'home_formation', 
              'away_fromation', 'home_GD_prior', 
              'home_Points_prior', 'home_GD_form', 'home_Points_form', 'home_GD_form_pw', 'home_Points_form_pw',
                'away_GD_prior', 'away_Points_prior', 'away_GD_form', 'away_GD_form_pw', 'away_Points_form_pw',
                'away_Points_form','Matchweek', 'home_points_to_championship',
                'home_points_to_ucl','home_points_to_rel','away_points_to_championship',
                'away_points_to_ucl','away_points_to_rel', 'home_match_importance', 'away_match_importance', 'B365A', 'B365D', 'B365H', 'HtA']
    return train_df[numerical_cols + cat_cols]


In [77]:
from scipy.stats import skellam

def extract_poisson_probas(home_pred, away_pred):
    '''
    creates a probability matrix according to the regression

    Parameters:
    home_pred: expected home score
    away_pred: expected away score
    '''

    probability_matrix = np.zeros((len(home_pred), 3))  # 3 columns (-1, 0, 1 probabilities)
    
    for i in range(len(home_pred)):
        # Check for NaN values and skip the calculation if found
        if np.isnan(home_pred[i]) or np.isnan(away_pred[i]):
            continue

        home_rounded = home_pred[i]
        away_rounded = away_pred[i]
                
        probability_matrix[i, 0] = skellam.cdf(-1, home_rounded, away_rounded)
        probability_matrix[i, 1] = skellam.pmf(0, home_rounded, away_rounded)
        probability_matrix[i, 2] = skellam.sf(0, home_rounded, away_rounded)
        if(probability_matrix[i, 0] == np.nan or probability_matrix[i, 1] == np.nan or probability_matrix[i, 2] == np.nan):
            print("NAN PRODUCED IN INDEX", i)
        
    return probability_matrix

In [78]:
def get_lineups_and_predict(match, fifa):
    train = pipeline(match, fifa)
    home_reg = XGBRegressor()
    home_reg.load_model("home_regression.json")
    features = home_reg.get_booster().feature_names
    away_reg = XGBRegressor()
    away_reg.load_model("away_regression.json")
    fixtures = list(train['home_team_name'] + ' V '+train['away_team_name'])
    print(fixtures)
    #pre processing
    train.drop(['home_formation', 'away_fromation'], axis=1, inplace=True)
    train["Home_min_max"] = train['HomePlayer_Overall_max'] * train['HomePlayer_Overall_min']
    train['Away_min_max'] = train['AwayPlayer_Overall_max'] * train['AwayPlayer_Overall_min']
    with open('encoder', 'rb') as file:
        enc = pickle.load(file)
    
    # One-hot encode 'home_team_name'
    encoded = enc.transform(train[['home_team_name', 'away_team_name']])
    encoded_df = pd.DataFrame(encoded, columns=enc.get_feature_names_out(['home_team_name', 'away_team_name']))
    train = pd.concat([encoded_df, train], axis=1)

    train.drop(['home_team_name', 'away_team_name'], axis=1, inplace=True)
    train = train[features]
    for col in train.columns:
        train[col] = train[col].astype(float)
    home_pred = home_reg.predict(train)
    away_pred = away_reg.predict(train)
    print("Home expected goals:", home_pred)
    print("Away expected goals:", away_pred)
    probas = extract_poisson_probas(home_pred, away_pred)
    print('Expected outcomes:')
    outcomes = probas * np.asarray(train[['B365A', 'B365D', 'B365H']])
    print("win outcomes", outcomes[:,2])
    print("tie outcomes", outcomes[:,1])
    print("loss outcomes", outcomes[:,0])
    df = pd.DataFrame({'Fixture' : fixtures,
                       "Home Expected Goals": home_pred,
                       "Away Expected Goals": away_pred,
                       'Home Expected' : outcomes[:, 2],
                       'Draw Expected' : outcomes[:, 1],
                       'Away Expected' : outcomes[:, 0],
                       "Home Proba" : probas[:, 2],
                       "Away Proba" : probas[:, 0],
                       "Draw Probas" : probas[:, 1],
                       "Best bet by sqrt (0 - away, 1 - draw, 2 - home team)": np.argmax(probas*np.sqrt(np.asarray(train[['B365A', 'B365D', 'B365H']])), 
                       axis = 1),
                       "Best bet by ln  (0 - away, 1 - draw, 2 - home team)": np.argmax(probas*np.log(np.asarray(train[['B365A', 'B365D', 'B365H']])), 
                       axis = 1)})
    df.to_csv("predictions.csv")
    return probas



SyntaxError: invalid syntax (<ipython-input-78-e2afba917b38>, line 45)

In [None]:
get_lineups_and_predict('format.csv', 'fifa_season.csv')

  ratings_df = pd.read_csv(fifa,encoding='latin1')


Finished Loading test
['Sheffield United V Burnley', 'Luton V Brentford', 'Wolverhampton Wanderers V Arsenal', 'Everton V Nottingham Forest', 'Aston Villa V AFC Bournemouth', 'Crystal Palace V West Ham United', 'Fulham V Liverpool']
Home expected goals: [1.0903704 1.3886279 0.8556589 1.3109288 1.9698511 1.3338624 1.1285495]
Away expected goals: [1.2843288  1.2843288  1.8714763  0.98008406 0.98008406 1.0520478
 1.8714763 ]
Expected outcomes:
win outcomes [0.84905542 1.26236791 1.66454752 0.90563166 1.11749969 0.88400343
 1.19388323]
tie outcomes [0.99615748 0.97230163 1.10166222 0.97774261 0.85600171 1.0279774
 1.01412031]
loss outcomes [1.00162031 0.72708214 0.81557303 1.0039443  0.6550045  1.00181514
 0.85915675]


array([[0.40882462, 0.27671041, 0.31446497],
       [0.34622959, 0.25928043, 0.39448997],
       [0.6132128 , 0.22033244, 0.16645475],
       [0.27887342, 0.27935503, 0.44177154],
       [0.1819457 , 0.21400043, 0.60405389],
       [0.29465151, 0.27412731, 0.43122118],
       [0.5472336 , 0.22536007, 0.22740633]])