In [64]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder

# Allow up to 150 columns in DataFrame displays; the one-hot encoded frames below are very wide.
pd.set_option('display.max_columns', 150)


In [65]:
# Load the cleaned training and testing match tables; the first CSV column is the row index.
training_dataset, testing_dataset = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../data/training_clean.csv", "../data/testing_clean.csv")
)

In [66]:
# Stack training rows on top of testing rows so both go through the same
# encoding and feature engineering. Original index labels are kept here.
dataset = pd.concat((training_dataset, testing_dataset), axis=0)
dataset

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,WHH,WHD,WHA,VCH,VCD,VCA,MaxH,MaxD,MaxA,Max>2.5,Max<2.5,AHh,MaxAHH,MaxAHA,PSH,PSD,PSA,PSCH,PSCD,PSCA
0,SP1,2008-08-30,Espanol,Valladolid,1,0,H,0,0,D,10,11,2,1,18,17,1,9,3,5,0,0,2.00,3.30,3.80,1.80,3.25,4.10,1.75,3.20,4.30,1.83,3.20,3.75,1.90,3.25,3.50,2.09,3.40,4.50,2.12,1.83,0.00,1.46,3.00,1.90,3.25,3.99,1.90,3.25,3.99
1,SP1,2008-08-30,Valencia,Mallorca,3,0,H,2,0,H,17,16,6,2,17,14,5,6,4,0,0,0,1.70,3.60,5.25,1.65,3.35,5.00,1.70,3.30,4.50,1.67,3.30,4.50,1.65,3.40,4.50,1.75,3.88,5.40,2.00,1.93,-0.50,1.76,2.21,1.70,3.42,4.85,1.70,3.42,4.85
2,SP1,2008-08-31,Ath Bilbao,Almeria,1,3,A,0,2,A,10,11,4,5,35,20,2,6,2,4,0,0,2.00,3.30,3.80,1.90,3.20,3.80,2.00,3.00,3.60,1.91,3.20,3.50,1.90,3.20,3.60,2.10,3.40,4.00,2.25,1.70,0.00,1.47,2.75,1.96,3.23,3.70,1.96,3.23,3.70
3,SP1,2008-08-31,Ath Madrid,Malaga,4,0,H,3,0,H,25,7,9,2,16,13,11,7,1,2,0,0,1.44,4.20,7.50,1.40,3.80,7.95,1.45,3.60,7.00,1.40,3.75,7.00,1.45,3.75,7.00,1.50,4.70,9.00,1.80,2.11,0.00,1.14,6.50,1.44,3.92,7.42,1.44,3.92,7.42
4,SP1,2008-08-31,Betis,Recreativo,0,1,A,0,1,A,8,13,2,6,17,18,2,2,3,1,0,0,2.00,3.25,3.80,1.75,3.30,4.40,1.90,3.10,3.80,1.80,3.25,3.80,1.75,3.25,4.35,2.10,3.36,4.60,2.25,1.80,0.00,1.40,3.25,1.86,3.26,4.15,1.86,3.26,4.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,SP1,2019-12-22,Leganes,Espanol,2,0,H,1,0,H,11,9,5,3,11,15,1,5,1,6,0,0,2.10,3.10,3.90,2.10,3.10,4.00,2.10,3.05,4.00,2.15,3.10,3.80,2.10,3.10,3.90,2.20,3.20,4.10,2.88,1.49,-0.25,1.85,2.13,2.18,3.12,3.94,2.13,3.03,4.27
176,SP1,2019-12-22,Osasuna,Sociedad,3,4,A,1,3,A,19,13,7,7,11,12,8,3,6,2,1,0,2.80,3.40,2.50,2.80,3.10,2.70,2.80,3.25,2.55,2.80,3.30,2.55,2.75,3.30,2.55,2.88,3.44,2.75,2.11,1.86,0.00,2.06,1.88,2.82,3.39,2.61,3.17,3.45,2.34
177,SP1,2019-12-22,Betis,Ath Madrid,1,2,A,0,0,D,15,8,2,5,11,14,5,3,4,3,0,0,3.80,3.30,2.10,3.80,3.40,2.05,3.80,3.30,2.05,3.80,3.30,2.05,3.75,3.30,2.05,4.00,3.42,2.10,2.25,1.77,0.50,1.87,2.08,3.92,3.39,2.07,4.17,3.50,1.98
178,SP1,2019-12-22,Levante,Celta,3,1,H,0,1,A,13,12,7,5,8,15,8,7,3,6,0,0,2.62,3.40,2.62,2.55,3.50,2.70,2.65,3.45,2.60,2.62,3.60,2.55,2.60,3.60,2.50,2.73,3.70,2.75,1.78,2.21,0.00,2.00,1.97,2.73,3.54,2.60,2.64,3.54,2.68


# <h1 style='font-size:30px;'>Encoding</h1>

In this step, we transform the categorical team names and match results into numerical representations that machine learning models can consume. We use two methods:

1. **Label Encoding**: Assigns a unique integer to each possible match result.
2. **One-Hot Encoding**: Creates binary columns for each team, with a `1` indicating the presence of a particular team.

In [67]:
def team_encode(df):
    """One-hot encode the ``HomeTeam`` and ``AwayTeam`` columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table containing ``HomeTeam`` and ``AwayTeam`` columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First, ``df`` with its index reset to ``0..n-1`` and the indicator
        columns appended on the right. Second, the indicator columns on their
        own. Column names come from the encoder's ``get_feature_names_out``.
    """
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    team_encoded = one_hot_encoder.fit_transform(df[['HomeTeam', 'AwayTeam']])
    team_encoded_df = pd.DataFrame(
        team_encoded,
        columns=one_hot_encoder.get_feature_names_out(['HomeTeam', 'AwayTeam']),
    )
    # Both frames get a fresh 0..n-1 index so the column-wise concat pairs rows by position.
    df = pd.concat([df.reset_index(drop=True), team_encoded_df.reset_index(drop=True)], axis=1)
    return df, team_encoded_df

def match_encode(df):
    """Add an integer ``FTR_encoded`` column derived from the ``FTR`` column.

    The codes are fixed: ``"A"`` -> 0, ``"D"`` -> 1, ``"H"`` -> 2. This is the
    same assignment a label encoder produces when all three results are
    present, but it no longer shifts when a result is absent from ``df``.
    Other code in this notebook compares ``FTR_encoded`` against the literal
    values 0 and 2, so the mapping must not depend on the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table with an ``FTR`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``FTR_encoded`` added.

    Raises
    ------
    ValueError
        If ``FTR`` contains anything other than ``"A"``, ``"D"`` or ``"H"``
        (including missing values).
    """
    result_codes = {"A": 0, "D": 1, "H": 2}
    unexpected = set(df["FTR"].unique()) - set(result_codes)
    if unexpected:
        raise ValueError(f"Unexpected FTR values: {sorted(unexpected, key=str)}")
    df["FTR_encoded"] = df["FTR"].map(result_codes).astype("int64")
    return df

In [68]:
# Append the one-hot team indicator columns (this also resets the row index),
# then add the integer-coded match result as FTR_encoded.
# NOTE: both calls reassign `dataset`, so re-running this cell without re-running
# the cells above would encode an already-encoded frame.
dataset, team_encoded_df = team_encode(dataset)
dataset = match_encode(dataset)

Let's look at the encoded match results. We see:

0: Away team wins

1: Draw

2: Home team wins

In [69]:
# Show the encoded result next to the raw result to check the mapping.
dataset.loc[:, ["FTR_encoded", "FTR"]]

Unnamed: 0,FTR_encoded,FTR
0,2,H
1,2,H
2,0,A
3,2,H
4,0,A
...,...,...
4355,2,H
4356,0,A
4357,0,A
4358,2,H


# <h1 style='font-size:30px;'>Feature Engineering</h1>

In [70]:
def team_last_matches_performance(df, team, date, number_of_matches):
    """Summarize a team's form over its most recent matches before ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table with ``Date``, ``HomeTeam``, ``AwayTeam``, ``FTHG``,
        ``FTAG``, ``HST``, ``AST`` and ``FTR_encoded`` columns.
        ``FTR_encoded`` is read as 2 = home win, 0 = away win, anything
        else = draw.
    team : str
        Team name as it appears in ``HomeTeam`` / ``AwayTeam``.
    date
        Only matches whose ``Date`` is strictly earlier are considered. It
        must be comparable with the values in ``df["Date"]``.
    number_of_matches : int
        Maximum number of most recent matches to include.

    Returns
    -------
    tuple
        ``(avg_goal_diff, points, shot_on_target)``. ``points`` is 3 per win
        and 1 per draw. The two averages are divided by ``number_of_matches``
        even when fewer earlier matches exist, so a team with no history
        yields ``(0.0, 0, 0.0)``.
    """
    involves_team = (df["HomeTeam"] == team) | (df["AwayTeam"] == team)
    # Sort by date before taking the tail so "most recent" does not depend on the
    # row order of `df`. The stable sort keeps the existing order of same-day rows.
    past_n_matches = (
        df.loc[involves_team & (df["Date"] < date), :]
        .sort_values("Date", kind="mergesort")
        .tail(number_of_matches)
    )

    played_home = past_n_matches["HomeTeam"] == team
    played_away = past_n_matches["AwayTeam"] == team

    goal_scored = (
        past_n_matches.loc[played_home, "FTHG"].sum()
        + past_n_matches.loc[played_away, "FTAG"].sum()
    )
    goals_conceded = (
        past_n_matches.loc[played_home, "FTAG"].sum()
        + past_n_matches.loc[played_away, "FTHG"].sum()
    )
    avg_goal_diff = (goal_scored - goals_conceded) / number_of_matches

    home_side_won = past_n_matches["FTR_encoded"] == 2
    away_side_won = past_n_matches["FTR_encoded"] == 0
    won = (played_home & home_side_won) | (played_away & away_side_won)
    lost = (played_home & away_side_won) | (played_away & home_side_won)
    # Every match that is neither a win nor a loss for `team` counts as a draw.
    points = int(3 * won.sum() + (~won & ~lost).sum())

    shot_on_target = (
        past_n_matches.loc[played_home, "HST"].sum()
        + past_n_matches.loc[played_away, "AST"].sum()
    ) / number_of_matches

    return avg_goal_diff, points, shot_on_target

In [71]:
# Remember the column order before adding features so they can be placed just before "FTHG".
dataset_columns = dataset.columns.to_list()

# For each side of the fixture, compute the team's form over its previous 5 matches.
form_feature_groups = [
    ("HomeTeam", ["HomeTeam_avg_goal_diff", "HomeTeam_points", "HomeTeam_ShotOnTarget"]),
    ("AwayTeam", ["AwayTeam_avg_goal_diff", "AwayTeam_points", "AwayTeam_ShotOnTarget"]),
]
form_feature_columns = []
for team_column, feature_names in form_feature_groups:
    dataset[feature_names] = dataset.apply(
        lambda row: pd.Series(
            team_last_matches_performance(dataset, row[team_column], row["Date"], 5)
        ),
        axis=1,
    )
    form_feature_columns.extend(feature_names)

# Re-order: original columns up to "FTHG", then the six form features, then the rest.
fthg_position = dataset_columns.index("FTHG")
dataset = dataset[
    dataset_columns[:fthg_position]
    + form_feature_columns
    + dataset_columns[fthg_position:]
]


In [72]:
def normalize_betting_odd(df, columns):
    """Replace odds in ``columns`` with implied probabilities that sum to 1 per row.

    Each odd ``x`` is inverted to ``1 / x``, then every row of the inverted
    values is divided by its row sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the odds columns. It is modified in place.
    columns : list of str
        Names of the odds columns that belong together (one bookmaker's
        home / draw / away odds in this notebook).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``columns`` overwritten.

    Raises
    ------
    ZeroDivisionError
        If any selected odd equals zero. The check runs before anything is
        written, so ``df`` is left untouched in that case.
    """
    if len(columns) == 0:
        return df

    odds = df[columns]
    if (odds == 0).any().any():
        raise ZeroDivisionError("betting odds must be non-zero")

    implied = 1 / odds
    # Row sums skip missing values (pandas default), as the column-by-column version did.
    df[columns] = implied.div(implied.sum(axis=1), axis=0)
    return df

In [73]:
# Group the odds columns into consecutive triples, starting at "B365H" and
# stopping before "MaxH" (columns from "MaxH" onwards are not included).
first_odds_position = dataset.columns.get_loc("B365H")
max_odds_position = dataset.columns.get_loc("MaxH")
betting_comapnies = [
    dataset.columns[start:start + 3].tolist()
    for start in range(first_odds_position, max_odds_position, 3)
]

In [74]:
# Turn each bookmaker's odds triple into normalized probabilities, one triple at a time.
for odds_columns in betting_comapnies:
    dataset = normalize_betting_odd(dataset, odds_columns)

In [75]:
# Inspect the frame after adding the form features and normalizing the bookmaker odds.
dataset

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,HomeTeam_avg_goal_diff,HomeTeam_points,HomeTeam_ShotOnTarget,AwayTeam_avg_goal_diff,AwayTeam_points,AwayTeam_ShotOnTarget,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,WHH,WHD,WHA,VCH,VCD,VCA,MaxH,MaxD,MaxA,Max>2.5,Max<2.5,AHh,MaxAHH,MaxAHA,PSH,PSD,PSA,PSCH,PSCD,PSCA,HomeTeam_Alaves,HomeTeam_Almeria,HomeTeam_Ath Bilbao,HomeTeam_Ath Madrid,HomeTeam_Barcelona,HomeTeam_Betis,HomeTeam_Celta,HomeTeam_Cordoba,HomeTeam_Eibar,HomeTeam_Elche,HomeTeam_Espanol,HomeTeam_Getafe,HomeTeam_Girona,HomeTeam_Granada,HomeTeam_Hercules,HomeTeam_Huesca,HomeTeam_La Coruna,HomeTeam_Las Palmas,HomeTeam_Leganes,HomeTeam_Levante,HomeTeam_Malaga,HomeTeam_Mallorca,HomeTeam_Numancia,HomeTeam_Osasuna,HomeTeam_Real Madrid,HomeTeam_Recreativo,HomeTeam_Santander,HomeTeam_Sevilla,HomeTeam_Sociedad,HomeTeam_Sp Gijon,HomeTeam_Tenerife,HomeTeam_Valencia,HomeTeam_Valladolid,HomeTeam_Vallecano,HomeTeam_Villarreal,HomeTeam_Xerez,HomeTeam_Zaragoza,AwayTeam_Alaves,AwayTeam_Almeria,AwayTeam_Ath Bilbao,AwayTeam_Ath Madrid,AwayTeam_Barcelona,AwayTeam_Betis,AwayTeam_Celta,AwayTeam_Cordoba,AwayTeam_Eibar,AwayTeam_Elche,AwayTeam_Espanol,AwayTeam_Getafe,AwayTeam_Girona,AwayTeam_Granada,AwayTeam_Hercules,AwayTeam_Huesca,AwayTeam_La Coruna,AwayTeam_Las Palmas,AwayTeam_Leganes,AwayTeam_Levante,AwayTeam_Malaga,AwayTeam_Mallorca,AwayTeam_Numancia,AwayTeam_Osasuna,AwayTeam_Real Madrid,AwayTeam_Recreativo,AwayTeam_Santander,AwayTeam_Sevilla,AwayTeam_Sociedad,AwayTeam_Sp Gijon,AwayTeam_Tenerife,AwayTeam_Valencia,AwayTeam_Valladolid,AwayTeam_Vallecano,AwayTeam_Villarreal,AwayTeam_Xerez,AwayTeam_Zaragoza,FTR_encoded
0,SP1,2008-08-30,Espanol,Valladolid,0.0,0.0,0.0,0.0,0.0,0.0,1,0,H,0,0,D,10,11,2,1,18,17,1,9,3,5,0,0,0.468960,0.284218,0.246821,0.501789,0.277914,0.220297,0.511810,0.279896,0.208295,0.485466,0.277626,0.236908,0.470041,0.274793,0.255165,2.09,3.40,4.50,2.12,1.83,0.00,1.46,3.00,1.90,3.25,3.99,1.90,3.25,3.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2
1,SP1,2008-08-30,Valencia,Mallorca,0.0,0.0,0.0,0.0,0.0,0.0,3,0,H,2,0,H,17,16,6,2,17,14,5,6,4,0,0,0,0.556783,0.262925,0.180292,0.548686,0.270248,0.181066,0.528282,0.272145,0.199573,0.532716,0.269587,0.197697,0.539968,0.262043,0.197988,1.75,3.88,5.40,2.00,1.93,-0.50,1.76,2.21,1.70,3.42,4.85,1.70,3.42,4.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,SP1,2008-08-31,Ath Bilbao,Almeria,0.0,0.0,0.0,0.0,0.0,0.0,1,3,A,0,2,A,10,11,4,5,35,20,2,6,2,4,0,0,0.468960,0.284218,0.246821,0.477612,0.283582,0.238806,0.450000,0.300000,0.250000,0.466725,0.278576,0.254699,0.471358,0.279869,0.248773,2.10,3.40,4.00,2.25,1.70,0.00,1.47,2.75,1.96,3.23,3.70,1.96,3.23,3.70,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,SP1,2008-08-31,Ath Madrid,Malaga,0.0,0.0,0.0,0.0,0.0,0.0,4,0,H,3,0,H,25,7,9,2,16,13,11,7,1,2,0,0,0.651526,0.223380,0.125093,0.647450,0.238534,0.114016,0.621149,0.250185,0.128667,0.635593,0.237288,0.127119,0.627428,0.242605,0.129967,1.50,4.70,9.00,1.80,2.11,0.00,1.14,6.50,1.44,3.92,7.42,1.44,3.92,7.42,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,SP1,2008-08-31,Betis,Recreativo,0.0,0.0,0.0,0.0,0.0,0.0,0,1,A,0,1,A,8,13,2,6,17,18,2,2,3,1,0,0,0.466919,0.287335,0.245747,0.518664,0.275049,0.206287,0.473282,0.290076,0.236641,0.493211,0.273163,0.233626,0.515262,0.277449,0.207289,2.10,3.36,4.60,2.25,1.80,0.00,1.40,3.25,1.86,3.26,4.15,1.86,3.26,4.15,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4355,SP1,2019-12-22,Leganes,Espanol,-0.2,5.0,3.8,-1.2,2.0,2.8,2,0,H,1,0,H,11,9,5,3,11,15,1,5,1,6,0,0,0.451288,0.305711,0.243001,0.454046,0.307580,0.238374,0.451768,0.311054,0.237178,0.442608,0.306970,0.250423,0.451288,0.305711,0.243001,2.20,3.20,4.10,2.88,1.49,-0.25,1.85,2.13,2.18,3.12,3.94,2.13,3.03,4.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4356,SP1,2019-12-22,Osasuna,Sociedad,-0.2,5.0,4.0,0.2,6.0,4.4,3,4,A,1,3,A,19,13,7,7,11,12,8,3,6,2,1,0,0.339728,0.279776,0.380496,0.340106,0.307192,0.352702,0.337886,0.291102,0.371012,0.339383,0.287961,0.372656,0.343434,0.286195,0.370370,2.88,3.44,2.75,2.11,1.86,0.00,2.06,1.88,2.82,3.39,2.61,3.17,3.45,2.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4357,SP1,2019-12-22,Betis,Ath Madrid,0.4,10.0,5.4,0.6,8.0,5.0,1,2,A,0,0,D,15,8,2,5,11,14,5,3,4,3,0,0,0.252459,0.290710,0.456831,0.251806,0.281431,0.466763,0.249677,0.287507,0.462816,0.249677,0.287507,0.462816,0.252167,0.286553,0.461280,4.00,3.42,2.10,2.25,1.77,0.50,1.87,2.08,3.92,3.39,2.07,4.17,3.50,1.98,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4358,SP1,2019-12-22,Levante,Celta,-1.0,6.0,4.4,-0.4,5.0,5.6,3,1,H,0,1,A,13,12,7,5,8,15,8,7,3,6,0,0,0.360934,0.278132,0.360934,0.374109,0.272565,0.353325,0.358764,0.275572,0.365663,0.362946,0.264144,0.372910,0.362027,0.261464,0.376508,2.73,3.70,2.75,1.78,2.21,0.00,2.00,1.97,2.73,3.54,2.60,2.64,3.54,2.68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [76]:
# Remove the raw result columns (FTR_encoded stays as the target) along with the
# other per-match columns listed here.
# NOTE(review): these presumably all describe what happened in the match itself and
# would leak the outcome — confirm against the data dictionary.
result_columns = [
    "FTHG", "FTAG", "FTR",
    "HTHG", "HTAG", "HTR",
    "HF", "AF",
    "HY", "AY",
    "HR", "AR",
]
dataset = dataset.drop(columns=result_columns)

In [77]:
# Min-max scale the six form features (the contiguous column block from
# "HomeTeam_avg_goal_diff" to "AwayTeam_ShotOnTarget").
scaler = MinMaxScaler()
columns_to_scale = dataset.loc[:, "HomeTeam_avg_goal_diff":"AwayTeam_ShotOnTarget"].columns
dataset[columns_to_scale] = dataset[columns_to_scale].astype(float)

# Fit the scaler on the training period only (matches before the first date in the
# test file, the same boundary used for the train/test split), then apply it to every
# row. Fitting on all rows would leak the test period's min/max into the training
# features. Test-period values can therefore fall slightly outside [0, 1].
is_training_row = dataset["Date"] < testing_dataset["Date"][0]
scaler.fit(dataset.loc[is_training_row, columns_to_scale])
dataset.loc[:, columns_to_scale] = scaler.transform(dataset[columns_to_scale])

In [78]:
# Drop the remaining raw per-match statistics and the extra odds columns listed here.
# HST / AST have already been summarized into the *_ShotOnTarget form features.
unused_columns = [
    "HS", "AS", "HST", "AST", "HC", "AC",
    "Max>2.5", "Max<2.5", "AHh", "MaxAHH", "MaxAHA",
]
dataset = dataset.drop(columns=unused_columns)

In [79]:
# Inspect the final feature table before it is split into train and test sets.
dataset

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,HomeTeam_avg_goal_diff,HomeTeam_points,HomeTeam_ShotOnTarget,AwayTeam_avg_goal_diff,AwayTeam_points,AwayTeam_ShotOnTarget,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,WHH,WHD,WHA,VCH,VCD,VCA,MaxH,MaxD,MaxA,PSH,PSD,PSA,PSCH,PSCD,PSCA,HomeTeam_Alaves,HomeTeam_Almeria,HomeTeam_Ath Bilbao,HomeTeam_Ath Madrid,HomeTeam_Barcelona,HomeTeam_Betis,HomeTeam_Celta,HomeTeam_Cordoba,HomeTeam_Eibar,HomeTeam_Elche,HomeTeam_Espanol,HomeTeam_Getafe,HomeTeam_Girona,HomeTeam_Granada,HomeTeam_Hercules,HomeTeam_Huesca,HomeTeam_La Coruna,HomeTeam_Las Palmas,HomeTeam_Leganes,HomeTeam_Levante,HomeTeam_Malaga,HomeTeam_Mallorca,HomeTeam_Numancia,HomeTeam_Osasuna,HomeTeam_Real Madrid,HomeTeam_Recreativo,HomeTeam_Santander,HomeTeam_Sevilla,HomeTeam_Sociedad,HomeTeam_Sp Gijon,HomeTeam_Tenerife,HomeTeam_Valencia,HomeTeam_Valladolid,HomeTeam_Vallecano,HomeTeam_Villarreal,HomeTeam_Xerez,HomeTeam_Zaragoza,AwayTeam_Alaves,AwayTeam_Almeria,AwayTeam_Ath Bilbao,AwayTeam_Ath Madrid,AwayTeam_Barcelona,AwayTeam_Betis,AwayTeam_Celta,AwayTeam_Cordoba,AwayTeam_Eibar,AwayTeam_Elche,AwayTeam_Espanol,AwayTeam_Getafe,AwayTeam_Girona,AwayTeam_Granada,AwayTeam_Hercules,AwayTeam_Huesca,AwayTeam_La Coruna,AwayTeam_Las Palmas,AwayTeam_Leganes,AwayTeam_Levante,AwayTeam_Malaga,AwayTeam_Mallorca,AwayTeam_Numancia,AwayTeam_Osasuna,AwayTeam_Real Madrid,AwayTeam_Recreativo,AwayTeam_Santander,AwayTeam_Sevilla,AwayTeam_Sociedad,AwayTeam_Sp Gijon,AwayTeam_Tenerife,AwayTeam_Valencia,AwayTeam_Valladolid,AwayTeam_Vallecano,AwayTeam_Villarreal,AwayTeam_Xerez,AwayTeam_Zaragoza,FTR_encoded
0,SP1,2008-08-30,Espanol,Valladolid,0.375,0.000000,0.000000,0.439024,0.000000,0.000000,0.468960,0.284218,0.246821,0.501789,0.277914,0.220297,0.511810,0.279896,0.208295,0.485466,0.277626,0.236908,0.470041,0.274793,0.255165,2.09,3.40,4.50,1.90,3.25,3.99,1.90,3.25,3.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2
1,SP1,2008-08-30,Valencia,Mallorca,0.375,0.000000,0.000000,0.439024,0.000000,0.000000,0.556783,0.262925,0.180292,0.548686,0.270248,0.181066,0.528282,0.272145,0.199573,0.532716,0.269587,0.197697,0.539968,0.262043,0.197988,1.75,3.88,5.40,1.70,3.42,4.85,1.70,3.42,4.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,SP1,2008-08-31,Ath Bilbao,Almeria,0.375,0.000000,0.000000,0.439024,0.000000,0.000000,0.468960,0.284218,0.246821,0.477612,0.283582,0.238806,0.450000,0.300000,0.250000,0.466725,0.278576,0.254699,0.471358,0.279869,0.248773,2.10,3.40,4.00,1.96,3.23,3.70,1.96,3.23,3.70,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,SP1,2008-08-31,Ath Madrid,Malaga,0.375,0.000000,0.000000,0.439024,0.000000,0.000000,0.651526,0.223380,0.125093,0.647450,0.238534,0.114016,0.621149,0.250185,0.128667,0.635593,0.237288,0.127119,0.627428,0.242605,0.129967,1.50,4.70,9.00,1.44,3.92,7.42,1.44,3.92,7.42,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,SP1,2008-08-31,Betis,Recreativo,0.375,0.000000,0.000000,0.439024,0.000000,0.000000,0.466919,0.287335,0.245747,0.518664,0.275049,0.206287,0.473282,0.290076,0.236641,0.493211,0.273163,0.233626,0.515262,0.277449,0.207289,2.10,3.36,4.60,1.86,3.26,4.15,1.86,3.26,4.15,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4355,SP1,2019-12-22,Leganes,Espanol,0.350,0.333333,0.322034,0.292683,0.133333,0.237288,0.451288,0.305711,0.243001,0.454046,0.307580,0.238374,0.451768,0.311054,0.237178,0.442608,0.306970,0.250423,0.451288,0.305711,0.243001,2.20,3.20,4.10,2.18,3.12,3.94,2.13,3.03,4.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4356,SP1,2019-12-22,Osasuna,Sociedad,0.350,0.333333,0.338983,0.463415,0.400000,0.372881,0.339728,0.279776,0.380496,0.340106,0.307192,0.352702,0.337886,0.291102,0.371012,0.339383,0.287961,0.372656,0.343434,0.286195,0.370370,2.88,3.44,2.75,2.82,3.39,2.61,3.17,3.45,2.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4357,SP1,2019-12-22,Betis,Ath Madrid,0.425,0.666667,0.457627,0.512195,0.533333,0.423729,0.252459,0.290710,0.456831,0.251806,0.281431,0.466763,0.249677,0.287507,0.462816,0.249677,0.287507,0.462816,0.252167,0.286553,0.461280,4.00,3.42,2.10,3.92,3.39,2.07,4.17,3.50,1.98,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4358,SP1,2019-12-22,Levante,Celta,0.250,0.400000,0.372881,0.390244,0.333333,0.474576,0.360934,0.278132,0.360934,0.374109,0.272565,0.353325,0.358764,0.275572,0.365663,0.362946,0.264144,0.372910,0.362027,0.261464,0.376508,2.73,3.70,2.75,2.73,3.54,2.60,2.64,3.54,2.68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [80]:
# The Date at index label 0 of the test file is the boundary: earlier matches
# form the training set, matches on or after it form the test set.
pivot_date = testing_dataset["Date"][0]
before_pivot = dataset["Date"] < pivot_date
from_pivot = dataset["Date"] >= pivot_date

# Separate the feature columns from the single-column target frame.
features = dataset.drop(columns="FTR_encoded")
target = dataset.loc[:, ["FTR_encoded"]]

X_train = features.loc[before_pivot, :]
X_test = features.loc[from_pivot, :]

y_train = target.loc[before_pivot, :]
y_test = target.loc[from_pivot, :]

In [81]:
# Write each split to its own CSV file (the row index is written as the first column).
for csv_path, split_frame in (
    ("../data/X_train.csv", X_train),
    ("../data/y_train.csv", y_train),
    ("../data/X_test.csv", X_test),
    ("../data/y_test.csv", y_test),
):
    split_frame.to_csv(csv_path)