In [51]:
import pandas as pd
import numpy as np

# Loading the dataset

In [52]:
# Read the match dataset from disk, parsing the 'date' column as datetimes.
df = pd.read_csv(
    "./dataset/dataset.csv",
    parse_dates=["date"],
)

# Display the loaded frame (pandas truncates long frames in the output).
df

Unnamed: 0,season,date,home,away,fulltime_home_goals,fulltime_away_goals,fulltime_result,halftime_home_goals,halftime_away_goals,halftime_result,...,home_fouls_committed,away_fouls_committed,home_fouls_won,away_fouls_won,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards,home_xg,away_xg
0,2017_2018,2017-08-18,Leganes,Alaves,1,0,H,1,0,H,...,17,17,16,17,0,1,0,0,1.3,1.1
1,2017_2018,2017-08-18,Valencia,Las Palmas,1,0,H,1,0,H,...,25,13,13,24,3,2,0,1,1.4,0.2
2,2017_2018,2017-08-19,Celta Vigo,Real Sociedad,2,3,A,1,1,D,...,12,11,10,11,3,1,0,0,1.8,2.1
3,2017_2018,2017-08-19,Girona,Atletico Madrid,2,2,D,2,0,H,...,15,15,14,15,2,4,0,1,2.2,0.7
4,2017_2018,2017-08-19,Sevilla,Espanyol,1,1,D,1,1,D,...,14,12,12,14,2,4,1,0,2.4,1.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2117,2022_2023,2023-02-26,Athletic Bilbao,Girona,2,3,A,1,3,A,...,17,8,7,17,5,5,0,0,1.6,1.2
2118,2022_2023,2023-02-26,Celta Vigo,Real Valladolid,3,0,H,2,0,H,...,6,5,4,6,1,3,0,1,1.8,0.5
2119,2022_2023,2023-02-26,Almeria,Barcelona,1,0,H,1,0,H,...,13,11,11,13,3,3,0,0,1.0,1.1
2120,2022_2023,2023-02-26,Sevilla,Osasuna,2,3,A,0,1,A,...,10,15,15,9,4,2,1,0,2.2,1.0


# Dealing with missing values

The following features contain missing values:

In [53]:
# Per-column count of missing (NaN) entries.
isna_sum = df.isna().sum(axis="index")
# Keep only the columns that have at least one missing entry.
isna_features = isna_sum.loc[isna_sum.gt(0)]

isna_features

pinnaclesports_home_win_odds              3
pinnaclesports_draw_odds                  3
pinnaclesports_away_win_odds              3
pinnaclesports_closing_home_win_odds      1
pinnaclesports_closing_draw_odds          1
pinnaclesports_closing_away_win_odds      1
attendance                              487
dtype: int64

'attendance' (the total number of people attending the venue to watch the match) contains too many missing values (487), and since it cannot be deduced from the other features, the best strategy is to drop this feature.

In [54]:
# Remove the 'attendance' column; every other column is kept as-is.
df = df.drop(columns=["attendance"])

As for the other columns with missing values, only a handful of rows are affected (at most 3 per column), so those rows can safely be dropped.

In [55]:
# Discard every row that still has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [57]:
# Show the column labels of the frame after the cleaning steps above.
df.columns

Index(['season', 'date', 'home', 'away', 'fulltime_home_goals',
       'fulltime_away_goals', 'fulltime_result', 'halftime_home_goals',
       'halftime_away_goals', 'halftime_result', 'bet365_home_win_odds',
       'bet365_draw_odds', 'bet365_away_win_odds', 'betandwin_home_win_odds',
       'betandwin_draw_odds', 'betandwin_away_win_odds',
       'interwetten_home_win_odds', 'interwetten_draw_odds',
       'interwetten_away_win_odds', 'pinnaclesports_home_win_odds',
       'pinnaclesports_draw_odds', 'pinnaclesports_away_win_odds',
       'williamhill_home_win_odds', 'williamhill_draw_odds',
       'williamhill_away_win_odds', 'vcbet_home_win_odds', 'vcbet_draw_odds',
       'vcbet_away_win_odds', 'pinnaclesports_closing_home_win_odds',
       'pinnaclesports_closing_draw_odds',
       'pinnaclesports_closing_away_win_odds', 'home_pre_rating',
       'home_rating_delta', 'home_post_rating', 'away_pre_rating',
       'away_rating_delta', 'away_post_rating', 'venue', 'home_possession',