In [119]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LinearRegression

In [251]:
# Load the training data; the first CSV column is used as the row index.
# NOTE(review): the indented echo of this statement in the cell output looks like
# the tail of a Python warning raised by this call (possibly a pandas DtypeWarning
# about mixed-type columns) — confirm, and if so pin the affected columns with `dtype=`.
df = pd.read_csv(
    "train_data.csv",
    index_col=0,
)

  df = pd.read_csv("train_data.csv", index_col=0) # Load the training data


In [268]:
# Columns examined in this first pass: the outcome plus the per-match statistics.
examined_columns = [
    "FTR",  # Dependent variable
    "Attendance", "HS", "AS", "HST", "AST", "HHW", "AHW", "HC", "AC",
    "HF", "AF", "HFKC", "AFKC", "HO", "AO", "HY", "AY", "HR", "AR", "HBP", "ABP",
]
X = df[examined_columns].copy()  # work on a copy so this cell leaves `df` untouched

# Encode the outcome labels numerically ("A" -> -1, "D" -> 0, "H" -> 1;
# presumably away win / draw / home win).
# Only map while the column still holds labels: a later cell encodes df["FTR"] in
# place, and mapping an already encoded column through this dict yields all-NaN.
# Testing for "any numeric dtype" covers both the int64 and the float64 result of
# that encoding, whereas comparing against float64 alone misses the int64 case.
if not pd.api.types.is_numeric_dtype(X["FTR"]):
    X["FTR"] = X["FTR"].map(
        {"D": 0, "A": -1, "H": 1}
    )

In [269]:
# Count the missing values of every column and show the nine columns with the most.
# Several variables are missing in almost every row (the data has roughly 153k
# entries), so those variables are dropped in the next step.
missing_per_column = X.isna().sum()
missing_per_column.sort_values(ascending=False).head(9)

AFKC          153867
HFKC          153867
AO            148674
HHW           148666
AHW           148654
HO            148653
ABP           148502
Attendance    148497
HBP           148486
dtype: int64

In [270]:
# Variables that are missing in nearly every row (see the counts listed above).
sparse_columns = ["HFKC", "AFKC", "AO", "HO", "HHW", "AHW", "HBP", "ABP", "Attendance"]
# Rebind instead of dropping in place, and tolerate columns that are already gone,
# so that re-running this cell does not raise a KeyError.
X = X.drop(columns=sparse_columns, errors="ignore")
X.isna().sum().sort_values(ascending=False)[:5]
# The remaining columns still have gaps, but dropping their incomplete rows
# will not remove most of the data

HF     65013
AF     65004
HST    63882
AST    63855
HC     63377
dtype: int64

In [271]:
# Keep only the rows without missing values, then split off the dependent
# variable so that X holds the predictors only.
X = X.dropna()
y = X["FTR"]
X = X.drop(columns="FTR")

In [272]:
# Estimate the mutual information between every predictor and the outcome.
# The estimator adds small random noise to continuous features, so the scores
# change from run to run unless the random state is fixed.
mutual_info_scores = mutual_info_classif(X, y, random_state=0)
for feature, score in zip(X.columns, mutual_info_scores):
    print(f"{feature}: {score}")
# Some variables carry almost no information about the dependent variable;
# those are candidates for dropping as well

HS: 0.016418610620113627
AS: 0.016391956573126087
HST: 0.061069021464353
AST: 0.06120874609405291
HC: 0.00233621778902271
AC: 0.002256350139400398
HF: 0.0
AF: 0.002287879626687861
HY: 0.0072448686291139985
AY: 0.0024203048760282897
HR: 0.007968527624721888
AR: 0.006269654091008325


In [273]:
# Pearson correlation of each predictor with the encoded outcome.
for feature, values in X.items():
    # corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is the
    # correlation between this predictor and the target.
    coefficient = np.corrcoef(values, y)[1, 0]
    print(f"Correlation between {feature} and target is {round(coefficient, 2)}")
# A few variables are weakly correlated with the target; the others show
# essentially no linear relationship with it

Correlation between HS and target is 0.17
Correlation between AS and target is -0.17
Correlation between HST and target is 0.31
Correlation between AST and target is -0.31
Correlation between HC and target is -0.02
Correlation between AC and target is 0.0
Correlation between HF and target is -0.0
Correlation between AF and target is 0.0
Correlation between HY and target is -0.08
Correlation between AY and target is 0.03
Correlation between HR and target is -0.13
Correlation between AR and target is 0.12


In [274]:
X.shape
# (rows, columns) of the predictor frame once incomplete rows were dropped:
# this analysis has worked with roughly 88k entries

(88813, 12)

In [275]:
# Fit an ordinary least-squares model on the numerically encoded outcome.
# (LinearRegression is imported with the other dependencies at the top.)
model = LinearRegression()
model.fit(X, y)

model.score(X, y)
# The score is the R^2 on the same rows the model was fitted on, i.e. a measure
# of overall in-sample fit rather than of the importance of individual variables.
# It is around 0.25, which suggests that more variables should be included in
# future models

0.2492031165585853

In [276]:
# Rank every column of the raw data by its number of missing values (fewest
# first) and list positions 4 through 21 of that ranking: more candidate columns
# which don't have a lot of missing values.
missing_counts = df.isna().sum().sort_values()
missing_counts.index[4:22].tolist()

['FTHG',
 'Div',
 'FTAG',
 'HTAG',
 'HTHG',
 'Date',
 'HTR',
 'HomeTeam',
 'AwayTeam',
 'IWA',
 'IWD',
 'IWH',
 'WHD',
 'WHA',
 'WHH',
 'B365A',
 'B365H',
 'B365D']

In [277]:
# Extra candidate columns selected from the listing produced by the previous cell.
# NOTE(review): FTHG/FTAG look like the full-time goal counts that FTR is derived
# from — if so they leak the target into the predictors; confirm before reusing
# this feature set in a predictive model.
more_columns = ["FTHG", "FTAG", "HTAG", "HTHG", "IWA", "IWD", "IWH", "WHD", "WHA", "WHH", "B365A", "B365H", "B365D"]

# Encode the outcome labels numerically, directly on `df` as before.
# Only map while the column still holds labels: mapping an already encoded column
# through this dict yields all-NaN, and the encoded column can be int64 as well
# as float64, so comparing the dtype against float64 alone is not a safe guard.
if not pd.api.types.is_numeric_dtype(df["FTR"]):
    df["FTR"] = df["FTR"].map(
        {"D": 0, "A": -1, "H": 1}
    )

# These are the mostly-missing variables removed earlier (not categorical ones).
sparse_columns = ["HFKC", "AFKC", "AO", "HO", "HHW", "AHW", "HBP", "ABP", "Attendance"]
X = (
    df[examined_columns + more_columns]  # previously used columns plus the new ones
    .drop(columns=sparse_columns)
    .dropna()  # keep complete rows only
)
y = X.pop("FTR")  # split off the dependent variable

In [278]:
# Refit the linear model on the widened feature set and report its in-sample R^2.
model = LinearRegression().fit(X, y)
model.score(X, y)
# This model's R^2 is 0.59 with the additional columns.
# NOTE(review): the added FTHG/FTAG columns presumably determine FTR directly, so
# the gain may reflect target leakage rather than better predictors — confirm.

0.5901337834862573

In [264]:
X.columns
# These are the final used variables
# NOTE(review): this cell's execution count (264) is lower than that of the cells
# above it (268-278), so the listing may stem from an earlier run — re-run the
# notebook top to bottom to confirm.

Index(['HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR',
       'AR', 'FTHG', 'FTAG', 'HTAG', 'HTHG', 'IWA', 'IWD', 'IWH', 'WHD', 'WHA',
       'WHH', 'B365A', 'B365H', 'B365D'],
      dtype='object')