In [3]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression, BayesianRidge, ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import r2_score, mean_squared_error, get_scorer_names
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import resample
from sklearn.decomposition import PCA, TruncatedSVD
from category_encoders.target_encoder import TargetEncoder

In [4]:
# Get the dataset
# Read the Stata file into a DataFrame and save a CSV copy of it next to
# the source file. Every later cell works from `df_all`.
file_path = 'data/soccer_data.dta'
csv_path = 'data/soccer_data.csv'
df_all = pd.read_stata(file_path)
df_all.to_csv(csv_path)

In [5]:
# Show the distinct values of the `competition` column
df_all['competition'].unique()

array(['Euro 2016', 'Premier League 2017-18', 'Bundesliga 2017-18',
       'World Cup 2018'], dtype=object)

In [12]:
# Choose the ratings: build one subset of df_all per value of the `rat` column
rating_source = df_all['rat']
df_kik = df_all[rating_source == 'Kicker']
df_who = df_all[rating_source == 'WhoScored']
df_sof = df_all[rating_source == 'SofaScore']  # not enough data to do anything useful
df_grd = df_all[rating_source == 'The Guardian']  # not enough data to do anything useful
df_sky = df_all[rating_source == 'SkySports']
df_bld = df_all[rating_source == 'Bild']  # not enough data to do anything useful

# Print the (rows, columns) shape of each subset, in the order defined above
for rating_subset in (df_kik, df_who, df_sof, df_grd, df_sky, df_bld):
    print(rating_subset.shape)

(8913, 128)
(17132, 128)
(2390, 128)
(566, 128)
(9236, 128)
(5452, 128)


array(['Euro 2016', 'Premier League 2017-18', 'Bundesliga 2017-18',
       'World Cup 2018'], dtype=object)

In [7]:
def clean_columns(df):
    """Build the modelling frame from a raw ratings DataFrame.

    Drops the columns listed below and derives three new columns from the
    dropped ``match`` and ``date`` columns:

    * ``result``  -- 1, 0 or -1 when the first score at the end of the
      ``match`` string is greater than, equal to or less than the second.
    * ``weekday`` -- ``dt.weekday`` of the parsed ``date``.
    * ``month``   -- ``dt.month`` of the parsed ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in the drop lists, including
        ``match`` (a string ending in ``"<score> - <score>"``) and ``date``
        (parsed with the format ``%d/%m/%Y``).

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the columns to drop is missing (raised by ``DataFrame.drop``).
    ValueError
        If a ``match`` value does not end in two integer scores separated
        by ``" - "``.
    """
    # drop some columns
    useless_columns = ['v1', 'rating', 'team_rating', 'kicker', 'bild', 'skysports', 'goalkeeper', 'defender', 'midfielder', 'forward']
    redundant_columns = ['competition_id', 'match_id', 'team_id', 'player', 'win', 'lost', 'position']
    nonfeature_columns = ['player_id', 'team_pos_rating', 'team_rating_original', 'past_performances', 'rat', 'is_human']
    different_encoded_columns = ['match', 'date']
    df_clean = df.drop(columns=useless_columns + redundant_columns + nonfeature_columns + different_encoded_columns)

    def get_match_result(match: str) -> int:
        """Return the sign of (first score - second score) parsed from the end of ``match``."""
        if not isinstance(match, str):
            raise ValueError(f"cannot parse match result from {match!r}")

        # Split on the LAST " - " so separators earlier in the string are
        # ignored; the first score is then the last whitespace-delimited
        # token before it. This handles multi-digit scores, which a
        # fixed-width slice of the last five characters does not.
        head, separator, second_score = match.strip().rpartition(" - ")
        first_score = head.rsplit(" ", 1)[-1]
        try:
            if not separator:
                raise ValueError
            first_goals, second_goals = int(first_score), int(second_score)
        except ValueError:
            raise ValueError(f"cannot parse match result from {match!r}") from None

        # Compare as integers: comparing the raw strings is lexicographic
        # and would rank "10" below "2".
        return (first_goals > second_goals) - (first_goals < second_goals)

    # add columns
    df_clean['result'] = df['match'].map(get_match_result)

    # date
    date = pd.to_datetime(df['date'], format="%d/%m/%Y")
    df_clean['weekday'] = date.dt.weekday
    df_clean['month'] = date.dt.month
    return df_clean


def split_dataset(df):
    """Separate the target from the features.

    Returns a ``(X, y)`` tuple: ``y`` is the ``original_rating`` column of
    ``df`` and ``X`` is ``df`` with that column removed.
    """
    target_column = 'original_rating'
    target = df[target_column]
    features = df.drop(columns=[target_column])
    return features, target

In [8]:
def sin_transformer(period):
    """Return a FunctionTransformer that maps ``x`` to ``sin(x / period * 2 * pi)``."""
    def _sine(x):
        return np.sin(x / period * 2 * np.pi)

    return FunctionTransformer(_sine)


def cos_transformer(period):
    """Return a FunctionTransformer that maps ``x`` to ``cos(x / period * 2 * pi)``."""
    def _cosine(x):
        return np.cos(x / period * 2 * np.pi)

    return FunctionTransformer(_cosine)


def _make_dense_one_hot_encoder():
    """Return a OneHotEncoder that ignores unknown categories and outputs a dense array.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old name, so try the new spelling first and fall back to
    the old one on releases that do not know it.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


def create_pipeline(df, model):
    """Build the preprocessing + estimator pipeline for a feature frame.

    Columns of ``df`` are routed by dtype and name:

    * non-object, non-category columns (except ``weekday``/``month``):
      median imputation followed by ``RobustScaler``;
    * object/category columns (except ``team``): most-frequent imputation
      followed by dense one-hot encoding;
    * ``team``: ``TargetEncoder``;
    * ``month`` (period 12) and ``weekday`` (period 7): sine and cosine
      encodings.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame used only to decide which columns go to which
        transformer; it is not fitted here. Expected to contain ``team``,
        ``month`` and ``weekday`` columns.
    model
        Estimator placed as the final step, named ``"classifier"``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``"preprocessor"`` and ``"classifier"``.
    """
    # define feature types
    cyclic_features = {'weekday', 'month'}
    team_feature = ['team']

    # Keep the DataFrame's own column order instead of going through sets:
    # set iteration order for strings changes between interpreter runs, which
    # would shuffle the columns of the transformed matrix from run to run.
    numeric_features = [
        column
        for column in df.select_dtypes(exclude=["category", "object"]).columns
        if column not in cyclic_features
    ]
    categorical_features = [
        column
        for column in df.select_dtypes(include=['category', "object"]).columns
        if column not in cyclic_features and column not in team_feature
    ]

    # transformer for numeric features
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(missing_values=np.nan, strategy='median')),
            ("scaler", RobustScaler()),
        ]
    )

    # transformer for categorical features
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
            ("ohe", _make_dense_one_hot_encoder()),
        ]
    )

    # preprocessing transformer, applies different transformations on different features
    preprocessor = ColumnTransformer(
        transformers=[
            ("numeric", numeric_transformer, numeric_features),
            ("categorical", categorical_transformer, categorical_features),
            ("team", TargetEncoder(handle_missing='value', handle_unknown='value'), team_feature),
            ("month_sin", sin_transformer(12), ["month"]),
            ("month_cos", cos_transformer(12), ["month"]),
            ("weekday_sin", sin_transformer(7), ["weekday"]),
            ("weekday_cos", cos_transformer(7), ["weekday"]),
        ]
    )

    # final pipeline: preprocessing + estimator (the step keeps its original
    # name "classifier" so existing lookups by step name still work)
    pipe = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            # ('pca', PCA(n_components='mle')),
            # ('svd', TruncatedSVD(n_components=100)),
            ("classifier", model),
        ]
    )

    return pipe


In [9]:
# Sanity check: fit only the preprocessing steps on the Kicker subset and
# summarise the transformed feature matrix.
df_temp = clean_columns(df_kik)
X_temp, y_temp = split_dataset(df_temp)
preprocessor = create_pipeline(X_temp, None)
del preprocessor.steps[-1]  # remove the placeholder final step, keep preprocessing only
temp = pd.DataFrame(preprocessor.fit_transform(X_temp, y_temp))
temp.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,114,115,116,117,118,119,120,121,122,123
count,8913.0,8913.0,8913.0,8913.0,8913.0,8913.0,8913.0,8913.0,8913.0,8913.0,...,8913.0,8913.0,8913.0,8913.0,8913.0,8913.0,8913.0,8913.0,8913.0,8913.0
mean,0.222475,0.071188,0.239538,0.23045,0.151464,2.828415,0.80736,0.140918,0.068608,0.140806,...,0.017166,0.091439,0.139347,0.033771,0.033883,3.481544,0.02551867,-0.09615961,-0.564855,-0.03126
std,0.856265,0.611601,0.556057,0.829333,0.47721,5.779227,1.309415,0.613014,0.423885,0.34784,...,0.129897,0.288249,0.346328,0.180649,0.180938,0.227052,0.6720358,0.7338808,0.629104,0.533166
min,-0.692308,-0.5,0.0,-0.8,0.0,-1.228898,0.0,0.0,-0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,3.036364,-1.0,-1.0,-0.974928,-0.900969
25%,-0.384615,-0.5,0.0,-0.4,0.0,-0.011445,0.0,0.0,-0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,3.341463,-0.5,-0.8660254,-0.974928,-0.222521
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.465217,1.224647e-16,-1.83697e-16,-0.781831,-0.222521
75%,0.615385,0.5,0.0,0.6,0.0,0.988555,1.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,3.560606,0.8660254,0.5,-0.433884,0.62349
max,7.307692,3.5,5.0,5.4,6.0,24.11588,13.0,8.0,0.5,1.0,...,1.0,1.0,1.0,1.0,1.0,4.30303,1.0,1.0,0.974928,1.0


In [10]:
class Bootstrap:
    """Bootstrap splitter that can be passed as ``cv`` to scikit-learn helpers.

    Each split draws ``len(X)`` training indices with replacement and uses
    the indices that were never drawn as the test set. Split ``i`` is seeded
    with ``random_state=i``, so the splits are the same on every call.

    Parameters
    ----------
    nr : int
        Number of bootstrap splits to generate.
    """

    def __init__(self, nr):
        self.nr = nr

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits; the arguments are ignored.

        Provided so the splitter also works with utilities that ask the
        ``cv`` object for its split count.
        """
        return self.nr

    def split(self, X, y=None, groups=None):
        """Return a list of ``nr`` ``(train, test)`` index pairs for ``X``.

        ``y`` and ``groups`` are accepted for API compatibility and ignored.
        """
        n_samples = len(X)
        idx = range(n_samples)
        all_indices = set(idx)  # built once; it is the same for every split
        splits = []
        for i in range(self.nr):
            train = resample(idx, replace=True, n_samples=n_samples, random_state=i)
            # Out-of-bag indices, sorted so their order does not depend on
            # set iteration order.
            test = sorted(all_indices.difference(train))
            splits.append((train, test))
        return splits

In [11]:
def train_and_test(df, model=None, n_bootstraps=10):
    """Fit and evaluate a regression pipeline on one rating subset.

    Cleans ``df``, splits it into features and target, builds the pipeline
    and evaluates it with bootstrap cross-validation. The 95th percentile of
    each metric across the bootstrap splits is printed on one line.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows for one rating source, in the form expected by
        ``clean_columns``.
    model : estimator, optional
        Final estimator of the pipeline. Defaults to a fresh ``Ridge()``.
    n_bootstraps : int, optional
        Number of bootstrap splits. Defaults to 10.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # create datasets
    df_clean = clean_columns(df)
    X, y = split_dataset(df_clean)

    # create the pipeline
    if model is None:
        model = Ridge()
    pipe = create_pipeline(X, model)

    # do cross validation
    s = cross_validate(
        pipe,
        X,
        y,
        cv=Bootstrap(n_bootstraps),
        scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'],
        n_jobs=-1,
    )

    # evaluate; scikit-learn reports errors negated, so flip the sign back
    r2_scores = s['test_r2']
    mae_scores = s['test_neg_mean_absolute_error'] * -1
    mse_scores = s['test_neg_mean_squared_error'] * -1

    # NOTE(review): the 95th percentile is the pessimistic end for the error
    # metrics but the optimistic end for r2 -- confirm this is intended.
    r2 = np.percentile(r2_scores, 95)
    mae = np.percentile(mae_scores, 95)

    # sigma2
    mse = np.percentile(mse_scores, 95)
    n_samples, n_features = X.shape
    # NOTE(review): n_features counts the raw columns of X, not the columns
    # after encoding, and the denominator uses "+ 1" -- confirm against the
    # intended variance estimator.
    sigma2 = mse * (n_samples / (n_samples - n_features + 1))

    print(f'r2: {r2:0.3f}, mae: {mae:0.3f}, sigma2: {sigma2:0.3f}, error: {(mae + sigma2) * 2:0.3f}')


# Run the evaluation on the Kicker, WhoScored and SkySports subsets, in that order
for ratings_df in (df_kik, df_who, df_sky):
    train_and_test(ratings_df)


r2: 0.521, mae: 0.520, sigma2: 0.435, error: 1.909
r2: 0.765, mae: 0.278, sigma2: 0.131, error: 0.818
r2: 0.439, mae: 0.604, sigma2: 0.599, error: 2.407
