In [93]:
import numpy as np
import pandas as pd
import data_preprocessing as prep
import importlib

importlib.reload(prep)

<module 'data_preprocessing' from 'c:\\Users\\bmk1bj\\Documents\\GIT_repositories\\AIMatch\\data_preprocessing.py'>

In [102]:
dataset = prep.Dataset()
X_train, Y_train, X_val, Y_val, X_test = dataset.get_input_data(label_weights=[2, 0.5])

Number of relevant labeled matches: 18532/44060
X shape =  (18532, 457)
Y shape =  (18532, 2)
X shape =  (48, 457)
Y shape =  (48, 2)


In [103]:
def score_sample(y_pred, y_true, label_weights):
    """ 
    Scoring according to the AI match rules
    y_pred = predictions
    y_true = ground truth 
    """
    
    y = np.around(y_pred / label_weights)
    y_gt = np.around(y_true / label_weights)
    #print(y, y_pred, y_true, y_gt)

    if y[0] == y_gt[0] and y[1] == y_gt[1]:
        return 4
    
    if y[0] == y_gt[0] and not y[0] == 0:
        return 3

    if (y[0] >= 0 and y_gt[0] >= 0) or (y[0] < 0 and y_gt[0] < 0):
        return 2   

    return 0

def score(Y, Y_gt, label_weights=[1, 1]):
    return np.sum([score_sample(Y[i], Y_gt[i], np.array(label_weights)) for i in range(len(Y))])

# Reference values
# = total score for validation data if results are hard-coded and all same without any prediction
# all models should overcome those values
max_score = 4 * len(Y_val)
ref_score_1 = score(np.zeros(Y_val.shape) * dataset.label_weights, Y_val, label_weights=dataset.label_weights) # 0:0
print("0:0", ref_score_1, "/", max_score, " - %s points per match" % (np.round(ref_score_1/len(Y_val), 2)))
ref_score_2 = score(np.ones(Y_val.shape) * dataset.label_weights, Y_val, label_weights=dataset.label_weights) # 1:0
print("1:0", ref_score_2, "/", max_score, " - %s points per match" % (np.round(ref_score_2/len(Y_val), 2)))
Y_pred = np.ones(Y_val.shape) * dataset.label_weights # 1:1
Y_pred[:, 0] = 0
ref_score_3 = score(Y_pred, Y_val, label_weights=dataset.label_weights) 
print("1:1", ref_score_3, "/", max_score, " - %s points per match" % (np.round(ref_score_3/len(Y_val), 2)))
Y_pred = np.ones(Y_val.shape) * dataset.label_weights # 0:1
Y_pred[:, 0] = -1
ref_score_3 = score(Y_pred, Y_val, label_weights=dataset.label_weights) 
print("0:1", ref_score_3, "/", max_score, " - %s points per match" % (np.round(ref_score_3/len(Y_val), 2)))
Y_pred = np.ones(Y_val.shape) # 2:1
Y_pred[:, 1] = 2
ref_score_3 = score(Y_pred * dataset.label_weights, Y_val, label_weights=dataset.label_weights) 
print("2:1", ref_score_3, "/", max_score, " - %s points per match" % (np.round(ref_score_3/len(Y_val), 2)))
Y_pred = 2 * np.ones(Y_val.shape) # 2:0
ref_score_3 = score(Y_pred * dataset.label_weights, Y_val, label_weights=dataset.label_weights) 
print("2:0", ref_score_3, "/", max_score, " - %s points per match" % (np.round(ref_score_3/len(Y_val), 2)))

0:0 8856 / 22240  - 1.59 points per match
1:0 9656 / 22240  - 1.74 points per match
1:1 9020 / 22240  - 1.62 points per match
0:1 9020 / 22240  - 1.62 points per match
2:1 9460 / 22240  - 1.7 points per match
2:0 9033 / 22240  - 1.62 points per match


In [104]:
def prediction_to_goals(y_weighted, label_weights):
    """ Encoding back to the exact result 
    [goal_diff, home_goals] -> [home_goals, away_goals]
    """
    y = y_weighted / np.array(label_weights)
    return np.hstack([y[1], y[1] - y[0]])

def predictions_to_goals(Y, label_weights):
    """ Encoding back to the exact result for whole data set
    [goal_diff, home_goals] -> [home_goals, away_goals]
    """
    return np.vstack([prediction_to_goals(Y[i], label_weights) for i in range(len(Y))])

In [107]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import make_scorer



def show_predictions(X, Y, Y_pred, indexes, label_weights):
    for i in indexes:
        x = X[i]
        home_team_part = x[:len(dataset.all_teams)]
        away_team_part = x[len(dataset.all_teams):2*len(dataset.all_teams)]
        home_team = dataset.all_teams[np.where(home_team_part == 1)[0][0]]
        away_team = dataset.all_teams[np.where(away_team_part == 1)[0][0]]
        print(home_team, " x ", away_team, ": ", prediction_to_goals(Y_pred[i], label_weights), "-", prediction_to_goals(Y[i], label_weights), " ...................  output (weighted): ", Y_pred[i], Y[i], "   original: ", Y_pred[i]/label_weights, Y[i]/label_weights)


def fit_simple_regressor(X_train, Y_train, X_val, Y_val, regressor, name: str, label_weights: list, show_predicted_indexes = [0, 1]):
    regressor.fit(X_train, Y_train,  )
    Y_val_pred = regressor.predict(X_val)
    Y_test_pred = regressor.predict(X_test)
    reg_score = score(np.round(Y_val_pred), Y_val, label_weights=label_weights)
    print(name)
    print(reg_score, "/", max_score, " - %s points per match" % (np.round(reg_score/len(Y_val), 2)))
    
    print("Val:")
    show_predictions(X_val, Y_val, Y_val_pred, show_predicted_indexes, label_weights)
    print("Test:")
    show_predictions(X_test, np.zeros(Y_test_pred.shape), Y_test_pred, show_predicted_indexes, label_weights)
    

models = {
    #"Tree": DecisionTreeRegressor(random_state=0),
    "Linear": LinearRegression(),
    "MLP": MLPRegressor(hidden_layer_sizes = (128, 32, 8), activation="relu")
}


trainable = 0.2
start = int((1 - trainable) * len(X_train))
for name, model in models. items():
    fit_simple_regressor(X_train[start:], Y_train[start:], X_val, Y_val, model, name, show_predicted_indexes = [0, 1, 2, 3, 4, 5, 6, 7, 8], label_weights=dataset.label_weights)


Linear
9524 / 22240  - 1.71 points per match
Val:
Morocco  x  Zimbabwe :  [2.24121094 2.03613281] - [1. 0.]  ...................  output (weighted):  [0.41015625 1.12060547] [2.  0.5]    original:  [0.20507812 2.24121094] [1. 1.]
Senegal  x  DR Congo :  [ 2.47753906 -0.48535156] - [0. 0.]  ...................  output (weighted):  [5.92578125 1.23876953] [0. 0.]    original:  [2.96289062 2.47753906] [0. 0.]
Tunisia  x  Ghana :  [1.15527344 0.38183594] - [2. 0.]  ...................  output (weighted):  [1.546875   0.57763672] [4. 1.]    original:  [0.7734375  1.15527344] [2. 2.]
Morocco  x  Angola :  [2.5546875  0.63671875] - [2. 2.]  ...................  output (weighted):  [3.8359375  1.27734375] [0. 1.]    original:  [1.91796875 2.5546875 ] [0. 2.]
Saudi Arabia  x  Sweden :  [1.12695312 2.30273438] - [1. 1.]  ...................  output (weighted):  [-2.3515625   0.56347656] [0.  0.5]    original:  [-1.17578125  1.12695312] [0. 1.]
United Arab Emirates  x  South Korea :  [2.14550781 