In [2]:
import numpy as np
import pandas as pd
import time
import datetime
from matplotlib import pyplot as plt
import re
import importlib

import data_preprocessing as prep

# Re-execute the already-imported data_preprocessing module so that edits made
# to its source file are picked up without restarting the kernel.
importlib.reload(prep)

<module 'data_preprocessing' from 'c:\\Users\\Uzivatel\\Documents\\Py\\AIMatch\\data_preprocessing.py'>

#### Ideas:

- How can recent matches be weighted more heavily than older ones?

### Load and preprocess data

In [3]:
# Column lists handed to prep.preprocess_data. What exactly "drop" and "strip"
# do is defined in data_preprocessing.py (presumably: remove the columns /
# trim whitespace from their string values -- confirm there).
DROP_COLUMNS = ["city", "country", "tournament", "neutral", "date"]
STRIP_COLUMNS = ["home_team", "away_team", "tournament"]

df = prep.load_raw_data()
df = prep.preprocess_data(
    df,
    drop_columns=DROP_COLUMNS,
    strip_columns=STRIP_COLUMNS,
)
display(df)

Unnamed: 0,home_team,away_team,home_score,away_score,tournament_group,home_advantage
0,Scotland,England,0.0,0.0,Friendly,1
1,England,Scotland,4.0,2.0,Friendly,1
2,Scotland,England,2.0,1.0,Friendly,1
3,England,Scotland,2.0,2.0,Friendly,1
4,Scotland,England,3.0,0.0,Friendly,1
...,...,...,...,...,...,...
44119,Winners Match 51,Winners Match 52,,,World Cup,0
44120,Winners Match 57,Winners Match 58,,,World Cup,0
44121,Winners Match 59,Winners Match 60,,,World Cup,0
44122,Losers Match 61,Losers Match 62,,,World Cup,0


### Split labeled and unlabeled data

In [10]:
# A match counts as labeled when its home score is present; rows without a
# home score form the set to be predicted.
has_home_score = df.home_score.notnull()
df_labeled = df[has_home_score]
df_test = df[~has_home_score]

In [14]:
def is_test_team(match, teams):
    """Tell whether a match involves at least one of the given teams.

    Parameters
    ----------
    match : mapping
        Object indexable by ``"home_team"`` and ``"away_team"`` (for example
        a DataFrame row or a dict).
    teams : container
        Collection of team names supporting the ``in`` operator.

    Returns
    -------
    bool
        True if the home team or the away team is contained in ``teams``.
    """
    # The generator is evaluated lazily, so the away side is only looked up
    # when the home side is not a member.
    return any(match[side] in teams for side in ("home_team", "away_team"))

# Gather every name that appears on either side of an unlabeled fixture.
fixture_teams = set(df_test.home_team.unique()) | set(df_test.away_team.unique())

# Drop placeholder entries: names containing "group" or "match"
# (e.g. "Winners Match 51") are not actual teams.
teams_test = {
    team
    for team in fixture_teams
    if not ("group" in team.lower() or "match" in team.lower())
}
print(teams_test, len(teams_test))

# Keep the labeled matches in which at least one side is a test team.
# A vectorized `isin` mask replaces the row-wise `apply`: it avoids one Python
# call per row and always yields a boolean Series, even for an empty frame.
involves_test_team = (
    df_labeled.home_team.isin(teams_test) | df_labeled.away_team.isin(teams_test)
)
df_labeled_testonly = df_labeled[involves_test_team]

print("Number of relevant matches: ", len(df_labeled_testonly))

{'Costa Rica', 'Ecuador', 'Argentina', 'South Korea', 'England', 'Tunisia', 'Japan', 'United States', 'Saudi Arabia', 'Morocco', 'Denmark', 'France', 'Wales', 'Brazil', 'Cameroon', 'Canada', 'Uruguay', 'Senegal', 'Portugal', 'Croatia', 'Netherlands', 'Germany', 'Serbia', 'Ghana', 'Belgium', 'Spain', 'Mexico', 'Qatar', 'Poland', 'Switzerland', 'Iran', 'Australia'} 32
Number of relevant matches:  18532


### Data vectorization

In [19]:
def onehot_encode(all_teams, team):
    """Build a one-hot vector marking the position of ``team`` in ``all_teams``.

    Parameters
    ----------
    all_teams : sequence
        Ordered sequence of team names; must provide ``.index`` and ``len``.
    team : object
        Entry to encode.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(all_teams)`` filled with zeros except for a
        single 1 at ``all_teams.index(team)``.

    Raises
    ------
    ValueError
        Propagated from ``all_teams.index`` when ``team`` is not present.
    """
    position = all_teams.index(team)
    vector = np.zeros(len(all_teams))
    vector[position] = 1
    return vector

# Sort the team names so the encoding order is stable; a team's position in
# this list is the index of the 1 in its one-hot vector.
all_teams = sorted(teams_test)

# Lookup table: team name -> its one-hot vector.
team_to_onehot = {team: onehot_encode(all_teams, team) for team in all_teams}
print(team_to_onehot)

{'Argentina': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'Australia': array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'Belgium': array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'Brazil': array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'Cameroon': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'Canada': array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'Costa Rica': array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0