In [1]:
import gc, warnings
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from keras.layers import Dense,Dropout
from keras.models import Sequential
from sklearn.model_selection import train_test_split,KFold, GroupKFold, StratifiedKFold
import tensorflow as tf
from tqdm import tqdm

# Silence every warning for the rest of the session (library deprecation noise).
warnings.simplefilter('ignore')
# Show all columns when a DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

In [2]:
# The three competition CSVs live in the same input folder.
input_dir = "../input/zindi-football-league-winners-prediction-challenge"
train = pd.read_csv(f"{input_dir}/Train.csv")
test = pd.read_csv(f"{input_dir}/Test.csv")
stats = pd.read_csv(f"{input_dir}/game_statistics.csv")

# Preview the match table and the event-level statistics table.
display(train.head(), stats.head())

Unnamed: 0,Date,Season,Match_ID,Game_ID,Home Team,Away Team,Score
0,2017-02-15,1,1.0,ID_KAG4KAE9,Antennae,Andromeda,Away win
1,2016-09-14,1,12.0,ID_T7H1XN5Z,Andromeda,Antennae,Draw
2,2016-12-12,1,13.0,ID_O1P9FGN4,Andromeda,Butterfly,Away win
3,2017-01-25,1,14.0,ID_TR5M4KQV,Andromeda,Cigar,Away win
4,2016-08-24,1,15.0,ID_EOIECRE2,Andromeda,Circinus,Home Win


Unnamed: 0,Game_ID,Player_ID,id,X,Y,Team,Action,Half,Manager,Opposition_Team,Shots,SoT,Accurate passes,Inaccurate passes,Passes,Start_minutes,End_minutes,next_player,next_action,next_x,next_y,event_id,next_team,next_event_id,xt_value
0,ID_ZZKYYPVY,Player_O7HO1NOB,1.0,52.5,34.0,Circinus,Accurate passes,1st half,Ryoichi,Backward,0.0,0.0,1.0,0.0,1.0,0.03,0.37,,,,,,,,
1,ID_ZZKYYPVY,Player_O7HO1NOB,2.0,52.5,34.0,Circinus,Positional attacks,1st half,Ryoichi,Backward,0.0,0.0,0.0,0.0,0.0,0.03,0.37,,,,,,,,
2,ID_ZZKYYPVY,Player_J0V7B0YE,3.0,37.0,32.7,Circinus,Positional attacks,1st half,Ryoichi,Backward,0.0,0.0,0.0,0.0,0.0,0.05,0.38,,,,,,,,
3,ID_ZZKYYPVY,Player_J0V7B0YE,4.0,39.5,33.0,Circinus,Positional attacks,1st half,Ryoichi,Backward,0.0,0.0,0.0,0.0,0.0,0.07,0.4,,,,,,,,
4,ID_ZZKYYPVY,Player_J0V7B0YE,5.0,39.5,33.0,Circinus,Inaccurate passes,1st half,Ryoichi,Backward,0.0,0.0,0.0,1.0,1.0,0.07,0.4,,,,,,,,


In [3]:
# --- Event-level derived columns (added to `stats` before any aggregation) ---
# Duration of each event and the product of its start/end timestamps.
stats['diff'] = stats['End_minutes'] - stats['Start_minutes']
stats['mult'] = stats['End_minutes'] * stats['Start_minutes']

# Indicator columns for the half in which each event happened.
temp = pd.get_dummies(stats['Half'])
stats['1st half'] = temp['1st half'].values
stats['2nd half'] = temp['2nd half'].values

# --- Per-game aggregates: every table below has one row per Game_ID ---
by_game = stats.groupby('Game_ID')

# Number of distinct players / distinct event ids recorded in the game.
players = by_game['Player_ID'].nunique().reset_index()
ids = by_game['id'].nunique().reset_index()

# Totals of the shot and pass counters.
passes = by_game[['Shots', 'SoT', 'Accurate passes', 'Inaccurate passes', 'Passes']].sum().reset_index()

# Mean event position and timing.
coordinates_time = by_game[['diff', 'mult', 'X', 'Y', 'Start_minutes', 'End_minutes']].mean().reset_index()

# Spread of event durations, renamed so it does not clash with the mean 'diff'.
total_time = by_game[['diff']].std().reset_index().rename(columns = {'diff': 'diff2'})

# Number of events recorded in each half.
half = by_game[['1st half', '2nd half']].sum().reset_index()

# Team / opposition taken from the first event row of each game.
teams = (
    stats[['Game_ID', 'Team', 'Opposition_Team']]
    .drop_duplicates(subset = "Game_ID")
    .reset_index(drop = True)
)

In [4]:
# Stack train and test so both receive identical feature engineering.
all_data = pd.concat([train, test]).reset_index(drop = True)

# Attach each per-game feature table in turn. `merge` defaults to an inner
# join, so a game missing from any feature table is dropped from all_data.
for game_features in (players, ids, passes, coordinates_time, total_time, half, teams):
    all_data = all_data.merge(game_features, on = 'Game_ID')

all_data.head()

Unnamed: 0,Date,Season,Match_ID,Game_ID,Home Team,Away Team,Score,Player_ID,id,Shots,SoT,Accurate passes,Inaccurate passes,Passes,diff,mult,X,Y,Start_minutes,End_minutes,diff2,1st half,2nd half,Team,Opposition_Team
0,2017-02-15,1,1.0,ID_KAG4KAE9,Antennae,Andromeda,Away win,27,3726,13.0,3.0,698.0,150.0,848.0,0.334498,2916.610068,53.146189,34.24643,46.014536,46.349034,0.023472,1899.0,1827.0,Andromeda,Antennae
1,2016-09-14,1,12.0,ID_T7H1XN5Z,Andromeda,Antennae,Draw,28,4199,15.0,1.0,740.0,165.0,905.0,0.333777,3347.431727,53.393379,33.421267,49.9813,50.315077,0.016295,1988.0,2211.0,Andromeda,Antennae
2,2016-12-12,1,13.0,ID_O1P9FGN4,Andromeda,Butterfly,Away win,27,4291,9.0,2.0,833.0,224.0,1057.0,0.333123,3011.997486,51.870077,35.331531,46.435824,46.768947,0.012552,2143.0,2148.0,Butterfly,Andromeda
3,2017-01-25,1,14.0,ID_TR5M4KQV,Andromeda,Cigar,Away win,25,3779,16.0,5.0,634.0,196.0,830.0,0.333477,3161.256018,52.166817,33.89566,48.187952,48.521429,0.012344,1846.0,1933.0,Cigar,Andromeda
4,2016-08-24,1,15.0,ID_EOIECRE2,Andromeda,Circinus,Home Win,28,4422,18.0,2.0,888.0,220.0,1108.0,0.333991,3023.191802,51.065966,34.477951,47.232379,47.56637,0.022252,2144.0,2278.0,Andromeda,Circinus


In [5]:
# Give every team name one integer code shared by all four team columns.
# Codes are assigned in order of first appearance in 'Team', then extended with
# names that occur only in the other columns. Building the mapping from 'Team'
# alone left such names unmapped (NaN), and the later `test.fillna(0)` turned
# those NaNs into 0, i.e. silently relabelled them as the team with code 0.
team_columns = ['Team', 'Home Team', 'Away Team', 'Opposition_Team']
team_names = pd.concat([all_data[col] for col in team_columns], ignore_index = True).dropna().unique()
map_team = {team: index for index, team in enumerate(team_names)}

for col in team_columns:
    all_data[col] = all_data[col].map(map_team)

# Replace the raw date with numeric year / month / day features.
match_dates = pd.to_datetime(all_data['Date'])
all_data['Year'] = match_dates.dt.year
all_data['Month'] = match_dates.dt.month
all_data['Day'] = match_dates.dt.day
del all_data['Date']

# Encode Game_ID as integers; the fitted encoder is kept so the original
# string ids can be restored with game_le.inverse_transform.
game_le = LabelEncoder()
all_data['Game_ID'] = game_le.fit_transform(all_data['Game_ID'].values)

In [6]:
# Integer class label for each match outcome.
score_map = {'Home Win': 0, 'Away win': 1, 'Draw': 2}

# Rows with a known Score form the training set; rows without one are the test set.
has_label = all_data['Score'].notnull()
train = all_data[has_label].reset_index(drop = True)
test = all_data[~has_label].reset_index(drop = True).fillna(0)

train['Score'] = train['Score'].map(score_map)
# The test frame carries no target column.
test = test.drop(columns = 'Score')
train.head()

Unnamed: 0,Season,Match_ID,Game_ID,Home Team,Away Team,Score,Player_ID,id,Shots,SoT,Accurate passes,Inaccurate passes,Passes,diff,mult,X,Y,Start_minutes,End_minutes,diff2,1st half,2nd half,Team,Opposition_Team,Year,Month,Day
0,1,1.0,353,8,0,1,27,3726,13.0,3.0,698.0,150.0,848.0,0.334498,2916.610068,53.146189,34.24643,46.014536,46.349034,0.023472,1899.0,1827.0,0,8,2017,2,15
1,1,12.0,519,0,8,2,28,4199,15.0,1.0,740.0,165.0,905.0,0.333777,3347.431727,53.393379,33.421267,49.9813,50.315077,0.016295,1988.0,2211.0,0,8,2016,9,14
2,1,13.0,415,0,1,1,27,4291,9.0,2.0,833.0,224.0,1057.0,0.333123,3011.997486,51.870077,35.331531,46.435824,46.768947,0.012552,2143.0,2148.0,1,0,2016,12,12
3,1,14.0,527,0,2,1,25,3779,16.0,5.0,634.0,196.0,830.0,0.333477,3161.256018,52.166817,33.89566,48.187952,48.521429,0.012344,1846.0,1933.0,2,0,2017,1,25
4,1,15.0,250,0,16,0,28,4422,18.0,2.0,888.0,220.0,1108.0,0.333991,3023.191802,51.065966,34.477951,47.232379,47.56637,0.022252,2144.0,2278.0,0,16,2016,8,24


In [7]:
def get_kfold_preds(model, n_splits = 10, seed = 42):
    """Cross-validate ``model`` on the global ``train`` frame and predict the global ``test`` frame.

    The same ``model`` object is refit on every stratified fold. Each fold's
    validation probabilities are stored in the out-of-fold matrix, and the
    test-set probabilities of all folds are averaged.

    Parameters
    ----------
    model : estimator
        Must provide ``fit``, ``predict`` and ``predict_proba``. If its ``fit``
        accepts ``eval_set``/``verbose`` the validation fold is passed as
        ``eval_set``; otherwise it is fit on the training fold only.
    n_splits : int, default 10
        Number of stratified folds.
    seed : int, default 42
        ``random_state`` used to shuffle the folds.

    Returns
    -------
    test_preds : ndarray
        Class probabilities for the rows of ``test``, averaged over folds.
    oof_predictions : ndarray of shape (len(train), n_classes)
        Out-of-fold class probabilities for the rows of ``train``.
    """
    # Identifier-like columns and the target are not used as features.
    excluded = {'Game_ID', 'Match_ID', 'Player_ID', 'Opposition_Team', 'Score'}
    cols = [col for col in train.columns if col not in excluded]

    # Build the arrays once instead of re-extracting them from the frames on every fold.
    X, y = train[cols].values, train['Score'].values
    X_test = test[cols].values

    n_classes = len(np.unique(y))
    test_preds, oof_predictions = None, np.zeros((len(train), n_classes))

    skf = StratifiedKFold(n_splits = n_splits , shuffle = True , random_state = seed)
    for fold, (tr_index , val_index) in enumerate(skf.split(X , y)):

        print("-" * 50)
        print(f"Fold {fold + 1}")

        x_train, x_val = X[tr_index], X[val_index]
        y_train, y_val = y[tr_index], y[val_index]

        # NOTE(review): the validation fold is handed to the model as eval_set.
        # Libraries that pick their best iteration on eval_set (CatBoost
        # documents this as the default when an eval_set is given) are then
        # scored on data they were tuned on - confirm this is intended.
        try:
            model.fit(x_train, y_train, eval_set = [(x_val, y_val)], verbose = False)
        except TypeError:
            # Estimators whose fit() does not take eval_set/verbose reject the
            # extra keywords with TypeError; any other failure is a real error
            # and is allowed to propagate instead of being silently retried.
            model.fit(x_train, y_train)

        train_preds = model.predict(x_train)
        print("Training Accuracy : " , accuracy_score(y_train ,train_preds))

        val_preds = model.predict(x_val)
        print("Validation Accuracy : " , accuracy_score(y_val , val_preds))

        oof_predictions[val_index] = model.predict_proba(x_val)

        # Accumulate test probabilities; they are divided by n_splits after the loop.
        fold_test_preds = model.predict_proba(X_test)
        if test_preds is None:
            test_preds = fold_test_preds
        else:
            test_preds += fold_test_preds

    test_preds /= n_splits

    print("-" * 50)
    print("OOF Accuracy :" , accuracy_score(train['Score'], np.argmax(oof_predictions, axis = -1)))
    print("-" * 50)

    return test_preds, oof_predictions

# Model 1

In [8]:
# LightGBM with library defaults, run through the K-fold routine, which
# returns (averaged test probabilities, out-of-fold probabilities).
lgbm = LGBMClassifier()
lgbm_preds, lgbm_oof = get_kfold_preds(model = lgbm)

--------------------------------------------------
Fold 1
Training Accuracy :  1.0
Validation Accuracy :  0.5238095238095238
--------------------------------------------------
Fold 2
Training Accuracy :  1.0
Validation Accuracy :  0.47619047619047616
--------------------------------------------------
Fold 3
Training Accuracy :  1.0
Validation Accuracy :  0.5853658536585366
--------------------------------------------------
Fold 4
Training Accuracy :  1.0
Validation Accuracy :  0.5609756097560976
--------------------------------------------------
Fold 5
Training Accuracy :  1.0
Validation Accuracy :  0.5365853658536586
--------------------------------------------------
Fold 6
Training Accuracy :  1.0
Validation Accuracy :  0.5609756097560976
--------------------------------------------------
Fold 7
Training Accuracy :  1.0
Validation Accuracy :  0.5365853658536586
--------------------------------------------------
Fold 8
Training Accuracy :  1.0
Validation Accuracy :  0.5853658536585366

In [9]:
# Per-class report on the out-of-fold LightGBM predictions (most probable class per row).
lgbm_oof_labels = lgbm_oof.argmax(axis = -1)
print(classification_report(train['Score'].values, lgbm_oof_labels, target_names = ['Home Win', 'Away win', 'Draw']))

              precision    recall  f1-score   support

    Home Win       0.52      0.63      0.57       164
    Away win       0.48      0.35      0.41       113
        Draw       0.68      0.65      0.67       135

    accuracy                           0.56       412
   macro avg       0.56      0.55      0.55       412
weighted avg       0.56      0.56      0.56       412



# Model 2

In [10]:
# XGBoost with library defaults, run through the K-fold routine, which
# returns (averaged test probabilities, out-of-fold probabilities).
xgb = XGBClassifier()
xgb_preds, xgb_oof = get_kfold_preds(model = xgb)

--------------------------------------------------
Fold 1
Training Accuracy :  1.0
Validation Accuracy :  0.47619047619047616
--------------------------------------------------
Fold 2
Training Accuracy :  1.0
Validation Accuracy :  0.40476190476190477
--------------------------------------------------
Fold 3
Training Accuracy :  1.0
Validation Accuracy :  0.5853658536585366
--------------------------------------------------
Fold 4
Training Accuracy :  1.0
Validation Accuracy :  0.6341463414634146
--------------------------------------------------
Fold 5
Training Accuracy :  1.0
Validation Accuracy :  0.5609756097560976
--------------------------------------------------
Fold 6
Training Accuracy :  1.0
Validation Accuracy :  0.6341463414634146
--------------------------------------------------
Fold 7
Training Accuracy :  1.0
Validation Accuracy :  0.5609756097560976
--------------------------------------------------
Fold 8
Training Accuracy :  1.0
Validation Accuracy :  0.585365853658536

In [11]:
# Per-class report on the out-of-fold XGBoost predictions (most probable class per row).
xgb_oof_labels = xgb_oof.argmax(axis = -1)
print(classification_report(train['Score'].values, xgb_oof_labels, target_names = ['Home Win', 'Away win', 'Draw']))

              precision    recall  f1-score   support

    Home Win       0.52      0.57      0.54       164
    Away win       0.51      0.42      0.46       113
        Draw       0.69      0.70      0.70       135

    accuracy                           0.57       412
   macro avg       0.57      0.56      0.57       412
weighted avg       0.57      0.57      0.57       412



# Model 3

In [12]:
# CatBoost with library defaults, run through the K-fold routine, which
# returns (averaged test probabilities, out-of-fold probabilities).
catboost = CatBoostClassifier()
cat_preds, cat_oof = get_kfold_preds(model = catboost)

--------------------------------------------------
Fold 1
Training Accuracy :  0.9945945945945946
Validation Accuracy :  0.47619047619047616
--------------------------------------------------
Fold 2
Training Accuracy :  0.9135135135135135
Validation Accuracy :  0.5476190476190477
--------------------------------------------------
Fold 3
Training Accuracy :  1.0
Validation Accuracy :  0.5609756097560976
--------------------------------------------------
Fold 4
Training Accuracy :  1.0
Validation Accuracy :  0.5853658536585366
--------------------------------------------------
Fold 5
Training Accuracy :  0.9326145552560647
Validation Accuracy :  0.5365853658536586
--------------------------------------------------
Fold 6
Training Accuracy :  1.0
Validation Accuracy :  0.5853658536585366
--------------------------------------------------
Fold 7
Training Accuracy :  0.9730458221024259
Validation Accuracy :  0.4878048780487805
--------------------------------------------------
Fold 8
Traini

In [13]:
# Per-class report on the out-of-fold CatBoost predictions (most probable class per row).
cat_oof_labels = cat_oof.argmax(axis = -1)
print(classification_report(train['Score'].values, cat_oof_labels, target_names = ['Home Win', 'Away win', 'Draw']))

              precision    recall  f1-score   support

    Home Win       0.52      0.66      0.58       164
    Away win       0.42      0.29      0.34       113
        Draw       0.69      0.64      0.67       135

    accuracy                           0.55       412
   macro avg       0.54      0.53      0.53       412
weighted avg       0.55      0.55      0.54       412



# Model 4

In [14]:
# Linear discriminant analysis with library defaults, run through the K-fold
# routine, which returns (averaged test probabilities, out-of-fold probabilities).
lda = LinearDiscriminantAnalysis()
lda_preds, lda_oof = get_kfold_preds(model = lda)

--------------------------------------------------
Fold 1
Training Accuracy :  0.518918918918919
Validation Accuracy :  0.42857142857142855
--------------------------------------------------
Fold 2
Training Accuracy :  0.5378378378378378
Validation Accuracy :  0.35714285714285715
--------------------------------------------------
Fold 3
Training Accuracy :  0.5148247978436657
Validation Accuracy :  0.36585365853658536
--------------------------------------------------
Fold 4
Training Accuracy :  0.48787061994609165
Validation Accuracy :  0.43902439024390244
--------------------------------------------------
Fold 5
Training Accuracy :  0.5094339622641509
Validation Accuracy :  0.4146341463414634
--------------------------------------------------
Fold 6
Training Accuracy :  0.522911051212938
Validation Accuracy :  0.4146341463414634
--------------------------------------------------
Fold 7
Training Accuracy :  0.5121293800539084
Validation Accuracy :  0.43902439024390244
----------------

In [15]:
# Per-class report on the out-of-fold LDA predictions (most probable class per row).
lda_oof_labels = lda_oof.argmax(axis = -1)
print(classification_report(train['Score'].values, lda_oof_labels, target_names = ['Home Win', 'Away win', 'Draw']))

              precision    recall  f1-score   support

    Home Win       0.46      0.58      0.51       164
    Away win       0.26      0.20      0.23       113
        Draw       0.43      0.38      0.40       135

    accuracy                           0.41       412
   macro avg       0.39      0.39      0.38       412
weighted avg       0.40      0.41      0.40       412



# Model 5

In [16]:
# Random forest with a fixed random_state, run through the K-fold routine,
# which returns (averaged test probabilities, out-of-fold probabilities).
rf = RandomForestClassifier(random_state = 42)
rf_preds, rf_oof = get_kfold_preds(model = rf)

--------------------------------------------------
Fold 1
Training Accuracy :  1.0
Validation Accuracy :  0.5238095238095238
--------------------------------------------------
Fold 2
Training Accuracy :  1.0
Validation Accuracy :  0.5
--------------------------------------------------
Fold 3
Training Accuracy :  1.0
Validation Accuracy :  0.5853658536585366
--------------------------------------------------
Fold 4
Training Accuracy :  1.0
Validation Accuracy :  0.4878048780487805
--------------------------------------------------
Fold 5
Training Accuracy :  1.0
Validation Accuracy :  0.5121951219512195
--------------------------------------------------
Fold 6
Training Accuracy :  1.0
Validation Accuracy :  0.5365853658536586
--------------------------------------------------
Fold 7
Training Accuracy :  1.0
Validation Accuracy :  0.4878048780487805
--------------------------------------------------
Fold 8
Training Accuracy :  1.0
Validation Accuracy :  0.4878048780487805
---------------

In [17]:
# Per-class report on the out-of-fold random-forest predictions (most probable class per row).
rf_oof_labels = rf_oof.argmax(axis = -1)
print(classification_report(train['Score'].values, rf_oof_labels, target_names = ['Home Win', 'Away win', 'Draw']))

              precision    recall  f1-score   support

    Home Win       0.49      0.71      0.58       164
    Away win       0.44      0.30      0.36       113
        Draw       0.74      0.53      0.62       135

    accuracy                           0.54       412
   macro avg       0.56      0.52      0.52       412
weighted avg       0.56      0.54      0.53       412



In [18]:
# Equal-weight blend of the five models' out-of-fold probabilities.
ensemble_oof = 0.2*lgbm_oof + 0.2*xgb_oof + 0.2*cat_oof + 0.2*lda_oof + 0.2*rf_oof
print("Ensemble OOF :", accuracy_score(train['Score'], np.argmax(ensemble_oof, axis = -1)))

Ensemble OOF : 0.5655339805825242


# Predictions

In [19]:
# Integer class label back to the outcome string.
inverse_score_map = {0: 'Home Win', 1: 'Away win', 2: 'Draw'}

# Equal-weight blend of the five models' fold-averaged test probabilities;
# the most probable class becomes the predicted Score.
ensemble_test = 0.2*lgbm_preds + 0.2*xgb_preds + 0.2*cat_preds + 0.2*lda_preds + 0.2*rf_preds
test['Score'] = np.argmax(ensemble_test, axis = -1)
test['Score'] = test['Score'].map(inverse_score_map)

# Undo the label encoding so the submission carries the original Game_ID strings.
test['Game_ID'] = game_le.inverse_transform(test['Game_ID'].values)

# One row per game: keep the first prediction for any repeated Game_ID.
sub = (
    test[['Game_ID', 'Score']]
    .drop_duplicates(subset = 'Game_ID')
    .reset_index(drop = True)
)
sub.to_csv("submission.csv", index = False)
sub.head()

Unnamed: 0,Game_ID,Score
0,ID_8518U587,Home Win
1,ID_H49BIKG7,Home Win
2,ID_PO6SP4VA,Draw
3,ID_MZRCNBAQ,Home Win
4,ID_CV9VOLIU,Home Win


In [20]:
# Number of submission rows assigned to each predicted outcome.
sub['Score'].value_counts()

Home Win    94
Draw        86
Away win    54
Name: Score, dtype: int64

In [21]:
# Compare the label shares in train with the shares of the predicted labels
# in the submission (the second heading refers to predictions, not true test labels).
train_distribution = train['Score'].map(inverse_score_map).value_counts(normalize = True)
predicted_distribution = sub['Score'].value_counts(normalize = True)
for heading, distribution in (
    ("Train Score Distribution", train_distribution),
    ("Test Score Distribution", predicted_distribution),
):
    print(heading)
    print(distribution)

Train Score Distribution
Home Win    0.398058
Draw        0.327670
Away win    0.274272
Name: Score, dtype: float64
Test Score Distribution
Home Win    0.401709
Draw        0.367521
Away win    0.230769
Name: Score, dtype: float64
