## Module installation & setup 

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install catboost -q

Mounted at /content/drive
[K     |████████████████████████████████| 76.6 MB 1.2 MB/s 
[?25h

## Load modules and utility functions

In [None]:
#Import some libraries
import time
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

# Shared LabelEncoder instance. It is re-fitted in several later cells, so it
# only remembers the most recent fit.
le = LabelEncoder()
# Random seed; used as the default argument of get_models().
seed = 21

In [None]:
def reduce_memory_usage(df, allow_float16=False):
    """Downcast the columns of ``df`` in place to smaller dtypes and return ``df``.

    * object and datetime64 columns are converted to ``category``;
    * integer columns (signed or unsigned) are cast to the smallest signed
      integer type whose range contains the column's min and max (inclusive);
    * float columns are cast to float32 when their values fit, or to float16
      when ``allow_float16`` is True and they fit;
    * bool columns, other NumPy kinds and pandas extension dtypes
      (already-categorical, nullable, tz-aware, ...) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    allow_float16 : bool, default False
        float16 keeps only ~3 significant digits and overflows above 65504, which
        corrupts sums and means computed on the downcast columns, so it is opt-in.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes have no NumPy "kind"; min()/max() may not even be
        # defined for them (e.g. unordered categoricals), so skip them.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind in ('O', 'M'):
            df[col] = df[col].astype('category')
            continue
        if kind in ('i', 'u'):
            candidates, limits = int_types, np.iinfo
        elif kind == 'f':
            candidates, limits = float_types, np.finfo
        else:
            # bool, timedelta, complex, ...: nothing sensible to downcast to.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Only ever shrink a column, never widen it.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            # An empty / all-NaN column yields NaN here and is left unchanged.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    return df



# Load the per-event game statistics files and drop the columns that are not used
def preprocess_all(path, Goals=True):
    '''
    Read the train/test game statistics CSVs found under ``path``, drop unused
    columns and shrink the dtypes with reduce_memory_usage.

        path: prefix of the data directory; file names are appended to it as-is,
              so it must end with a path separator
        Goals: bool, if True the Goals_scored and Goals_conceded columns are
               dropped from the train statistics as well

    Returns the tuple (train_statistics, test_statistics).
    '''

    train_stats = pd.read_csv(path+'train_game_statistics.csv')
    test_stats = pd.read_csv(path+'test_game_statistics.csv')

    # Columns dropped from the train file only (the original author notes they
    # exist only in train).
    train_only = ["next_action"] + (["Goals_scored", "Goals_conceded"] if Goals else [])
    train_stats = train_stats.drop(columns=train_only)

    # Dropped from both files; per the original author these are almost
    # entirely null in train.
    sparse_cols = [
        'next_player',
        'next_x',
        'next_y',
        'next_team',
        'next_event_id',
        'event_id',
        'xt_value',
    ]
    train_stats = train_stats.drop(columns=sparse_cols)
    test_stats = test_stats.drop(columns=sparse_cols)

    train_stats = reduce_memory_usage(train_stats)
    test_stats = reduce_memory_usage(test_stats)
    return train_stats, test_stats

## Load data 

In [None]:
# Import the data: fixture-level train/test tables plus the event-level game statistics.
path = "/content/drive/MyDrive/Landuma/"

train, test = (
    pd.read_csv(path + file_name)
    for file_name in ("Train_modified.csv", "Test_modified.csv")
)

# Goals=False keeps Goals_scored / Goals_conceded in the train statistics.
train_game_statistics, test_game_statistics = preprocess_all(path, Goals=False)

## EDA, Preprocessing and Feature engineering

In [None]:
# Season 1 fixtures form the training part, season 2 fixtures the validation part.
training = train.loc[train['Season'].eq(1)]
validation = train.loc[train['Season'].eq(2)]
training.shape, validation.shape

((207, 14), (205, 14))

In [None]:
# Split the event-level statistics the same way, using the Game_IDs of each part.
training_IDS = training['Game_ID'].to_numpy()
validation_IDS = validation['Game_ID'].to_numpy()

in_training = train_game_statistics['Game_ID'].isin(training_IDS)
in_validation = train_game_statistics['Game_ID'].isin(validation_IDS)
traingamestats = train_game_statistics[in_training]
valgamestats = train_game_statistics[in_validation]
train_game_statistics.shape, traingamestats.shape,valgamestats.shape

((1571577, 20), (807108, 20), (764469, 20))

In [None]:
# Keep only rows whose own Season column agrees with the split they were assigned to.
traingamestats = traingamestats.loc[lambda stats: stats['Season'] == 1]
valgamestats = valgamestats.loc[lambda stats: stats['Season'] == 2]
traingamestats.shape, valgamestats.shape

((807108, 20), (764469, 20))

In [None]:
# These columns are removed from the test statistics only.
drop_cols = ['Manager','Player_ID','id']
test_game_statistics = test_game_statistics.drop(drop_cols, axis="columns")
test_game_statistics.shape

(780234, 14)

In [None]:
# Fill gaps in the test statistics with the mean of the respective test column.
missing_cols = ['Shots',
       'SoT', 'Accurate passes', 'Inaccurate passes', 'Passes',
       'Start_minutes']
for col in missing_cols:
    if col in test_game_statistics.columns:
        col_mean = test_game_statistics[col].mean()
        test_game_statistics[col] = test_game_statistics[col].fillna(col_mean)

In [None]:
# Stack the two seasons back together (fixtures and event statistics) with a fresh 0..n-1 index.
train = pd.concat([training, validation], ignore_index=True)
traingamestats = pd.concat([traingamestats, valgamestats], ignore_index=True)
train.shape, traingamestats.shape

((412, 14), (1571577, 20))

In [None]:
# Per-event features, computed identically for the train and test statistics:
# sumXY = X + Y, diffEndStart = End_minutes - Start_minutes.
for event_frame in (traingamestats, test_game_statistics):
    event_frame['sumXY'] = event_frame['X'] + event_frame['Y']
    event_frame['diffEndStart'] = event_frame['End_minutes'] - event_frame['Start_minutes']

In [None]:
# Flag where each fixture came from so the sets can be separated again later.
for fixture_frame, is_train in ((train, 1), (test, 0)):
    fixture_frame["Train"] = is_train

# Fixture table (train + test), a snapshot of it, and the event-level
# statistics joined onto the fixtures by Game_ID.
train_test = pd.concat([train, test])
old_data = train_test.copy()
game_statistics = pd.concat([traingamestats, test_game_statistics])
all_data = pd.merge(game_statistics, train_test, on="Game_ID")

In [None]:
from tqdm import tqdm

unique_id = all_data.Game_ID.unique()
fcols = ["Shots", "SoT", "Accurate passes", "Inaccurate passes", "Passes"]

# Per-game totals of `fcols` for the home and away team, for each half and for
# the whole game, plus pass accuracy and home-minus-away shot differences.
Total_data = {}
# A single groupby pass hands us each game's rows; filtering the full frame once
# per game re-scans every row for every Game_ID. sort=False keeps the games in
# order of first appearance (the order of `unique_id`).
games = all_data.groupby("Game_ID", sort=False, observed=True)
for _id, currid_dict in tqdm(games, total=len(unique_id)):
  team_name_map = {currid_dict['Home Team'].values[0]:"Home ", currid_dict['Away Team'].values[0]:"Away "}
  dict_main = {}
  for half in ["1st half", "2nd half", ""]:
    _currid_dict = currid_dict
    if half:
      _currid_dict = _currid_dict[_currid_dict.Half == half]
      half += " "  # becomes the key prefix, e.g. "1st half "; whole-game keys get no prefix
    for tm in team_name_map:
      dict_sum = _currid_dict[_currid_dict.Team==tm][fcols].sum().to_dict()
      dict_sum = {"Total " + i.lower():j for i, j in dict_sum.items()}
      # Explicit zero check instead of a bare `except:`: a team without any
      # recorded pass gets accuracy 0 whether the totals are Python or NumPy floats.
      total_passes = dict_sum["Total passes"]
      if total_passes != 0:
        dict_sum["pass acc"] = dict_sum['Total accurate passes'] / total_passes
      else:
        dict_sum["pass acc"] = 0
      dict_main.update({half+team_name_map[tm]+i:j for i,j in dict_sum.items()})
  
  dict_main["Diff total shot"] = dict_main["Home Total shots"] - dict_main["Away Total shots"]
  dict_main["Diff total sot"] = dict_main["Home Total sot"] - dict_main["Away Total sot"]

  dict_main["2nd half Diff Total shots"] = dict_main["2nd half Home Total shots"] - dict_main["2nd half Away Total shots"]
  dict_main["2nd half Diff total sot"] = dict_main["2nd half Home Total sot"] - dict_main["2nd half Away Total sot"]

  dict_main["1st half Diff Total shots"] = dict_main["1st half Home Total shots"] - dict_main["1st half Away Total shots"]
  dict_main["1st half Diff total sot"] = dict_main["1st half Home Total sot"] - dict_main["1st half Away Total sot"]
  
  Total_data[_id] = dict_main
  

100%|██████████| 646/646 [02:06<00:00,  5.11it/s]


In [None]:
# Integer-encode Game_ID and remember original id -> integer code.
all_data["Game_ID"] = le.fit_transform(all_data["Game_ID"])
game_mapping = {game_id: code for code, game_id in enumerate(le.classes_)}

In [None]:
# Integer-encode the event-level Team column and remember team name -> integer code.
all_data["Team"] = le.fit_transform(all_data["Team"])

mapping = {team_name: code for code, team_name in enumerate(le.classes_)}
mapping

{'Andromeda': 0,
 'Antennae': 1,
 'Backward': 2,
 'Butterfly': 3,
 'Cartwheel': 4,
 'Cigar': 5,
 'Circinus': 6,
 'Coma Pinwheel': 7,
 'Comet': 8,
 'Cosmos Redshift 7': 9,
 'Eye of Sauron': 10,
 'Fireworks': 11,
 'Malin 1': 12,
 'Medusa Merger': 13,
 'Milky Way': 14,
 'Needle': 15,
 'Sculptor': 16,
 'Sombrero': 17,
 'Sunflower': 18,
 'Tadpole': 19,
 'Triangulum': 20,
 'Whirlpool': 21}

In [None]:
# Re-use the Team codes for every other team-valued column of all_data.
for team_col in ("Away Team", "Home Team", "Opposition_Team"):
    all_data[team_col] = all_data[team_col].apply(lambda team_name: mapping[team_name])

# Replace the raw Game_ID by its integer code in the remaining frames.
for id_frame in (traingamestats, test_game_statistics, old_data):
    id_frame["Game_ID"] = id_frame["Game_ID"].apply(lambda game_id: game_mapping[game_id])

In [None]:
# Turn the per-game dictionaries of aggregated features into one DataFrame
# (one row per game, Game_ID stored as its integer code in the last column).
feature_names = list(next(iter(Total_data.values())).keys())
D = {name: [game_stats[name] for game_stats in Total_data.values()] for name in feature_names}
D["Game_ID"] = [game_mapping[raw_id] for raw_id in Total_data]

_data = pd.DataFrame(D)
_data.head()

Unnamed: 0,1st half Home Total shots,1st half Home Total sot,1st half Home Total accurate passes,1st half Home Total inaccurate passes,1st half Home Total passes,1st half Home pass acc,1st half Away Total shots,1st half Away Total sot,1st half Away Total accurate passes,1st half Away Total inaccurate passes,...,Away Total inaccurate passes,Away Total passes,Away pass acc,Diff total shot,Diff total sot,2nd half Diff Total shots,2nd half Diff total sot,1st half Diff Total shots,1st half Diff total sot,Game_ID
0,5.0,2.0,171.0,50.0,221.0,0.773756,3.0,0.0,188.0,48.0,...,99.0,463.0,0.786177,5.0,0.0,3.0,-2.0,2.0,2.0,643
1,3.0,2.0,180.0,51.0,231.0,0.779221,7.0,3.0,128.0,51.0,...,97.0,340.0,0.714706,-3.0,1.0,1.0,2.0,-4.0,-1.0,641
2,7.0,2.0,184.0,41.0,225.0,0.817778,5.0,0.0,149.0,38.0,...,89.0,389.0,0.771208,4.0,6.0,2.0,4.0,2.0,2.0,636
3,6.0,3.0,167.0,45.0,212.0,0.787736,5.0,0.0,153.0,38.0,...,79.0,427.0,0.814988,-2.0,1.0,-3.0,-2.0,1.0,3.0,635
4,3.0,0.0,124.0,34.0,158.0,0.78481,9.0,2.0,257.0,42.0,...,78.0,556.0,0.859712,-11.0,-3.0,-5.0,-1.0,-6.0,-2.0,633


In [None]:
# Attach the aggregated per-game features to the fixture table.
data = pd.merge(old_data, _data, on="Game_ID")
data.head()

Unnamed: 0,Date,Season,Match_ID,Game_ID,Home Team,Away Team,Score,Home_goal_prob,Home_goal_conced_prob,Away_goal_prob,...,Away Total accurate passes,Away Total inaccurate passes,Away Total passes,Away pass acc,Diff total shot,Diff total sot,2nd half Diff Total shots,2nd half Diff total sot,1st half Diff Total shots,1st half Diff total sot
0,2017-02-15,1,1.0,353,Antennae,Andromeda,Away win,0.974833,1.775748,1.562658,...,433.0,75.0,508.0,0.852362,3.0,1.0,2.0,1.0,1.0,0.0
1,2016-09-14,1,12.0,519,Andromeda,Antennae,Draw,0.815817,0.896178,0.753394,...,386.0,76.0,462.0,0.835498,-5.0,1.0,-5.0,1.0,0.0,0.0
2,2016-12-12,1,13.0,415,Andromeda,Butterfly,Away win,0.009791,0.891645,0.734295,...,279.0,109.0,388.0,0.719072,-3.0,-2.0,-3.0,-1.0,0.0,-1.0
3,2017-01-25,1,14.0,527,Andromeda,Cigar,Away win,0.01011,0.89484,0.713836,...,353.0,93.0,446.0,0.79148,-6.0,-1.0,-4.0,0.0,-2.0,-1.0
4,2016-08-24,1,15.0,250,Andromeda,Circinus,Home Win,1.730743,0.99005,0.824104,...,452.0,114.0,566.0,0.798587,8.0,2.0,4.0,2.0,4.0,0.0


In [None]:
# Separate the fixtures again using the Train flag; the test part has no usable Score.
is_train_row = data["Train"] == 1
train = data.loc[is_train_row].drop(columns="Train")
test = data.loc[data["Train"] == 0].drop(columns=["Train", "Score"])

In [None]:
# Per-game aggregates of the train event statistics. Every entry becomes a named
# aggregation "game_<alias>_<stat>" = (<source column>, <stat>).
train_basic_stats = ["mean", "sum", "skew", "std"]
train_range_stats = ["min", "max"] + train_basic_stats

train_plan_before_goals = [
    ("shots", "Shots", train_basic_stats),
    ("SoT", "SoT", ["max"] + train_basic_stats),
    ("accuratepasses", "Accurate passes", train_basic_stats),
    ("inaccuratepasses", "Inaccurate passes", train_basic_stats),
]
train_plan_after_goals = [
    ("passes", "Passes", train_basic_stats),
    ("sumXY", "sumXY", train_range_stats),
    ("diffEndStart", "diffEndStart", train_range_stats),
]

train_agg_spec = {
    **{
        f"game_{alias}_{stat}": (column, stat)
        for alias, column, stats in train_plan_before_goals
        for stat in stats
    },
    # NOTE: despite the "_sum" suffix these two features hold the per-game MEAN.
    "game_Goals_scored_sum": ("Goals_scored", "mean"),
    "game_Goals_conceded_sum": ("Goals_conceded", "mean"),
    **{
        f"game_{alias}_{stat}": (column, stat)
        for alias, column, stats in train_plan_after_goals
        for stat in stats
    },
}

traingamestatgroups = traingamestats.groupby('Game_ID').agg(**train_agg_spec).reset_index()

In [None]:
# Per-game aggregates of the test event statistics. Every entry becomes a named
# aggregation "game_<alias>_<stat>" = (<source column>, <stat>). There are no
# goal columns here.
test_basic_stats = ["mean", "sum", "skew", "std"]
test_range_stats = ["min", "max"] + test_basic_stats

test_agg_plan = [
    ("shots", "Shots", test_basic_stats),
    ("SoT", "SoT", ["max"] + test_basic_stats),
    ("accuratepasses", "Accurate passes", test_basic_stats),
    ("inaccuratepasses", "Inaccurate passes", test_basic_stats),
    ("passes", "Passes", test_basic_stats),
    ("sumXY", "sumXY", test_range_stats),
    ("diffEndStart", "diffEndStart", test_range_stats),
]

test_agg_spec = {
    f"game_{alias}_{stat}": (column, stat)
    for alias, column, stats in test_agg_plan
    for stat in stats
}

testgamestatgroups = test_game_statistics.groupby('Game_ID').agg(**test_agg_spec).reset_index()

In [None]:
# Left join: every train fixture is kept, the per-game aggregates are attached by Game_ID.
train = train.merge(traingamestatgroups, on='Game_ID', how='left')
train.shape

(412, 91)

In [None]:
# Same left join for the test fixtures.
print(testgamestatgroups.shape)
test = test.merge(testgamestatgroups, on='Game_ID', how='left')
test.shape

(234, 34)


(234, 88)

Now the strength features
* We are going to use the aggregated features game_Goals_scored_sum and game_Goals_conceded_sum, then drop these features.
* The point of this is to create combined season 1 and season 2 strengths, then use them as a rollback feature in season 3 to look at past attacking and defensive strength.

In [None]:
# Columns: Team, HGS/AGS = goals scored at home/away, HAS/AAS = home/away attacking
# strength, HGC/AGC = goals conceded at home/away, HDS/ADS = home/away defensive strength.
table = pd.DataFrame(columns=('Team','HGS','AGS','HAS','AAS','HGC','AGC','HDS','ADS'))

# NOTE(review): both divisors are hard-coded. The merged train frame holds 412
# fixtures, not 480, and the original comment speaks of 15 home matches per team
# per season. The values are kept so the features stay unchanged; confirm or
# derive them from the data.
TOTAL_MATCHES = 480
MATCHES_PER_TEAM = 30.0

# The statistics are expressed from the home team's point of view, so what the
# away side scored is what the home side conceded.
avg_home_scored = train.game_Goals_scored_sum.sum() / TOTAL_MATCHES
avg_away_scored = train.game_Goals_conceded_sum.sum() / TOTAL_MATCHES
avg_home_conceded = avg_away_scored
avg_away_conceded = avg_home_scored

# Group by home team for the home strengths and by away team for the away strengths.
res_home = train.groupby('Home Team')
res_away = train.groupby('Away Team')
all_teams_list = list(res_home.groups.keys())

print(all_teams_list)
table['Team'] = all_teams_list
# Align every per-team sum on the team NAME. Assigning `.values` positionally
# silently mixes teams up (or raises on a length mismatch) whenever the set or
# order of away teams differs from that of the home teams.
table['HGS'] = res_home.game_Goals_scored_sum.sum().reindex(all_teams_list).values
table['HGC'] = res_home.game_Goals_conceded_sum.sum().reindex(all_teams_list).values
table['AGS'] = res_away.game_Goals_conceded_sum.sum().reindex(all_teams_list).values
table['AGC'] = res_away.game_Goals_scored_sum.sum().reindex(all_teams_list).values

# Strength = per-match rate of the team relative to the overall average rate.
table['HAS'] = (table['HGS'] / MATCHES_PER_TEAM) / avg_home_scored
table['AAS'] = (table['AGS'] / MATCHES_PER_TEAM) / avg_away_scored
table['HDS'] = (table['HGC'] / MATCHES_PER_TEAM) / avg_home_conceded
table['ADS'] = (table['AGC'] / MATCHES_PER_TEAM) / avg_away_conceded
table

['Andromeda', 'Antennae', 'Backward', 'Butterfly', 'Cartwheel', 'Cigar', 'Circinus', 'Coma Pinwheel', 'Comet', 'Cosmos Redshift 7', 'Eye of Sauron', 'Fireworks', 'Medusa Merger', 'Milky Way', 'Sculptor', 'Sombrero', 'Sunflower', 'Tadpole', 'Triangulum']


Unnamed: 0,Team,HGS,AGS,HAS,AAS,HGC,AGC,HDS,ADS
0,Andromeda,0.013344,0.011696,0.906738,0.794434,0.013344,0.011696,0.906738,0.794434
1,Antennae,0.013458,0.014,0.914551,0.950684,0.013458,0.014,0.914551,0.950684
2,Backward,0.013527,0.011032,0.918945,0.749512,0.013527,0.011032,0.918945,0.749512
3,Butterfly,0.008003,0.004639,0.543945,0.315186,0.008003,0.004639,0.543945,0.315186
4,Cartwheel,0.019745,0.017441,1.34082,1.18457,0.019745,0.017441,1.34082,1.18457
5,Cigar,0.018738,0.018372,1.273438,1.248047,0.018738,0.018372,1.273438,1.248047
6,Circinus,0.008293,0.014175,0.562988,0.962891,0.008293,0.014175,0.562988,0.962891
7,Coma Pinwheel,0.013275,0.01619,0.901855,1.099609,0.013275,0.01619,0.901855,1.099609
8,Comet,0.016357,0.013443,1.110352,0.913086,0.016357,0.013443,1.110352,0.913086
9,Cosmos Redshift 7,0.015213,0.015152,1.033203,1.029297,0.015213,0.015152,1.033203,1.029297


Append  the combined strengths

In [None]:
# Home Attacking Strength (HAS), Home Defensive Strength (HDS),
# Away Attacking Strength (AAS), Away Defensive Strength (ADS)

def get_strength(df, strength_table=None):
    """Look up the strength values of the home and away team of every fixture.

    Parameters
    ----------
    df : pandas.DataFrame
        Fixtures with 'Home Team' and 'Away Team' columns.
    strength_table : pandas.DataFrame, optional
        Table with the columns 'Team', 'HAS', 'HDS', 'AAS' and 'ADS'. Defaults
        to the module-level ``table``.

    Returns
    -------
    tuple of four lists (f_HAS, f_HDS, f_AAS, f_ADS), one entry per row of
    ``df`` in row order. A team that is missing from the table yields NaN.
    """
    if strength_table is None:
        strength_table = table

    teams = strength_table['Team'].tolist()

    def _lookup(column):
        # team -> value; the first occurrence wins if a team is listed twice.
        values = {}
        for team, value in zip(teams, strength_table[column].tolist()):
            values.setdefault(team, value)
        return values

    has, hds, aas, ads = (_lookup(name) for name in ('HAS', 'HDS', 'AAS', 'ADS'))

    home_teams = df['Home Team'].tolist()
    away_teams = df['Away Team'].tolist()

    f_HAS = [has.get(team, np.nan) for team in home_teams]
    f_HDS = [hds.get(team, np.nan) for team in home_teams]
    f_AAS = [aas.get(team, np.nan) for team in away_teams]
    f_ADS = [ads.get(team, np.nan) for team in away_teams]

    return f_HAS, f_HDS, f_AAS, f_ADS

# Attach the combined-season strengths of the home and away team to both sets.
for fixture_frame in (train, test):
    f_HAS, f_HDS, f_AAS, f_ADS = get_strength(fixture_frame)
    fixture_frame['combinedseasons_HAS'] = f_HAS
    fixture_frame['combinedseasons_HDS'] = f_HDS
    fixture_frame['combinedseasons_AAS'] = f_AAS
    fixture_frame['combinedseasons_ADS'] = f_ADS

train.head()


Unnamed: 0,Date,Season,Match_ID,Game_ID,Home Team,Away Team,Score,Home_goal_prob,Home_goal_conced_prob,Away_goal_prob,...,game_diffEndStart_min,game_diffEndStart_max,game_diffEndStart_mean,game_diffEndStart_sum,game_diffEndStart_skew,game_diffEndStart_std,combinedseasons_HAS,combinedseasons_HDS,combinedseasons_AAS,combinedseasons_ADS
0,2017-02-15,1,1.0,353,Antennae,Andromeda,Away win,0.974833,1.775748,1.562658,...,0.3125,0.875,0.334229,1245.0,9.734375,0.029595,0.914551,0.914551,0.794434,0.794434
1,2016-09-14,1,12.0,519,Andromeda,Antennae,Draw,0.815817,0.896178,0.753394,...,0.25,0.84375,0.333252,1399.0,7.5625,0.025187,0.906738,0.906738,0.950684,0.950684
2,2016-12-12,1,13.0,415,Andromeda,Butterfly,Away win,0.009791,0.891645,0.734295,...,0.21875,0.84375,0.333008,1429.0,5.738281,0.022543,0.906738,0.906738,0.315186,0.315186
3,2017-01-25,1,14.0,527,Andromeda,Cigar,Away win,0.01011,0.89484,0.713836,...,0.3125,0.875,0.333496,1260.0,7.5,0.022836,0.906738,0.906738,1.248047,1.248047
4,2016-08-24,1,15.0,250,Andromeda,Circinus,Home Win,1.730743,0.99005,0.824104,...,0.219971,0.84375,0.333496,1474.0,9.273438,0.028409,0.906738,0.906738,0.962891,0.962891


Remember we have to drop the goals grouped features because test doesn't have them

In [None]:
# The goal aggregates exist only in train; remove them now that the strengths are built.
drop_cols = ['game_Goals_scored_sum', 'game_Goals_conceded_sum']
train = train.drop(drop_cols, axis="columns")
# NOTE(review): this displays the shape of `training` (the season-1 subset), not
# of `train` — presumably a typo; confirm.
test.shape,training.shape

((234, 92), (207, 14))

Date features

In [None]:
# Order both sets chronologically with a fresh 0..n-1 index; test_df keeps an
# untouched copy of the sorted test rows.
train, test = (
    fixture_frame.sort_values(by='Date').reset_index(drop=True)
    for fixture_frame in (train, test)
)
test_df = test.copy()

In [None]:
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

# Calendar features, computed identically for both sets.
for dated_frame in (train, test):
    dated_frame["Month"] = dated_frame.Date.dt.month
    # Series.dt.week is deprecated and was removed in pandas 2.0;
    # dt.isocalendar().week is the supported replacement. It returns the nullable
    # UInt32 dtype, so cast back to a plain dtype (float only if a date is missing).
    iso_week = dated_frame.Date.dt.isocalendar().week
    dated_frame["Week"] = iso_week.astype("int64") if iso_week.notna().all() else iso_week.astype("float64")

In [None]:
# Columns that are not model inputs, and the columns that still need label encoding.
drop_cols = ['Date','Season','Game_ID']
le_columns = ['Home Team','Away Team','Score']

train, test = train.drop(columns=drop_cols), test.drop(columns=drop_cols)
test.shape,train.shape

((234, 91), (412, 92))

Label Encoding

In [None]:
for col in le_columns:
   train[col] = le.fit_transform(train[col])

#### Home WIn = 2, Draw = 1 and Away Win = 0
* Necessary for submission

In [None]:
le_columns = ['Home Team','Away Team']
for col in le_columns:
   test[col] = le.fit_transform(test[col])

#### Missing values: impute the test skew features with the column mean, then fill any remaining NaNs with -1


In [None]:
# Skew aggregates can be undefined; in the test set they are replaced by the
# mean of the respective test column.
missing_cols = ['game_shots_skew','game_SoT_skew','game_accuratepasses_skew','game_inaccuratepasses_skew','game_passes_skew','game_sumXY_skew','game_diffEndStart_skew']
for col in missing_cols:
    if col in test.columns:
        col_mean = test[col].mean()
        test[col] = test[col].fillna(col_mean)

In [None]:
# Whatever is still missing in either set is replaced by the sentinel -1.
for fixture_frame in (train, test):
    cols_with_nan = fixture_frame.columns[fixture_frame.isna().any()]
    for col in cols_with_nan:
        fixture_frame[col] = fixture_frame[col].fillna(-1)

## Modeling

In [None]:
# Columns that are NOT fed to the models: the target, identifiers, calendar
# columns and most of the per-game aggregates. Names absent from `train` are
# simply ignored by Index.difference.
excluded_misc = [
    'diff_goal_mean', 'home_goal_mean', 'away_goal_mean',
    'Month', 'Week', 'Match_ID', 'Score',
]
excluded_diff_end_start = [
    'game_diffEndStart_min', 'game_diffEndStart_max',
    'game_diffEndStart_mean', 'game_diffEndStart_sum',
    'game_diffEndStart_skew', 'game_diffEndStart_std',
]
excluded_sum_xy = [
    'game_sumXY_min', 'game_sumXY_max', 'game_sumXY_mean',
    'game_sumXY_sum', 'game_sumXY_skew','game_sumXY_std',
]
excluded_shots = [
    'game_shots_mean', 'game_shots_sum', 'game_shots_skew',
    'game_shots_std', 'game_SoT_max', 'game_SoT_mean',
    'game_SoT_sum', 'game_SoT_skew',
]
excluded_passes = [
    'game_accuratepasses_mean', 'game_accuratepasses_sum',
    'game_accuratepasses_skew', 'game_accuratepasses_std',
    'game_inaccuratepasses_mean', 'game_inaccuratepasses_sum',
    'game_inaccuratepasses_skew', 'game_inaccuratepasses_std',
    'game_passes_mean', 'game_passes_sum', 'game_passes_skew',
]
features = train.columns.difference(
    excluded_misc + excluded_diff_end_start + excluded_sum_xy + excluded_shots + excluded_passes
)

X = train[features]
y = train['Score']
Test = test[features]
X.shape,Test.shape

((412, 57), (234, 57))

In [None]:
def apply_gmean(results, normalize=False):
    """Element-wise geometric mean of a sequence of equally shaped arrays.

    Parameters
    ----------
    results : sequence of array-like
        For example one (n_samples, n_classes) probability matrix per model.
    normalize : bool, default False
        The geometric mean of probability rows generally sums to less than 1.
        With ``normalize=True`` every row (last axis) is rescaled to sum to 1.

    Returns
    -------
    numpy.ndarray with the shape of ``results[0]``.

    Raises
    ------
    ValueError
        If ``results`` is empty.
    """
    ln = len(results)
    if ln == 0:
        raise ValueError("apply_gmean needs at least one array of predictions")

    R = np.ones(np.shape(results[0]))
    for r in results:
        R *= r

    gmean = R ** (1/ln)
    if normalize:
        gmean = gmean / gmean.sum(axis=-1, keepdims=True)
    return gmean
    


class My_model:
    """Tiny ensemble wrapper: fits every model on the same data and blends the
    predicted class probabilities with apply_gmean."""

    def __init__(self, models):
        # models: estimators exposing fit(...) and predict_proba(...)
        self.models = models

    def fit_eval_pred(self, X, y, eval_set, val, test, verbose=True):
        """Fit all models, then predict the validation and the test rows.

        Parameters
        ----------
        X, y : training features and target.
        eval_set : list of (features, target) tuples forwarded to ``fit``;
            the target of the first tuple is also used for the per-model log loss.
        val : features to predict for evaluation.
        test : features to predict for the final submission.
        verbose : bool, print progress, log loss and timing per model.

        Returns
        -------
        (results_eval, results_test): the apply_gmean blend of the models'
        ``predict_proba`` outputs on ``val`` and on ``test``.
        """
        results_eval = []
        results_test = []
        for i, model in enumerate(self.models):
            if verbose:
                print(f"[Training]............................... Model_{i+1}")
            st = time.time()
            try:
                model.fit(X, y, eval_set=eval_set, verbose=False)
            except Exception as e:
                # Best-effort fallback for estimators that reject the call above.
                # Say so explicitly: the refit runs WITHOUT eval_set, so it
                # behaves differently, and a silent fallback would also hide
                # genuine errors.
                print(f"[Warning] Model_{i+1}: fit with eval_set failed "
                      f"({type(e).__name__}: {e}); refitting without eval_set on NaN-filled data")
                model.fit(X.fillna(-1), y)
            if verbose:
                print(f"[Prediction]............................. Model_{i+1}")
            p = model.predict_proba(val)
            results_eval.append(p)
            results_test.append(model.predict_proba(test))
            if verbose:
                print(f"Log loss = {log_loss(eval_set[0][1], p)}", end=" "*6)
                print(f"Time {time.time() - st :.2f}sec")

        
        results_eval = apply_gmean(results_eval)
        results_test = apply_gmean(results_test)

        return results_eval, results_test


In [None]:
def get_models(seed=seed):
    """Return freshly constructed, unfitted models: one CatBoostClassifier and
    one LGBMClassifier, both seeded with ``seed``."""
    catboost_settings = dict(
        iterations=10000,
        learning_rate=0.01,
        verbose=100,
        random_state=seed,
        use_best_model=True,
        early_stopping_rounds=100,
        task_type="CPU",
    )
    lightgbm_settings = dict(
        learning_rate=0.01,
        n_estimators=10000,
        random_state=seed,
        early_stopping_round=100,
        verbose=-1,
        subsample=0.4,
        colsample_bytree=0.3,
    )

    return [
        CatBoostClassifier(**catboost_settings),
        LGBMClassifier(**lightgbm_settings),
    ]


In [None]:
# Stratified K-fold training: per fold, fit a fresh ensemble on the training
# part, score it on the held-out part and predict the test set.
results = []   # blended test-set probabilities, one array per fold
ns = 10        # number of folds
tot = []       # held-out log loss per fold
models = []    # the fitted My_model of every fold

# Use the notebook-wide seed instead of repeating the literal.
skf = StratifiedKFold(n_splits=ns, random_state=seed, shuffle=True)
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(100*"-")
    print(f"Fold-{fold+1}")
    # split() yields POSITIONS, so select with .iloc; .loc only works while the
    # index happens to be the default 0..n-1 range.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    main_model = My_model(get_models())
    models.append(main_model)
    # The held-out fold serves both as eval_set (early stopping) and as the
    # data the fold is scored on.
    pred, T_pred = main_model.fit_eval_pred(X_train, y_train, [(X_test, y_test)], X_test, Test)

    loss = log_loss(y_test, pred)
    print(f"log loss = {loss}")
    tot += [loss]

    results.append(T_pred)

    print(100*"-")
    print("\n\n")

print(f"Average log loss = {sum(tot) / len(tot)}")

----------------------------------------------------------------------------------------------------
Fold-1
[Training]............................... Model_1
[Prediction]............................. Model_1
Log loss = 0.0019232188082860871      Time 166.22sec
[Training]............................... Model_2
[Prediction]............................. Model_2
Log loss = 0.000804917645988004      Time 2.53sec
log loss = 0.00046451379754487563
----------------------------------------------------------------------------------------------------



----------------------------------------------------------------------------------------------------
Fold-2
[Training]............................... Model_1
[Prediction]............................. Model_1
Log loss = 0.009803962926089848      Time 165.37sec
[Training]............................... Model_2
[Prediction]............................. Model_2
Log loss = 0.006311723825261602      Time 2.65sec
log loss = 0.0074164010968836135
--------

In [None]:
# Average log loss = 0.04252654387520471
# Final test prediction: arithmetic mean of the per-fold test probabilities.
spred = np.asarray(results).mean(axis=0)

In [None]:
# Build the submission frame: original Game_ID followed by one probability
# column per class, in the order of the encoded target (0, 1, 2).
class_labels = ['Away win','Draw','Home Win']
preds_df = pd.concat(
    objs=[test_df['Game_ID'], pd.DataFrame(spred, columns=class_labels)],
    axis="columns",
)
# Invert game_mapping (raw id -> code) to translate the codes back to raw ids.
_game_mapping = {code: raw_id for raw_id, code in game_mapping.items()}
preds_df["Game_ID"] = preds_df["Game_ID"].apply(lambda code: _game_mapping[int(code)])

In [None]:
# Preview the first rows of the submission frame.
preds_df.head()

Unnamed: 0,Game_ID,Away win,Draw,Home Win
0,ID_PYMMJPRC,0.001091,0.001431,0.996739
1,ID_G91KCS98,0.001287,0.002143,0.996305
2,ID_3I1STYJX,0.005088,0.920403,0.052438
3,ID_V8L2BWZV,0.997266,0.001345,0.001159
4,ID_Q0QMD9X9,0.001337,0.996529,0.001626


In [None]:
# Write the submission to the current working directory, without the index column.
preds_df.to_csv('submission.csv', index=False)