In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload all imported modules before each
# cell executes, so edits to local library code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../lib')

import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
import seaborn as sns
import numpy as np
import pandas as pd

from fifa.dataset import FifaDataset
from logger import initialize_logger

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

In [3]:
# Set up logging through the project's `logger` module (its configuration lives there).
initialize_logger()

In [4]:
# Wrap the training and test CSV files in the project's dataset helper.
dataset = FifaDataset(train_path='./dataset/fifa2021_training.csv', test_path='./dataset/fifa2021_test.csv')

# Raw training features (X) and the position label to predict (y).
X, y = dataset.raw_train_features_target()

In [5]:
# Summarize the training features: column names, non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13921 entries, 0 to 13920
Data columns (total 41 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Penalties       13921 non-null  int64   
 1   Finishing       13921 non-null  int64   
 2   LongPass        13921 non-null  int64   
 3   GKDiving        13921 non-null  int64   
 4   PlayerWorkRate  13921 non-null  category
 5   Dribbling       13921 non-null  int64   
 6   SkillMoves      13921 non-null  int64   
 7   Agility         13921 non-null  int64   
 8   Vision          13921 non-null  int64   
 9   SlideTackle     13921 non-null  int64   
 10  GKHandling      13921 non-null  int64   
 11  Potential       13921 non-null  int64   
 12  Marking         13921 non-null  int64   
 13  SprintSpeed     13921 non-null  int64   
 14  BallControl     13921 non-null  int64   
 15  ShortPass       13921 non-null  int64   
 16  Jumping         13921 non-null  int64   
 17  Stamina     

In [6]:
# Peek at the first few target labels (position strings such as MID / DEF).
y.head()

0    MID
1    MID
2    DEF
3    DEF
4    MID
Name: Position, dtype: object

In [7]:
def target_to_num(target, mapping = {'DEF': 0, 'FWD': 1, 'GK': 2, 'MID': 3}):
    """Encode position labels as integer class ids.

    Args:
        target: pandas Series of label values (e.g. 'DEF', 'MID').
        mapping: dict from label to integer class id.

    Returns:
        A Series with the same index and name as `target`, holding the
        mapped ids.

    Raises:
        KeyError: if a value in `target` is not a key of `mapping`.
    """
    # Direct item lookup (not dict.get) so unknown labels fail loudly.
    lookup = mapping.__getitem__
    return target.apply(lookup)

def num_to_target(target, mapping = ['DEF', 'FWD', 'GK', 'MID']):
    """Decode integer class ids back into position labels.

    Args:
        target: iterable of integer class ids.
        mapping: sequence of labels, indexed by class id.

    Returns:
        A list with `mapping[class_id]` for each id, in input order.

    Raises:
        IndexError: if an id falls outside `mapping`.
    """
    labels = []
    for class_id in target:
        labels.append(mapping[class_id])
    return labels

# Encode the string position labels as integer class ids for LightGBM.
num_target = target_to_num(y)
# Display the encoded series as the cell output.
num_target

0        3
1        3
2        0
3        0
4        3
        ..
13916    3
13917    3
13918    3
13919    0
13920    1
Name: Position, Length: 13921, dtype: int64

In [45]:
# LightGBM training parameters for the 4-class position classifier.
params = {
    'learning_rate'  : 0.001,
    'max_depth'      : 5,
    'n_jobs'         : 24,               # CPU threads; tune to the machine running the notebook
    'boosting_type'  : 'gbdt',           # gradient boosted decision trees
    'objective'      : 'multiclass',     # multi-class target
    'metric'         : 'multi_logloss',  # evaluation metric for multi-class
    'num_class'      : 4,                # number of distinct target classes (DEF, FWD, GK, MID)
    'verbose'        : -1,
    # NOTE(review): LightGBM documents `is_unbalance` for the binary and
    # multiclassova objectives — confirm it has any effect with 'multiclass'.
    'is_unbalance'   : True,
    'lambda_l1'      : 1,                # L1 regularization strength
    'device'         : 'gpu',
    'gpu_platform_id': 0,                # the missing comma after this entry made the cell a SyntaxError
    'gpu_device_id'  : 0
}

In [131]:
def train_cv(
    X,
    y,
    params,
    num_boost_round = 15000,
    nfold           = 10,
    stratified      = True,
    callbacks       = [
        lgb.early_stopping(10, verbose=0), 
        lgb.log_evaluation(period=500)
    ]
):
    """Run LightGBM cross-validation and return its result, including the fold models.

    Args:
        X: training features.
        y: training target (integer class ids).
        params: LightGBM parameter dict.
        num_boost_round: maximum number of boosting iterations.
        nfold: number of cross-validation folds.
        stratified: whether folds are built with stratified sampling.
        callbacks: LightGBM callbacks applied during training; by default,
            early stopping after 10 rounds without improvement plus a log
            line every 500 iterations.

    Returns:
        The dict produced by `lgb.cv`. Because `return_cvbooster=True` is
        passed, it also holds the per-fold models under the 'cvbooster' key.
    """
    train_set = lgb.Dataset(X, label=y)
    return lgb.cv(
        params,
        train_set,
        num_boost_round  = num_boost_round,
        nfold            = nfold,
        # Previously accepted but never forwarded, so callers could not
        # actually turn stratification off.
        stratified       = stratified,
        callbacks        = callbacks,
        return_cvbooster = True
    )

In [132]:
# Run the cross-validated training on the full training set with the encoded target.
cv_result = train_cv(X, num_target, params)

[500]	cv_agg's multi_logloss: 0.723654 + 0.00659229
[1000]	cv_agg's multi_logloss: 0.500021 + 0.0102551
[1500]	cv_agg's multi_logloss: 0.391609 + 0.0123618
[2000]	cv_agg's multi_logloss: 0.333397 + 0.0134992
[2500]	cv_agg's multi_logloss: 0.299713 + 0.0139518
[3000]	cv_agg's multi_logloss: 0.279523 + 0.0142608
[3500]	cv_agg's multi_logloss: 0.266733 + 0.0145741
[4000]	cv_agg's multi_logloss: 0.258495 + 0.0149091
[4500]	cv_agg's multi_logloss: 0.252659 + 0.015112
[5000]	cv_agg's multi_logloss: 0.248355 + 0.0152158
[5500]	cv_agg's multi_logloss: 0.245106 + 0.0153641
[6000]	cv_agg's multi_logloss: 0.242687 + 0.0154429
[6500]	cv_agg's multi_logloss: 0.240753 + 0.0155285
[7000]	cv_agg's multi_logloss: 0.239302 + 0.0155626
[7500]	cv_agg's multi_logloss: 0.238228 + 0.0156141
[8000]	cv_agg's multi_logloss: 0.237377 + 0.0156015
[8500]	cv_agg's multi_logloss: 0.236653 + 0.0157014
[9000]	cv_agg's multi_logloss: 0.236014 + 0.0158203
[9500]	cv_agg's multi_logloss: 0.235584 + 0.0157858
[10000]	cv_ag

In [139]:
# train_cv calls lgb.cv with return_cvbooster=True, so the fold models are available under 'cvbooster'.
model = cv_result['cvbooster']

In [140]:
# Use only the last booster of the cross-validation ensemble for prediction.
# NOTE(review): the other fold models are ignored here — confirm that a single
# fold (rather than an average over all folds) is what is intended.
booster = model.boosters[-1]

In [141]:
# Load the test features from the dataset wrapper.
# presumably preprocessed to match the columns of X — verify in FifaDataset.
X_test = dataset.test_set()

In [142]:
# Per-class prediction scores for every test row, materialized as a NumPy array.
y_pred = np.array(booster.predict(X_test))

In [144]:
# For every test row, keep the index of the highest-scoring class.
prediction = list(map(np.argmax, y_pred))

In [39]:
def train_and_pred(
    X_train,
    y_train,
    X_test,  
    params,
    num_boost_round = 10000,
    callbacks       = [lgb.early_stopping(10, verbose=1)],
    X_valid         = None,
    y_valid         = None
):
    """Train a single LightGBM model and predict class ids for `X_test`.

    Args:
        X_train: training features.
        y_train: training target (integer class ids).
        X_test: features to predict on.
        params: LightGBM parameter dict.
        num_boost_round: maximum number of boosting iterations.
        callbacks: LightGBM callbacks. They are applied only when a
            validation set is supplied, because early stopping needs
            evaluation data to monitor.
        X_valid: optional validation features.
        y_valid: optional validation target.

    Returns:
        A list with the index of the highest-scoring class for each row of
        `X_test`.
    """
    train_set = lgb.Dataset(X_train, label=y_train)

    train_kwargs = {'num_boost_round': num_boost_round}
    if X_valid is not None and y_valid is not None:
        # `callbacks` used to be accepted and silently dropped; wire it in
        # whenever there is evaluation data for the callbacks to work with.
        train_kwargs['valid_sets'] = [lgb.Dataset(X_valid, label=y_valid, reference=train_set)]
        train_kwargs['callbacks']  = callbacks

    model = lgb.train(params, train_set, **train_kwargs)
    pred = model.predict(X_test)
    return [np.argmax(line) for line in pred]

# Fit on the full training set (features X, integer-encoded target) and predict
# class ids for the test rows. X_train / y_train are never defined in this
# notebook, so using them raised NameError on a fresh kernel.
y_pred = train_and_pred(X, num_target, X_test, params)

In [145]:
# Label order used to decode class ids: position i is the label for class id i.
mapping = ['DEF', 'FWD', 'GK', 'MID']

# Submission frame: one row per test ID with its decoded position label.
# `mapping` is passed explicitly so the order declared above is the one applied
# (it was previously defined but never used).
test_data = pd.DataFrame(data={
    'ID': dataset.raw_test_set().ID.values,
    'Category': num_to_target(prediction, mapping)
})

# Timestamped file name (second resolution) so repeated runs produce separate files.
filename = "{}-predict-{:%Y-%m-%d_%H-%M-%S}.csv".format('result-x', datetime.now())

test_data.to_csv(filename, index=False)