In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np
from xgboost import plot_importance
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from datetime import date,datetime
from sklearn.ensemble import RandomForestRegressor

In [6]:
# Raw train / test tables used by the exploratory cells below.
train_csv_path = '../dataset/fifa2018/train.csv'
test_csv_path = '../dataset/fifa2018/test.csv'

traindata = pd.read_csv(train_csv_path)
testdata = pd.read_csv(test_csv_path)

In [7]:
# Peek at the training table as a NumPy array with the last column dropped.
traindata.values[:,:-1]

array([[0, 293, 25, ..., 53.0, 56.0, nan],
       [1, 258, 24, ..., 61.0, 64.0, nan],
       [2, 112, 3, ..., 31.0, 36.0, nan],
       ...,
       [10438, 626, 26, ..., 41.0, 46.0, nan],
       [10439, 147, 9, ..., 49.0, 50.0, nan],
       [10440, 234, 18, ..., 68.0, 63.0, nan]], dtype=object)

In [3]:
# Histograms of the rw, rb and st columns.
# NOTE: requires the cell that defines `traindata` to have been run first.
traindata[['rw','rb','st']].hist()

NameError: name 'traindata' is not defined

In [4]:
# Reference date from which the birth dates are subtracted.
today = datetime(2018, 4, 15)
# Parse the birth_date column to datetimes; this overwrites the column on traindata.
traindata['birth_date'] = pd.to_datetime(traindata['birth_date'])
# Elapsed time between the reference date and each birth date (shown as the cell output).
today-traindata['birth_date']

NameError: name 'datetime' is not defined

In [5]:
# Print the per-column summary of the training table.
traindata.info()

NameError: name 'traindata' is not defined

In [410]:
class InputData():
    """Lazily loaded train / eval / test tables with derived feature columns.

    The CSV files are read on first access. Every loaded table gains the
    columns ``age``, ``BMI``, ``is_gk`` and ``best_pos`` (see
    ``_add_features``). The training table is optionally split into a
    training part and a held-out evaluation part.

    Parameters
    ----------
    train_data_path : path of the training CSV.
    test_data_path : path of the test CSV.
    split_ratio : fraction of the training rows held out for evaluation.
        A value ``<= 0`` disables the hold-out split; ``eval_data`` then
        returns ``None``.
    """

    # Columns whose row-wise maximum becomes the ``best_pos`` feature.
    _POSITIONS = ['rw', 'rb', 'st', 'lw', 'cf', 'cam', 'cm', 'cdm', 'cb', 'lb', 'gk']

    def __init__(self, train_data_path, test_data_path, split_ratio=0.2):
        self._train_data_path = train_data_path
        self._test_data_path = test_data_path
        self._split_ratio = split_ratio
        # Seeds NumPy's global generator so the permutation drawn in
        # _split_data is repeatable (as long as nothing else draws from the
        # global generator between construction and the first load).
        np.random.seed(42)
        self._train_data = None
        self._eval_data = None
        self._test_data = None

    def _add_features(self, data):
        """Add ``age``, ``BMI``, ``is_gk`` and ``best_pos`` to ``data`` in place and return it."""
        today = datetime(2018, 4, 15)

        data['birth_date'] = pd.to_datetime(data['birth_date'])
        # Age in years (365-day years) at the fixed reference date.
        data['age'] = (today - data['birth_date']).dt.days / 365.
        # height_cm is in centimetres, hence the 10000 factor.
        data['BMI'] = 10000. * data['weight_kg'] / (data['height_cm'] ** 2)
        # Rows with a missing ``gk`` value compare False here.
        data['is_gk'] = data['gk'] > 0
        data['best_pos'] = data[self._POSITIONS].max(axis=1)

        return data

    def _split_data(self, data):
        """Randomly split ``data`` into ``(train, eval)``; ``eval`` is ``None`` when splitting is disabled."""
        if self._split_ratio <= 0:
            return data, None
        shuffled_indices = np.random.permutation(len(data))
        test_set_size = int(len(data) * self._split_ratio)

        test_indices = shuffled_indices[:test_set_size]
        train_indices = shuffled_indices[test_set_size:]
        return data.iloc[train_indices], data.iloc[test_indices]

    def _load_train_data(self):
        """Read the training CSV, add features and populate the train / eval tables."""
        df = pd.read_csv(self._train_data_path)
        data = self._add_features(df)
        self._train_data, self._eval_data = self._split_data(data)

    def _load_test_data(self):
        """Read the test CSV and add features."""
        df = pd.read_csv(self._test_data_path)
        self._test_data = self._add_features(df)

    @staticmethod
    def _select(frame, is_gk):
        """Return the rows of ``frame`` whose ``is_gk`` flag equals ``is_gk``."""
        return frame[frame['is_gk'] == is_gk]

    def train_data(self, is_gk):
        """Training rows for goalkeepers (``is_gk=True``) or all other players."""
        if self._train_data is None:
            self._load_train_data()
        return self._select(self._train_data, is_gk)

    def eval_data(self, is_gk):
        """Held-out rows for the requested group, or ``None`` when no split was made."""
        # Use the training table as the "already loaded" marker: with the
        # split disabled ``_eval_data`` stays None even after loading.
        if self._train_data is None:
            self._load_train_data()
        if self._eval_data is None:
            return None
        return self._select(self._eval_data, is_gk)

    def test_data(self, is_gk):
        """Test rows for goalkeepers (``is_gk=True``) or all other players."""
        return self._select(self.all_test_data(), is_gk)

    def all_test_data(self):
        """The complete test table with derived features (loaded on first use)."""
        if self._test_data is None:
            self._load_test_data()
        return self._test_data

    def has_eval(self):
        """True once the training data has been loaded with a hold-out split."""
        return self._eval_data is not None


class Model():
    def __init__(self, model_builder, input_data: InputData, is_gk, features=None):
        self.model = model_builder()
        self.features = features
        self.input_data = input_data
        self.is_gk = is_gk

    def fit(self):
        train_data = self.input_data.train_data(self.is_gk)
        self.model.fit(train_data[self.features], train_data['y'])

    def eval(self):
        if self.input_data.has_eval():
            eval_data = self.input_data.eval_data(self.is_gk)
        else:
            eval_data = self.input_data.eval_data(self.is_gk)
        if eval_data is None:
            return None, None
        return self.model.predict(eval_data[self.features]), eval_data

    def test(self):
        test_data = self.input_data.test_data(self.is_gk)
        if test_data is None:
            return None, None
        return self.model.predict(test_data[self.features]), test_data


class SplitGKModel():
    """Pair of ``Model`` instances: one for goalkeepers, one for all other players.

    Parameters
    ----------
    model_builder : zero-argument callable returning a fresh estimator;
        it is called once per sub-model.
    input_data : data source shared by both sub-models.
    gk_features : feature columns for the goalkeeper model.
    not_gk_features : feature columns for the non-goalkeeper model.
    """

    # The annotation is a string so this class can be defined without
    # ``InputData`` being in scope yet.
    def __init__(self, model_builder, input_data: 'InputData', gk_features=None, not_gk_features=None):
        self.gk_model = Model(model_builder, input_data, True, gk_features)
        self.not_gk_model = Model(model_builder, input_data, False, not_gk_features)
        self.input_data = input_data

    def fit(self):
        """Fit both sub-models."""
        self.gk_model.fit()
        self.not_gk_model.fit()

    def evaluate(self):
        """Mean absolute error over the held-out rows of both groups.

        Returns ``(mae, gk_predictions, not_gk_predictions)``. A group whose
        sub-model reports no evaluation data (``(None, None)``) is left out
        of the average; if no rows are available at all, ``mae`` is NaN.
        """
        gk_preds, gk_eval = self.gk_model.eval()
        field_preds, field_eval = self.not_gk_model.eval()

        abs_error_total = 0.0
        row_count = 0
        for preds, frame in ((gk_preds, gk_eval), (field_preds, field_eval)):
            if frame is None:
                # No hold-out split exists for this group.
                continue
            abs_error_total += np.abs(frame['y'] - preds).sum()
            row_count += len(frame['y'])

        mae = abs_error_total / row_count if row_count else float('nan')
        return mae, gk_preds, field_preds

    def test(self):
        """Return ``(gk_predictions, not_gk_predictions)`` for the test rows."""
        gk_preds, _ = self.gk_model.test()
        field_preds, _ = self.not_gk_model.test()
        return gk_preds, field_preds

In [411]:
class Trainer():
    """Builds the data source and the split model, then trains / evaluates / predicts.

    Parameters
    ----------
    model_builder : zero-argument callable returning a fresh estimator.
    gk_features : feature columns for the goalkeeper model.
    not_gk_features : feature columns for the other model; defaults to
        ``gk_features`` when omitted.
    eval_ratio : fraction of training rows held out for evaluation.
    train_file_path, test_file_path : input CSV locations.
    submit_file_path : CSV used as the template for the prediction file.
    prediction_file_path : where ``train_and_test`` writes its result.
    """

    def __init__(self, model_builder, gk_features=None, not_gk_features=None, eval_ratio=0.2,
                 train_file_path='../dataset/fifa2018/train.csv',
                 test_file_path='../dataset/fifa2018/test.csv',
                 submit_file_path='../dataset/fifa2018/sample_submit.csv',
                 prediction_file_path='../dataset/fifa2018/prediction.csv'):
        self.train_file_path = train_file_path
        self.test_file_path = test_file_path
        self.submit_file_path = submit_file_path
        self.prediction_file_path = prediction_file_path
        if not_gk_features is None:
            not_gk_features = gk_features

        self.input_data = InputData(self.train_file_path, self.test_file_path, eval_ratio)
        self.model = SplitGKModel(model_builder, self.input_data, gk_features, not_gk_features)

    def train(self):
        """Fit the model and return ``(score, gk_predictions, not_gk_predictions)`` from its evaluation."""
        self.model.fit()
        result, gk_preds, not_gk_preds = self.model.evaluate()
        return result, gk_preds, not_gk_preds

    def train_and_test(self):
        """Fit the model, predict the test table and write the prediction CSV.

        Predictions are copied into the ``y`` column of the submission
        template by position, so the template is assumed to list its rows
        in the same order as the test table.

        Raises
        ------
        ValueError
            If the template and the test table have different row counts.
        """
        self.model.fit()
        gk_preds, not_gk_preds = self.model.test()
        test = self.input_data.all_test_data()
        submit = pd.read_csv(self.submit_file_path)
        if len(submit) != len(test):
            raise ValueError(
                'submission template has %d rows but the test data has %d'
                % (len(submit), len(test)))

        # Scatter the two groups' predictions back into test-table row order.
        gk_mask = test['is_gk'].astype(bool)
        test.loc[gk_mask, 'pred'] = gk_preds
        test.loc[~gk_mask, 'pred'] = not_gk_preds

        submit['y'] = np.array(test['pred'])
        submit.to_csv(self.prediction_file_path, index=False)


class RunnerBase():
    """Base runner: couples an estimator factory (``_build``) with a ``Trainer``.

    Subclasses override ``_build`` to supply the estimator.
    """

    def _build(self):
        """Estimator factory hook; the base implementation yields ``None``."""
        return None

    def _trainer(self, gk_features, not_gk_features=None, eval_ratio=0.2):
        """Create a ``Trainer`` that uses this runner's ``_build`` as its model builder."""
        trainer = Trainer(self._build, gk_features, not_gk_features, eval_ratio)
        return trainer

    def train(self, gk_features, not_gk_features=None):
        """Train with the default hold-out ratio and print the first item of the trainer's result."""
        outcome = self._trainer(gk_features, not_gk_features).train()
        print(outcome[0])

    def train_and_test(self, gk_features, not_gk_features=None):
        """Train with an evaluation ratio of 0 and run the trainer's test step."""
        no_holdout = 0
        full_trainer = self._trainer(gk_features, not_gk_features, no_holdout)
        full_trainer.train_and_test()


class XGBoostRunner(RunnerBase):
    """Runner whose estimator is an ``xgb.XGBRegressor`` with a configurable tree depth."""

    def __init__(self, max_depth=8):
        # Forwarded as ``max_depth`` to every regressor created by _build().
        self.max_depth = max_depth

    def _build(self):
        """Return a new, unfitted ``XGBRegressor`` using this runner's settings."""
        settings = dict(
            max_depth=self.max_depth,
            learning_rate=0.1,
            n_estimators=160,
            silent=False,
            objective='reg:gamma',
        )
        return xgb.XGBRegressor(**settings)

In [412]:
# Feature columns handed to the runner for both player groups.
features = [
    'height_cm', 'weight_kg', 'potential', 'BMI', 'pac',
    'phy', 'international_reputation', 'age', 'best_pos',
]


def train1():
    """Run XGBoostRunner.train with the module-level ``features`` list."""
    runner = XGBoostRunner()
    runner.train(features)


def train_and_test1():
    """Run XGBoostRunner.train_and_test with the module-level ``features`` list."""
    runner = XGBoostRunner()
    runner.train_and_test(features)

In [413]:
# Data source for the inspection cells; split_ratio is left at its default,
# and the CSVs are only read when data is first requested.
inputdata=InputData(train_data_path='../dataset/fifa2018/train.csv',test_data_path='../dataset/fifa2018/test.csv')

In [382]:
# Feature columns for the models (redefined here so the cell can run on its own).
features=['height_cm', 'weight_kg', 'potential', 'BMI', 'pac',
                           'phy', 'international_reputation', 'age', 'best_pos']
# First five training rows with is_gk == False, including the derived columns.
inputdata.train_data(False).head(5)

Unnamed: 0,id,club,league,birth_date,height_cm,weight_kg,nationality,potential,pac,sho,...,cm,cdm,cb,lb,gk,y,age,BMI,is_gk,best_pos
6935,6935,308,28,1992-03-13,174,71,5,64,77,58,...,59.0,52.0,48.0,52.0,,26.0,26.106849,23.450918,False,65.0
3660,3660,369,26,1984-02-29,191,90,64,66,54,39,...,52.0,65.0,69.0,65.0,,26.0,34.147945,24.670376,False,69.0
6520,6520,511,39,1991-04-29,177,70,49,72,71,61,...,71.0,69.0,65.0,67.0,,330.0,26.980822,22.343516,False,71.0
7995,7995,93,6,1995-10-21,180,76,111,71,65,49,...,57.0,58.0,55.0,56.0,,20.0,22.49863,23.45679,False,58.0
8126,8126,141,20,1991-12-30,187,85,110,76,77,70,...,54.0,43.0,42.0,45.0,,320.0,26.309589,24.307244,False,70.0


In [414]:
# Fit without a hold-out split and write the prediction CSV.
train_and_test1()

In [122]:
# Fill missing values in the selected columns using SimpleImputer's default settings.
impute_columns = ['rw', 'st', 'lw', 'cf', 'cam', 'cm']
imputer = SimpleImputer()
x_new = imputer.fit_transform(traindata.loc[:, impute_columns])

In [128]:
# IPython help lookup: shows the SimpleImputer signature and docstring.
SimpleImputer?

Init signature: SimpleImputer(missing_values=nan, strategy='mean', fill_value=None, verbose=0, copy=True, add_indicator=False)
Docstring:
Imputation transformer for completing missing values.

Read more in the :ref:`User Guide <impute>`.

Parameters
----------
missing_values : number, string, np.nan (default) or None
    The placeholder for the missing values. All occurrences of
    `missing_values` will be imputed.

strategy : string, optional (default="mean")
    The imputation strategy.

    - If "mean", then replace missing values using the mean along
      each column. Can only be used with numeric data.
    - If "median", then replace missing values using the median along
      each column. Can only be used with numeric data.

In [74]:
# Mean absolute difference between a and b.
# NOTE: `a` and `b` are not defined in any cell shown in this notebook.
np.abs(a-b).sum()/len(a)

1.6666666666666667

Type:            ndarray
String form:     [1 4 5]
Length:          3
File:            ~/.pyenv/versions/anaconda3-5.3.0/envs/pytorch/lib/python3.6/site-packages/numpy/__init__.py
Docstring:       <no docstring>
Class docstring:
ndarray(shape, dtype=float, buffer=None, offset=0,
        strides=None, order=None)

An array object represents a multidimensional, homogeneous array
of fixed-size items.  An associated data-type object describes the
format of each element in the array (its byte-order, how many bytes it
occupies in memory, whether it is an integer, a floating point number,
or something else, etc.)

Arrays should be constructed using `array`, `zeros` or `empty` (refer
to the See Also section below).  The parameters given here refer to
a low-level method (`ndarray(...)`) for instantiating an array.

For more information, refer to the `numpy` module and examine the
methods and attributes of an array.

Parameters
---