# Extreme Gradient Boosting

## 1. Prepare the data

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, f1_score, confusion_matrix, classification_report

In [37]:
# NOTE(review): the path ends in ".csv" but is opened with read_pickle, so the
# file is presumably a pickled DataFrame with a misleading extension -- confirm
# and consider renaming it. Unpickling can execute arbitrary code, so only load
# this file from a trusted source.
df_football = pd.read_pickle('data/df_data_cleaned.csv')
# Preview the first rows only instead of rendering the whole frame.
df_football.head(10)

Unnamed: 0,Name,Age,Overall,Potential,Club,Value,Wage,Special,Preferred Foot,International Reputation,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,L. Messi,31.0,94.0,94.0,FC Barcelona,1.105e+08,5.65e+07,2202.0,0.0,5.0,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,226500000.0
1,Cristiano Ronaldo,33.0,94.0,94.0,Juventus,7.7e+07,4.05e+07,2228.0,1.0,5.0,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,127100000.0
2,Neymar Jr,26.0,92.0,93.0,Paris Saint-Germain,1.185e+08,2.9e+07,2143.0,1.0,5.0,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,228100000.0
3,K. De Bruyne,27.0,91.0,92.0,Manchester City,1.02e+08,3.55e+07,2281.0,1.0,4.0,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,196400000.0
4,E. Hazard,27.0,91.0,91.0,Chelsea,9.3e+07,3.4e+07,2142.0,1.0,4.0,...,91.0,34.0,27.0,22.0,11.0,12.0,6.0,8.0,8.0,172100000.0
5,L. Modrić,32.0,91.0,91.0,Real Madrid,6.7e+07,4.2e+07,2280.0,1.0,4.0,...,84.0,60.0,76.0,73.0,13.0,9.0,7.0,14.0,9.0,137400000.0
6,L. Suárez,31.0,91.0,91.0,FC Barcelona,8e+07,4.55e+07,2346.0,1.0,5.0,...,85.0,62.0,45.0,38.0,27.0,25.0,31.0,33.0,37.0,164000000.0
7,Sergio Ramos,32.0,91.0,91.0,Real Madrid,5.1e+07,3.8e+07,2201.0,1.0,4.0,...,82.0,87.0,92.0,91.0,11.0,8.0,9.0,7.0,11.0,104600000.0
8,R. Lewandowski,29.0,90.0,90.0,FC Bayern München,7.7e+07,2.05e+07,2152.0,1.0,4.0,...,86.0,34.0,42.0,19.0,15.0,6.0,12.0,8.0,10.0,127100000.0
9,T. Kroos,28.0,90.0,90.0,Real Madrid,7.65e+07,3.55e+07,2190.0,1.0,4.0,...,85.0,72.0,79.0,69.0,10.0,11.0,13.0,7.0,10.0,156800000.0


In [54]:
# Columns that are not used as model inputs; 'Value' is the regression target.
# NOTE(review): 'Release Clause' is left in the feature matrix -- confirm that
# it does not leak information about 'Value'.
NON_FEATURE_COLUMNS = ['Name', 'Club', 'Position', 'Preferred Foot', 'Value', 'Wage']

# Seed the split so that it (and every metric computed from it) is reproducible
# after Restart & Run All; 2019 is the same seed the models use.
train, test = train_test_split(
    df_football, train_size=0.75, test_size=0.25, random_state=2019
)

X_train = train.drop(columns=NON_FEATURE_COLUMNS)
y_train = train[['Value']]  # double brackets keep the target as a one-column DataFrame
X_test = test.drop(columns=NON_FEATURE_COLUMNS)
y_test = test[['Value']]

In [55]:
# Cast all four splits (features and targets) to float in one pass.
X_train, y_train, X_test, y_test = (
    split.astype(float) for split in (X_train, y_train, X_test, y_test)
)

In [56]:
# Baseline XGBoost regressor: default hyper-parameters, fixed seed.
xgb_reg = XGBRegressor(random_state=2019, n_jobs=-1)
xgb_reg.fit(X_train, y_train)

# Predict once per split and reuse the results for every metric.
train_predictions = xgb_reg.predict(X_train)
predictions = xgb_reg.predict(X_test)

# sklearn metrics take (y_true, y_pred). np.sqrt of the MSE is the RMSE, so the
# train line is labelled RMSE to match the test line.
print('MAE in train:', mean_absolute_error(y_train, train_predictions))
print('RMSE in train:', np.sqrt(mean_squared_error(y_train, train_predictions)))
print('MAE in test:', mean_absolute_error(y_test, predictions))
print('RMSE in test:', np.sqrt(mean_squared_error(y_test, predictions)))

MAE in train: 5606779.643521921
MSE in train: 11672110.91480139
MAE in test: 6089030.551304768
RMSE in test: 12418610.580174146


In [60]:
# Second model: same seed as the baseline, but with 400 boosting rounds.
xgb_reg2 = XGBRegressor(n_estimators=400,
                        random_state=2019, n_jobs=-1)  # set n_jobs to -2 if your computer is not powerful
xgb_reg2.fit(X_train, y_train)

# Predict once per split and reuse the results for every metric.
train_predictions = xgb_reg2.predict(X_train)
predictions = xgb_reg2.predict(X_test)

# sklearn metrics take (y_true, y_pred). np.sqrt of the MSE is the RMSE, so the
# train line is labelled RMSE to match the test line.
print('MAE in train:', mean_absolute_error(y_train, train_predictions))
print('RMSE in train:', np.sqrt(mean_squared_error(y_train, train_predictions)))
print('MAE in test:', mean_absolute_error(y_test, predictions))
print('RMSE in test:', np.sqrt(mean_squared_error(y_test, predictions)))

MAE in train: 4582143.341163924
MSE in train: 9602935.770158896
MAE in test: 5568568.676277479
RMSE in test: 11625082.52375289


In [58]:
# IPython help lookup: the trailing `?` displays the docstring of XGBRegressor.
XGBRegressor?

### Parameters
* learning_rate
* gamma
* min_child_weight
* max_delta_step
* subsample
* colsample_bytree
* colsample_bylevel
* colsample_bynode
* reg_alpha
* reg_lambda
* scale_pos_weight
* base_score
* missing
* importance_type