This notebook is part of the [Machine Learning class](https://github.com/erachelson/MLclass) by [Emmanuel Rachelson](https://personnel.isae-supaero.fr/emmanuel-rachelson?lang=en) and was written by Erwan Lecarpentier and Jonathan Sprauel.

License: CC-BY-SA-NC.

<div style="font-size:22pt; line-height:25pt; font-weight:bold; text-align:center;">XGBoost<br>Introduction to XGBoost</div>

This Practice Course is composed of 3 parts - each part is meant to be done in about 1 hour :
* In the **first notebook**, you will learn the **basic of XGBoost**, how to apply it on a dataset and tune it to obtain the best performances.
* In the **second notebook**, we will focus on **ensemble methods** and explain what makes XGBoost different from other models.
* Finally in the **last notebook** you will see how the choice of a method (such as XGBoost) is a key element of a tradeoff between **Bias and Variance**. 

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
sns.set_style('whitegrid')
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline


from xgboost import XGBClassifier
import xgboost as xgb

In [69]:
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


param = {
    'max_depth': 3,  
    'eta': 0.3, 
    'objective': 'multi:softprob',
    'num_class': 3}
num_round = 20 


bst = xgb.train(param, dtrain, num_round)

bst.dump_model('dump.raw.txt') # dump for model explanation

with open('dump.raw.txt', 'r') as file:
    data = file.read()
    print (data)

booster[0]:
0:[f2<2.45000005] yes=1,no=2,missing=1
	1:leaf=0.426035523
	2:leaf=-0.218845025
booster[1]:
0:[f2<2.45000005] yes=1,no=2,missing=1
	1:leaf=-0.213017777
	2:[f3<1.75] yes=3,no=4,missing=3
		3:[f2<4.94999981] yes=5,no=6,missing=5
			5:leaf=0.409090936
			6:leaf=-9.7534878e-009
		4:[f2<4.85000038] yes=7,no=8,missing=7
			7:leaf=-7.66345476e-009
			8:leaf=-0.210218996
booster[2]:
0:[f2<4.75] yes=1,no=2,missing=1
	1:[f3<1.45000005] yes=3,no=4,missing=3
		3:leaf=-0.217894763
		4:[f0<5.75] yes=7,no=8,missing=7
			7:leaf=-7.66345476e-009
			8:leaf=-0.155172437
	2:[f3<1.75] yes=5,no=6,missing=5
		5:[f2<5.05000019] yes=9,no=10,missing=9
			9:leaf=-0.0360000096
			10:leaf=0.179999992
		6:[f2<4.85000038] yes=11,no=12,missing=11
			11:leaf=0.128571421
			12:leaf=0.420437962
booster[3]:
0:[f2<2.45000005] yes=1,no=2,missing=1
	1:leaf=0.293278694
	2:leaf=-0.195823714
booster[4]:
0:[f2<2.45000005] yes=1,no=2,missing=1
	1:leaf=-0.189503655
	2:[f3<1.75] yes=3,no=4,missing=3
		3:[f2<4.94999981]

In [70]:

ypred1 = bst.predict(dtest, ntree_limit=1)
# by default, we predict using all the trees
ypred2 = bst.predict(dtest)
print('error of ypred1 = %f' % (np.sum((ypred1 > 0.5) != y_test) / float(len(y_test))))
print('error of ypred2 = %f' % (np.sum((ypred2 > 0.5) != y_test) / float(len(y_test))))

error of ypred1 = 0.033333
error of ypred2 = 0.033333
