In [64]:
import pickle
import xgboost as xgb

import numpy as np
from sklearn.cross_validation import KFold, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_iris, load_digits, load_boston
%matplotlib inline
import seaborn as sns
from sklearn.metrics import r2_score, mean_squared_error, make_scorer

In [3]:
rng = np.random.RandomState(31337)

In [21]:
print("Zeros and Ones from the Digits dataset: binary classification")
digits = load_digits(2)
y = digits['target']
X = digits['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(confusion_matrix(actuals, predictions))

Zeros and Ones from the Digits dataset: binary classification
[[91  0]
 [ 1 88]]
[[87  0]
 [ 1 92]]


In [20]:
print("Iris: multiclass classification")
iris = load_iris()
y = iris['target']
X = iris['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index],eval_set=[(X[train_index], 
         y[train_index]),(X[test_index], y[test_index])],verbose=10)
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(confusion_matrix(actuals, predictions))

Iris: multiclass classification
[[27  0  0]
 [ 0 24  3]
 [ 0  0 21]]
[[23  0  0]
 [ 0 23  0]
 [ 0  2 27]]


[0]	validation_0-merror:0.013333	validation_1-merror:0.093333
[10]	validation_0-merror:0.000000	validation_1-merror:0.040000
[20]	validation_0-merror:0.000000	validation_1-merror:0.040000
[30]	validation_0-merror:0.000000	validation_1-merror:0.040000
[40]	validation_0-merror:0.000000	validation_1-merror:0.040000
[50]	validation_0-merror:0.000000	validation_1-merror:0.040000
[60]	validation_0-merror:0.000000	validation_1-merror:0.040000
[70]	validation_0-merror:0.000000	validation_1-merror:0.040000
[80]	validation_0-merror:0.000000	validation_1-merror:0.040000
[90]	validation_0-merror:0.000000	validation_1-merror:0.040000
[99]	validation_0-merror:0.000000	validation_1-merror:0.040000
[0]	validation_0-merror:0.040000	validation_1-merror:0.053333
[10]	validation_0-merror:0.026667	validation_1-merror:0.026667
[20]	validation_0-merror:0.026667	validation_1-merror:0.026667
[30]	validation_0-merror:0.013333	validation_1-merror:0.026667
[40]	validation_0-merror:0.013333	validation_1-merror:0.0

In [11]:
xgb.XGBRegressor()

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [4]:
print("Boston Housing: regression")
boston = load_boston()
y = boston['target']
X = boston['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index],eval_set=[(X[train_index], 
         y[train_index]),(X[test_index], y[test_index])],verbose=10)
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print( np.sqrt(mean_squared_error(actuals, predictions)))

print("Parameter optimization")
#y = boston['target']
#X = boston['data']
#xgb_model = xgb.XGBRegressor()
#clf = GridSearchCV(xgb_model,
#                   {'max_depth': [2,4,6]}, verbose=1)
#clf.fit(X,y)
print(clf.best_score_)
print(clf.best_params_)

Boston Housing: regression
4.71669334375
3.14875578541
Parameter optimization


[0]	validation_0-rmse:21.081469	validation_1-rmse:22.169455
[10]	validation_0-rmse:8.069112	validation_1-rmse:9.272619
[20]	validation_0-rmse:3.629229	validation_1-rmse:5.862863
[30]	validation_0-rmse:2.258594	validation_1-rmse:5.128814
[40]	validation_0-rmse:1.797065	validation_1-rmse:4.912023
[50]	validation_0-rmse:1.615941	validation_1-rmse:4.861473
[60]	validation_0-rmse:1.490326	validation_1-rmse:4.806803
[70]	validation_0-rmse:1.406585	validation_1-rmse:4.782348
[80]	validation_0-rmse:1.334839	validation_1-rmse:4.761765
[90]	validation_0-rmse:1.264957	validation_1-rmse:4.743332
[99]	validation_0-rmse:1.193217	validation_1-rmse:4.716693
[0]	validation_0-rmse:22.158699	validation_1-rmse:21.052099
[10]	validation_0-rmse:8.646101	validation_1-rmse:8.326352
[20]	validation_0-rmse:3.988879	validation_1-rmse:4.418694
[30]	validation_0-rmse:2.482219	validation_1-rmse:3.507356
[40]	validation_0-rmse:1.967204	validation_1-rmse:3.288592
[50]	validation_0-rmse:1.741761	validation_1-rmse:3.21

NameError: name 'clf' is not defined

In [63]:
print("Pickling sklearn API models")
# must open in binary format to pickle
pickle.dump(xgb_model, open("best_boston.pkl", "wb"))
clf2 = pickle.load(open("best_boston.pkl", "rb"))
print(np.allclose(xgb_model.predict(X_train), clf2.predict(X_train)))

Pickling sklearn API models
True


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1729)


In [None]:
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'accuracy', cv = 5, n_jobs = -1) 

cv_params = {'learning_rate': [0.1, 0.01], 'subsample': [0.7,0.8,0.9]}
ind_params = {'n_estimators': 1000, 'seed':0, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic', 'max_depth': 3, 'min_child_weight': 1}

optimized_GBM.grid_scores_

In [None]:
our_params = {'eta': 0.1, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic', 'max_depth':3, 'min_child_weight':1} 
# Grid Search CV optimized settings

cv_xgb = xgb.cv(params = our_params, dtrain = xgdmat, num_boost_round = 3000, nfold = 5,
                metrics = ['error'], # Make sure you enter metrics inside a list or you may encounter issues!
                early_stopping_rounds = 100) # Look for early stopping that minimizes error

In [91]:
import numpy as np
from sklearn.cross_validation import PredefinedSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
ps = PredefinedSplit(test_fold=[-1, -1, -1, 0,0])
#ps = PredefinedSplit(test_fold=[0, 1, 2, 1])
len(ps)

1

In [111]:
for train_index, test_index in ps:
    print("TRAIN:", train_index, "TEST:", test_index)

('TRAIN:', array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 1

In [4]:
boston = datasets.load_boston()
y = boston['target']
X = boston['data']
err = []
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    err.append(mean_squared_error(actuals, predictions))
    
    xgb.plot.importance

NameError: name 'KFold' is not defined

In [None]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=test_size, random_state=seed)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
from sklearn.grid_search import GridSearchCV
   xgb_model = xgb.XGBRegressor()
   clf = GridSearchCV(xgb_model, {'max_depth': [2,4, 6],
       'n_estimators': [100, 200, 250]}, verbose=1, n_jobs=2)
   clf.fit(X, y)

   print(clf.best_score_)
   print(clf.best_params_)

In [None]:
xgr = xgb.XGBRegressor(n_estimators=g['ne'], max_depth=g['md'], 
    seed=g['rs'], missing=np.nan, learning_rate=0.02, subsample=0.9, colsample_bytree=0.85, objective='reg:linear')

In [None]:
clf = xgb.XGBClassifier(missing=9999999999,
                max_depth = 5,
                n_estimators=1000,
                learning_rate=0.1, 
                nthread=4,
                subsample=1.0,
                colsample_bytree=0.5,
                min_child_weight = 3,
                scale_pos_weight = ratio,
                reg_alpha=0.03,
                seed=1301)
                
clf.fit(X_train, y_train, early_stopping_rounds=50, eval_metric="auc",
        eval_set=[(X_train, y_train), (X_test, y_test)])

In [None]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [66]:
import sys
print(sys.executable)

//anaconda/bin/python


In [34]:

y = boston['target']
X = boston['data']
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2,4,6],
                    'n_estimators': [50,100,200]}, verbose=1)
clf.fit(X,y)
print(clf.best_score_)
print(clf.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s


0.598487920717
{'n_estimators': 100, 'max_depth': 4}


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    0.9s finished


In [37]:
predictions = clf.predict(X_test)
print( np.sqrt(mean_squared_error(y_test, predictions)))

1.27293653096
