In [1]:
import numpy as np
import pandas as pd

In [2]:
df_train = pd.read_csv('./data/train_features_ready.csv')
df_test = pd.read_csv('./data/test_features_ready.csv')

In [3]:
df_train.shape, df_test.shape

((213451, 317), (62096, 317))

In [5]:
from sklearn.preprocessing import LabelEncoder
labels = df_train['country_destination']
le = LabelEncoder()
y = le.fit_transform(labels)
X = df_train.drop('country_destination', axis=1, inplace=False)

In [6]:
idtrain = df_train['id']
X = X.drop('id', axis=1, inplace=False)

In [7]:
X.shape, y.shape

((213451, 315), (213451,))

In [8]:
# Grid Search - Used to find best combination of parameters
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

In [10]:
XGB_model = xgb.XGBClassifier(objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
param_grid = {'max_depth': [3], 'learning_rate': [0.1, 0.2, 0.3], 'n_estimators': [25]}
model = GridSearchCV(estimator=XGB_model, param_grid=param_grid, scoring='accuracy', verbose=10, n_jobs=1, iid=True, refit=True, cv=3)

In [11]:
model.fit(X, y)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=25, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=3, score=0.176434 - 5.2min
[CV] n_estimators=25, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=3, score=0.568931 - 4.5min

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:  5.2min
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:  9.7min



[CV] n_estimators=25, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=3, score=0.585444 - 4.3min
[CV] n_estimators=25, learning_rate=0.2, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.2, max_depth=3, score=0.093263 - 5.0min
[CV] n_estimators=25, learning_rate=0.2, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.2, max_depth=3, score=0.549915 - 5.2min
[CV] n_estimators=25, learning_rate=0.2, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.2, max_depth=3, score=0.586091 - 5.0min
[CV] n_estimators=25, learning_rate=0.3, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.3, max_depth=3, score=0.060741 - 5.6min
[CV] n_estimators=25, learning_rate=0.3, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.3, max_depth=3, score=0.427696 - 7.3min

[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed: 24.1min
[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed: 42.1min



[CV] n_estimators=25, learning_rate=0.3, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.3, max_depth=3, score=0.585992 - 6.4min
Best score: 0.444
Best parameters set:
	learning_rate: 0.1
	max_depth: 3
	n_estimators: 25


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 48.5min finished


In [12]:
param_grid = {'max_depth': [3,4,6], 'learning_rate': [0.1], 'n_estimators': [25]}
model = GridSearchCV(estimator=XGB_model, param_grid=param_grid, scoring='accuracy', verbose=10, n_jobs=1, iid=True, refit=True, cv=3)
model.fit(X, y)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=25, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=3, score=0.176434 - 4.8min
[CV] n_estimators=25, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=3, score=0.568931 - 4.2min

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:  4.8min
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:  9.1min



[CV] n_estimators=25, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=3, score=0.585444 -17.0min
[CV] n_estimators=25, learning_rate=0.1, max_depth=4 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=4, score=0.106361 - 7.0min
[CV] n_estimators=25, learning_rate=0.1, max_depth=4 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=4, score=0.225731 - 7.7min
[CV] n_estimators=25, learning_rate=0.1, max_depth=4 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=4, score=0.585318 - 5.5min
[CV] n_estimators=25, learning_rate=0.1, max_depth=6 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=6, score=0.054403 -10.0min
[CV] n_estimators=25, learning_rate=0.1, max_depth=6 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=6, score=0.180503 - 9.8min

[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed: 40.7min
[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed: 66.1min



[CV] n_estimators=25, learning_rate=0.1, max_depth=6 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=6, score=0.585247 - 9.2min
Best score: 0.444
Best parameters set:
	learning_rate: 0.1
	max_depth: 3
	n_estimators: 25


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 75.3min finished


In [13]:
param_grid = {'max_depth': [3], 'learning_rate': [0.1], 'n_estimators': [25,50,75]}
model = GridSearchCV(estimator=XGB_model, param_grid=param_grid, scoring='accuracy', verbose=10, n_jobs=1, iid=True, refit=True, cv=3)
model.fit(X, y)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=25, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=3, score=0.176434 - 4.1min
[CV] n_estimators=25, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=3, score=0.568931 - 4.0min

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:  4.1min
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:  8.2min



[CV] n_estimators=25, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=25, learning_rate=0.1, max_depth=3, score=0.585444 -13.0min
[CV] n_estimators=50, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=50, learning_rate=0.1, max_depth=3, score=0.091435 - 8.7min
[CV] n_estimators=50, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=50, learning_rate=0.1, max_depth=3, score=0.555621 - 8.4min
[CV] n_estimators=50, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=50, learning_rate=0.1, max_depth=3, score=0.586231 - 7.8min
[CV] n_estimators=75, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=75, learning_rate=0.1, max_depth=3, score=0.061149 -12.8min
[CV] n_estimators=75, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=75, learning_rate=0.1, max_depth=3, score=0.520597 -13.2min

[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed: 38.2min
[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed: 72.0min



[CV] n_estimators=75, learning_rate=0.1, max_depth=3 .................
[CV]  n_estimators=75, learning_rate=0.1, max_depth=3, score=0.586273 -12.7min
Best score: 0.444
Best parameters set:
	learning_rate: 0.1
	max_depth: 3
	n_estimators: 25


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 84.7min finished


In [15]:
'''Best parameters set:
	learning_rate: 0.1
	max_depth: 3
	n_estimators: 25
'''

'Best parameters set:\n\tlearning_rate: 0.1\n\tmax_depth: 3\n\tn_estimators: 25\n'