### Relevant imports/variables.
These are mostly straightforward.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [None]:
# Global constants: the predictor columns are named pixel0 .. pixel<N-1>,
# where N is NUM_PREDICTOR_COLS.
NUM_PREDICTOR_COLS = 784
PREDICTOR_COLS = ['pixel{0}'.format(col_idx) for col_idx in range(NUM_PREDICTOR_COLS)]

### Relevant helper routines
These are self-explanatory.

In [None]:
def create_pipeline_and_out_of_sample_score(given_model,
                                            X_train,
                                            X_test,
                                            Y_train,
                                            Y_test):
    """Fit `given_model` inside a pipeline and score it on the test split.

    Parameters
    ----------
    given_model : estimator wrapped with `make_pipeline`.
    X_train, X_test : predictor data used for fitting / for prediction.
    Y_train, Y_test : label data; `Y_train` must expose `.values` because it
        is flattened to 1-D before fitting.

    Returns
    -------
    tuple
        `(fitted pipeline, accuracy on the test split, test predictions)`.
    """
    fitted_pipeline = make_pipeline(given_model)

    # The estimator is fitted on a flat 1-D label array.
    flat_train_labels = Y_train.values.ravel()
    fitted_pipeline.fit(X_train, flat_train_labels)

    test_predictions = fitted_pipeline.predict(X_test)
    # Note: arguments are passed as (predictions, truth); accuracy is the
    # fraction of matching entries, so the order does not change the score.
    test_accuracy = accuracy_score(test_predictions, Y_test)
    return (fitted_pipeline, test_accuracy, test_predictions)

def cross_validate(my_pipeline, X, Y):
    """Return the mean accuracy of `my_pipeline` over 5 cross-validation folds of (X, Y)."""
    fold_accuracies = cross_val_score(estimator=my_pipeline,
                                      X=X,
                                      y=Y,
                                      scoring='accuracy',
                                      cv=5)
    return fold_accuracies.mean()

def train_test_cross_validate(train_data,
                              given_model,
                              do_cross_validation=True,
                              X_columns=PREDICTOR_COLS,
                              Y_columns=['label']):
    """Fit a model on a train/test split and optionally cross-validate it.

    Parameters
    ----------
    train_data : frame holding both the predictor and label columns.
    given_model : estimator to fit (wrapped in a pipeline).
    do_cross_validation : when True, also run `cross_validate` on the full
        (unsplit) data with a fresh pipeline around `given_model`.
    X_columns : names of the predictor columns.
    Y_columns : names of the label column(s).

    Returns
    -------
    tuple
        `(fitted pipeline, out-of-sample accuracy, cross-validation score)`.
        The cross-validation score is -1 when `do_cross_validation` is False.

    Side effects
    ------------
    Prints the accuracy of the fitted pipeline on the training split.
    """
    (X_train, X_test, Y_train, Y_test, X, Y) = \
        get_train_test_data(train_data, X_columns, Y_columns)

    # The test-split predictions are not needed here, only the score.
    my_pipeline, out_of_sample_score, _ = \
        create_pipeline_and_out_of_sample_score(given_model, X_train, X_test, Y_train, Y_test)

    # Compute the training accuracy once and report it.
    training_score = accuracy_score(my_pipeline.predict(X_train), Y_train)
    print('Training score is {0}'.format(training_score))

    if do_cross_validation:
        cross_validation_score = cross_validate(make_pipeline(given_model), X, Y.values.ravel())
    else:
        # Sentinel meaning "cross-validation was skipped".
        cross_validation_score = -1

    return (my_pipeline, out_of_sample_score, cross_validation_score)

def get_train_test_data(train_data,
                        X_columns=PREDICTOR_COLS,
                        Y_columns=['label']):
    """Select predictor/label columns and split them into train and test parts.

    No imputation or other preprocessing is applied. The split uses
    `train_test_split` with its default proportions and `random_state=0`,
    so it is reproducible between runs.

    Returns
    -------
    tuple
        `(X_train, X_test, Y_train, Y_test, X, Y)` where `X` and `Y` are the
        unsplit predictor and label selections.
    """
    predictors = train_data[X_columns]
    labels = train_data[Y_columns]

    split_parts = train_test_split(predictors, labels, random_state=0)
    X_train, X_test, Y_train, Y_test = split_parts
    return (X_train, X_test, Y_train, Y_test, predictors, labels)



### Reading in data

In [None]:
# Load the labelled data set that is later split into training and validation parts.
full_data = pd.read_csv(filepath_or_buffer='../input/train.csv')

### Checking for null values.

In [None]:
# True if at least one cell anywhere in the frame is missing.
full_data.isna().to_numpy().any()

### Split into training and validation sets

In [None]:
# Positional split in file order (no shuffling): the first 80% of the rows
# are used for training, the remaining rows for validation.
LEN_TRAIN_SET = int(0.8 * len(full_data))
train_data = full_data.iloc[:LEN_TRAIN_SET]
validation_data = full_data.iloc[LEN_TRAIN_SET:]

In [None]:
# Sanity check: the two parts together must account for every row.
assert len(full_data) == len(train_data) + len(validation_data)

### Set up for doing cross validation
Later, we try several versions, playing with the n_estimators parameter.

In [None]:
# Commenting out, as we have another implementation later.
#(my_pipeline, out_of_sample_score, cross_validation_score) = \
#    train_test_cross_validate(full_data,
#                              XGBClassifier(seed=1),
#                              do_cross_validation=False,
#                              X_columns=PREDICTOR_COLS, 
#                              Y_columns=['label'])    
#print("Out of sample score is {0}\nCross val score is {1}".format(out_of_sample_score, cross_validation_score))



#### Using xgboost exact mode.
Until now, we have used the sklearn wrapper for xgboost. However, it is recommended to use the exact mode (xgboost's own `DMatrix`/`train` interface), as that gives more flexibility.


In [None]:
import xgboost as xgb

# Wrap the training and validation parts in xgboost's DMatrix container,
# using the pixel columns as predictors and the 'label' column as target.
train_label = train_data[['label']]
validation_label = validation_data[['label']]
dtrain = xgb.DMatrix(train_data[PREDICTOR_COLS], label=train_label)
dvalid = xgb.DMatrix(validation_data[PREDICTOR_COLS], label=validation_label)

# Evaluation sets reported during training. xgb.train uses the LAST entry
# for early stopping, so the validation set must come last; otherwise
# early stopping would monitor the training error instead.
watchlist = [(dtrain, 'training'), (dvalid, 'dvalidation')]


### Section on cross validation/out of sample testing.

This has been commented out to reduce the running time of the script.

In [None]:
#eta_num_rounds = [(0.3, 50), (0.2, 100), (0.1, 150), [0.05, 200], [0.02, 300]]


#for elem in eta_num_rounds :
#    eta = elem[0]
#    num_boost_rounds = elem[1]
#    param = {'eta' : eta, 'objective' : 'multi:softmax', 'num_class' : 10}
#    res = xgb.cv(param, dtrain, num_boost_round=num_boost_rounds, metrics={'merror'}, early_stopping_rounds=5 )
#    print(res[['train-merror-mean', 'train-merror-std']])
#    print(res[['test-merror-mean', 'test-merror-std']])
 #   print(eta, num_boost_rounds)

In [None]:
#for elem in eta_num_rounds :
#    eta = elem[0]
#    num_boost_rounds = elem[1]
#    param = {'eta' : eta, 'objective' : 'multi:softmax', 'num_class' : 10}
#    bst = xgb.train(param, dtrain, num_boost_round=num_boost_rounds, evals=watchlist, early_stopping_rounds=5 )
#    print(eta, num_boost_rounds)


In [None]:
# Train a 10-class softmax booster; training stops early when the monitored
# evaluation metric has not improved for 5 consecutive rounds.
eta = 0.2
num_boost_rounds = 100
param = dict(eta=eta, objective='multi:softmax', num_class=10)
bst = xgb.train(param,
                dtrain,
                num_boost_round=num_boost_rounds,
                evals=watchlist,
                early_stopping_rounds=5)
print(eta, num_boost_rounds)

# Fraction of validation rows whose predicted class differs from the true label.
predictions_valid = bst.predict(dvalid)
np.mean(predictions_valid != validation_data['label'].values)

### Making predictions on test data.

In [None]:
test_data = pd.read_csv('../input/test.csv')

# Predict with the booster trained above. The sklearn pipeline
# (`my_pipeline`) is only created in a commented-out cell, so referring to
# it here fails on a fresh kernel.
dtest = xgb.DMatrix(test_data[PREDICTOR_COLS])
# 'multi:softmax' yields class indices as floats; cast them to integers so
# the submission contains e.g. 3 rather than 3.0.
test_predictions = bst.predict(dtest).astype(int)

test_data['label'] = test_predictions
# Image ids are 1-based row numbers.
test_data['ImageId'] = test_data.index + 1
test_data[['ImageId', 'label']].to_csv('submission_boosting_trees_xgboost.csv', index=False)