### Relevant imports/variables.
These are mostly straightforward.

In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [81]:
# Global constants shared by the helper routines and the prediction cells.
# PREDICTOR_COLS holds the feature column names 'pixel0' .. 'pixel783'.
NUM_PREDICTOR_COLS = 784
PREDICTOR_COLS = ['pixel' + suffix for suffix in map(str, range(NUM_PREDICTOR_COLS))]

### Relevant helper routines
These are self-explanatory.

In [82]:
def create_pipeline_and_out_of_sample_score(given_model,
                                            X_train,
                                            X_test,
                                            Y_train,
                                            Y_test):
    """Fit `given_model` inside a pipeline and score it on the held-out split.

    Parameters
    ----------
    given_model : estimator
        Estimator wrapped with `make_pipeline` and fitted on the training split.
    X_train, X_test : predictor data for the training and held-out splits.
    Y_train : object exposing `.values` (e.g. a DataFrame/Series)
        Training targets; flattened with `.values.ravel()` before fitting.
    Y_test : true targets for the held-out split.

    Returns
    -------
    tuple
        `(fitted_pipeline, out_of_sample_accuracy, predictions_on_X_test)`.
    """
    my_pipeline = make_pipeline(given_model)
    my_pipeline.fit(X_train, Y_train.values.ravel())
    predictions = my_pipeline.predict(X_test)
    # accuracy_score is documented as (y_true, y_pred); pass the arguments in
    # that order so the call stays correct if the metric is ever swapped for
    # an asymmetric one.
    out_of_sample_score = accuracy_score(Y_test, predictions)
    return (my_pipeline, out_of_sample_score, predictions)

def cross_validate(my_pipeline, X, Y):
    """Return the mean accuracy of `my_pipeline` from `cross_val_score` with cv=5."""
    fold_accuracies = cross_val_score(my_pipeline, X, Y, cv=5, scoring='accuracy')
    return fold_accuracies.mean()

def train_test_cross_validate(train_data,
                              given_model,
                              do_cross_validation=True,
                              X_columns=PREDICTOR_COLS,
                              Y_columns=['label']):
    """Fit `given_model` on a train split and report train, test and CV accuracy.

    The data is split by `get_train_test_data`, the model is fitted on the
    training part, and the training accuracy is printed. Optionally the model
    is also cross-validated on the complete (unsplit) data.

    Parameters
    ----------
    train_data : DataFrame containing `X_columns` and `Y_columns`.
    given_model : estimator to fit and evaluate.
    do_cross_validation : bool
        When False, cross-validation is skipped and -1 is returned in its place.
    X_columns : list of predictor column names.
    Y_columns : list of target column names.

    Returns
    -------
    tuple
        `(fitted_pipeline, out_of_sample_score, cross_validation_score)`;
        `cross_validation_score` is -1 when cross-validation is skipped.
    """
    (X_train, X_test, Y_train, Y_test, X, Y) = \
        get_train_test_data(train_data, X_columns, Y_columns)
    # The held-out predictions returned by the helper are not needed here.
    my_pipeline, out_of_sample_score, _ = \
        create_pipeline_and_out_of_sample_score(given_model, X_train, X_test, Y_train, Y_test)

    # Score the training split once and report it.
    training_score = accuracy_score(Y_train, my_pipeline.predict(X_train))
    print('Training score is {0}'.format(training_score))

    if do_cross_validation:
        # Cross-validate on the full data, not just the training split.
        cross_validation_score = cross_validate(make_pipeline(given_model), X, Y.values.ravel())
    else:
        # Sentinel meaning "cross-validation was not run".
        cross_validation_score = -1

    return (my_pipeline, out_of_sample_score, cross_validation_score)

def get_train_test_data(train_data,
                        X_columns=PREDICTOR_COLS,
                        Y_columns=['label']):
    """Select predictor/target columns and split them into train and test parts.

    The split uses `train_test_split` with `random_state=0` and otherwise
    default settings, so it is reproducible across calls.

    Returns
    -------
    tuple
        `(X_train, X_test, Y_train, Y_test, X, Y)` where `X` and `Y` are the
        unsplit predictor and target frames.
    """
    X, Y = train_data[X_columns], train_data[Y_columns]
    split_parts = train_test_split(X, Y, random_state=0)
    return tuple(split_parts) + (X, Y)



### Reading in data

In [83]:
# Load the labelled training set.
TRAIN_CSV_PATH = '../input/train.csv'
full_data = pd.read_csv(TRAIN_CSV_PATH)

### Checking for null values.

In [84]:
# True if any cell anywhere in the frame is missing.
full_data.isna().values.any()

False

### Split into training and validation sets

In [85]:
# Positional 80/20 split in file order (no shuffling): the first 80% of rows
# go to train_data, the remaining rows to validation_data.
LEN_TRAIN_SET = int(0.8 * len(full_data))
train_data = full_data.iloc[:LEN_TRAIN_SET]
validation_data = full_data.iloc[LEN_TRAIN_SET:]

In [None]:
# Sanity check: the two parts together account for every row.
assert len(train_data) + len(validation_data) == len(full_data)

### Set up for doing cross validation
Later, we try several values of the `n_estimators` parameter.

In [None]:
# Number of trees in the forest; reused by the later training cells.
NUM_RANDOM_FOREST_ESTIMATORS = 10

random_forest = RandomForestClassifier(n_estimators=NUM_RANDOM_FOREST_ESTIMATORS,
                                       random_state=0)
my_pipeline, out_of_sample_score, cross_validation_score = \
    train_test_cross_validate(full_data, random_forest)

print("Out of sample score is {0}\nCross val score is {1}".format(out_of_sample_score, cross_validation_score))



Training score is 0.9987936507936508


### Train final model on train data and make predictions on validation set.


In [None]:
X_train, X_validation, Y_train, Y_validation, X, Y = get_train_test_data(full_data)

# Fit a fresh forest on the training part of the split.
my_model = RandomForestClassifier(n_estimators=NUM_RANDOM_FOREST_ESTIMATORS,
                                  random_state=0)
my_model.fit(X_train, Y_train.values.ravel())

# Predict the held-out part and write true label, prediction and row index to a file.
predictions_validation = my_model.predict(X_validation)
Y_validation = (
    Y_validation
    .assign(prediction=predictions_validation)
    .assign(ImageId=lambda frame: frame.index)
)
Y_validation[['ImageId', 'label', 'prediction']].to_csv('validation_randomforest_sklearn.csv', index=False)

### Making predictions on test data.

In [None]:
# Train on complete dataset
(X_train, X_validation, Y_train, Y_validation, X, Y) = get_train_test_data(full_data)
my_model = RandomForestClassifier(random_state=0, n_estimators=NUM_RANDOM_FOREST_ESTIMATORS)

# Fit the model
my_model.fit(X, Y.values.ravel())


In [None]:
# Predict labels for the unlabelled test set and write the submission file.
test_data = pd.read_csv('../input/test.csv')
test_predictions = my_model.predict(test_data[PREDICTOR_COLS])

# ImageId is the row index shifted by one.
test_data = (
    test_data
    .assign(label=test_predictions)
    .assign(ImageId=lambda frame: frame.index + 1)
)
test_data[['ImageId', 'label']].to_csv('submission_randomforest_sklearn.csv', index=False)