In [1]:
from operator import itemgetter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from xgboost import XGBRegressor
from sklearn.linear_model import BayesianRidge, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# display multiple outputs in same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# restore objects and unpack them into variables
%store -r object_keep
df, df_adjust,X_train, X_test, y_train = itemgetter('df', 
                                                    'df_adjust',
                                                    'X_train',
                                                    'X_test', 
                                                    'y_train')(object_keep)

## Supervised Learning 
Will try range of different supervised learning models on our *labelled* data.

In [2]:
model_factory = [
    RandomForestRegressor(),
    XGBRegressor(nthread = 1),
    Ridge(),
    BayesianRidge(),
    ExtraTreesRegressor(),
    ElasticNet(),
    KNeighborsRegressor(),
    GradientBoostingRegressor()
]

for model in model_factory:
    model.seed = 42
    n_folds = 3

    scores = cross_val_score(estimator = model, X = X_train, y = y_train, cv = n_folds, scoring = "neg_mean_squared_error")
    score_description = " %0.2f (+/- %0.2f)" % (np.sqrt(scores.mean() * -1), scores.std() * 2)
    
    print('{model:25} CV-5 RMSE: {score}'.format(model = model.__class__.__name__, score = score_description))

RandomForestRegressor     CV-5 RMSE:  1150.33 (+/- 27744.53)
XGBRegressor              CV-5 RMSE:  1179.27 (+/- 32476.24)
Ridge                     CV-5 RMSE:  1206.49 (+/- 54349.06)
BayesianRidge             CV-5 RMSE:  1206.63 (+/- 53013.02)
ExtraTreesRegressor       CV-5 RMSE:  1191.45 (+/- 12762.87)
ElasticNet                CV-5 RMSE:  1259.21 (+/- 89904.13)
KNeighborsRegressor       CV-5 RMSE:  1246.34 (+/- 36060.54)
GradientBoostingRegressor CV-5 RMSE:  1085.72 (+/- 23514.35)


## Conclusion

See that Random Forest gives the best model performance without any hyperparameter tuning. No hyperparameter-tuning will be done for the sake of keeping this focussed on the process for dealing with this semi-supervised problem.

In [3]:
object_keep = {'df':df,
               'df_adjust': df_adjust, 
               'X_train': X_train, 
               'X_test': X_test, 
               'y_train': y_train}
%store object_keep

Stored 'object_keep' (dict)
