### Ridge Regression Model With Hyperparameter and Feature Selection ###

In [1]:
import numpy as np

from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from Utils import *

#### Loading the Training and Testing Data ###

In [11]:
train = np.load('../Models/train.npy')

X = train[0:,0:-1]
y = train[:, -1]

X_train = np.load('../Models/X_train.npy')
y_train = np.load('../Models/y_train.npy')
X_test = np.load('../Models/X_test.npy')
y_test = np.load('../Models/y_test.npy')

In [3]:
rfecv_mae = RFECV(estimator=LinearRegression(), step=1, cv=10, scoring=mae_scorer_gs)
rfecv_rmse = RFECV(estimator=LinearRegression(), step=1, cv=10, scoring=rmse_scorer_gs)

rfecv_mae = rfecv_mae.fit(X_train, y_train)
rfecv_rmse = rfecv_rmse.fit(X_train, y_train)

prediction_mae = rfecv_mae.predict(X_test)
prediction_rmse = rfecv_rmse.predict(X_test)

print 'MAE {0}'.format(absolute_error(y_test, prediction_mae))
print 'RMSE {0}'.format(root_mean_squared(y_test, prediction_rmse))

# best_estim = rfecv.estimator_

MAE 3.55857969394
RMSE 4.58429548429


In [38]:
parameters = {
    'alpha': [0.01, 0.1, 1.0, 10, 100],
    'normalize': [True, False],
    'fit_intercept': [True, False]
}

In [39]:
import pandas as pd
df_data = pd.read_csv("../feature_engineered_dataset.csv")
feature_names = df_data.columns
import operator

In [40]:
def get_best_estimator(estimator, step, cv, scoring, parameters):
    clf_mae = GridSearchCV(estimator=estimator, param_grid=parameters, cv=cv, scoring=scoring)
    clf_mae.fit(X_train, y_train)
    return clf_mae.best_estimator_

In [41]:
ridge = Ridge()
# lasso = Lasso()

best_ridge = get_best_estimator(ridge, 1, 10, mae_scorer_gs, parameters)

print best_ridge

# best_lasso = get_best_estimator(lasso, 1, 10, mae_scorer_gs, parameters)

# print best_lasso

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)


In [45]:
print model_cross_validation(best_ridge, X, y, mae_scorer_cv, 10)

3.6038503309


In [49]:
lin_params = {
}

linear = LinearRegression()
best_linear = get_best_estimator(linear, 1, 10, mae_scorer_gs, lin_params)

In [50]:
print model_cross_validation(linear, X, y, mae_scorer_cv, 10)

3.61816601526


In [51]:
random_forest = RandomForestRegressor()

param_random_forest = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [5, 8, 15, 25, 30, None],
    'min_samples_split': [2, 5, 10, 15, 50],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': ['log2', 'sqrt', None]
}

best_forest = get_best_estimator(random_forest, 1, 10, mae_scorer_gs, param_random_forest)

KeyboardInterrupt: 

In [30]:
print best_forest.estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)


In [31]:
print model_cross_validation(best_forest.estimator_, X, y, mae_scorer_cv, 10)

3.77303567063


In [36]:
tree = DecisionTreeRegressor()

param_tree = {
#     'estimator__n_estimators': [1, 5, 10, 50, 100, 300, 500, 1000],
#     'estimator__max_depth': [5, 8, 15, 25, 30, None],
#     'estimator__min_samples_split': [2, 5, 10, 15, 100],
#     'estimator__min_samples_leaf': [1, 2, 5, 10],
#     'estimator__max_features': ['log2', 'sqrt', None, 10]
}

best_knn = get_best_estimator(tree, 1, 10, mae_scorer_gs, param_random_forest)

In [37]:
print model_cross_validation(best_knn.estimator_, X, y, mae_scorer_cv, 10)

5.26590219041


In [11]:
elastic = ElasticNet()

elastic_parameters = {
    'alpha': [0.01, 0.1, 1.0, 10, 100],
    'normalize': [True, False],
    'fit_intercept': [True, False]
}

best_elastic = get_best_estimator(elastic, 1, 10, mae_scorer_gs, elastic_parameters)



In [13]:
print model_cross_validation(best_elastic, X, y, mae_scorer_cv, 10)

3.60953299952


In [None]:
svr = SVR()

svr_parameters = {
    'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
    'kernel': ['rbf', 'poly'],
    'epsilon': [0.0, 0.1, 0.2, 0.5, 1.0]
}


best_svr = get_best_estimator(svr, 1, 10, mae_scorer_gs, svr_parameters)



In [18]:
print model_cross_validation(svr, X, y, mae_scorer_cv, 10)

5.80977421933
