### Ridge Regression Model With Hyperparameter and Feature Selection ###

In [46]:
import numpy as np

from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import train_test_split

#### Scoring functions ####

In [47]:
def root_mean_squared(act_y, pred_y):
    """Return the root mean squared error between actual and predicted values.

    act_y  -- observed target values.
    pred_y -- predicted target values.

    The result is the square root of ``mean_squared_error(act_y, pred_y)``.
    """
    return np.sqrt(mean_squared_error(act_y, pred_y))

def absolute_error(act_y, pred_y):
    """Return the mean absolute error between actual and predicted values.

    act_y  -- observed target values.
    pred_y -- predicted target values.

    Thin wrapper that forwards both arguments to ``mean_absolute_error``.
    """
    return mean_absolute_error(act_y, pred_y)

In [48]:
# Both metrics are errors (lower is better).  greater_is_better=False tells
# make_scorer to treat them as losses, so model-selection tools that maximise
# the scorer end up minimising the underlying error.
mae_score_function = make_scorer(score_func=absolute_error, greater_is_better=False)
rmse_score_function = make_scorer(score_func=root_mean_squared, greater_is_better=False)

#### Loading the Training and Testing Data ####

In [49]:
# Load the pre-split arrays that were saved as .npy files under ../Models.
# NOTE(review): `train` is loaded but not referenced by any cell visible in
# this section -- confirm it is still needed before removing it.
train = np.load('../Models/train.npy')

X_train = np.load('../Models/X_train.npy')
y_train = np.load('../Models/y_train.npy')
X_test = np.load('../Models/X_test.npy')
y_test = np.load('../Models/y_test.npy')

# Sanity-check the split sizes.  The single-argument print(...) form produces
# the same output under Python 2 and also runs under Python 3, where the bare
# `print x` statement is a SyntaxError.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(2547, 49)
(2547,)
(637, 49)
(637,)


In [50]:
# Ridge regressor with default parameters.
# NOTE(review): `ridge` is not used by any other cell visible in this section.
ridge = Ridge()
# Disabled baseline: would fit on the training split and report RMSE / MAE on
# the test split.
# ridge.fit(X_train, y_train)
# pred = ridge.predict(X_test)
# print 'RMSE {0}, MAE {1}'.format(root_mean_squared(y_test, pred), absolute_error(y_test, pred))

In [53]:
# Recursive feature elimination with 10-fold cross-validation, dropping one
# feature per step, run once per metric so each selector optimises its own
# scorer.
# NOTE(review): the notebook title mentions Ridge, but this cell wraps
# LinearRegression.
rfecv_mae = RFECV(estimator=LinearRegression(), step=1, cv=10, scoring=mae_score_function)
rfecv_rmse = RFECV(estimator=LinearRegression(), step=1, cv=10, scoring=rmse_score_function)

# fit() returns the selector itself, so rebinding keeps the same objects.
rfecv_mae = rfecv_mae.fit(X_train, y_train)
rfecv_rmse = rfecv_rmse.fit(X_train, y_train)

# Each selector is scored on the held-out test split with its own metric.
prediction_mae = rfecv_mae.predict(X_test)
prediction_rmse = rfecv_rmse.predict(X_test)

# print(...) with a single argument behaves the same on Python 2 and also runs
# on Python 3, unlike the bare print statement.
print('MAE {0}'.format(absolute_error(y_test, prediction_mae)))
print('RMSE {0}'.format(root_mean_squared(y_test, prediction_rmse)))

# The estimator refit on the selected features is available as, e.g.:
# best_estim = rfecv_mae.estimator_

MAE 3.55857969394
RMSE 4.58429548429


In [89]:
# Hyperparameter grid handed to GridSearchCV as `param_grid` when the searched
# estimator is an RFECV selector.  The 'estimator__' prefix addresses the
# regressor wrapped inside the selector rather than the selector itself.
parameters = dict(
    estimator__alpha=[0.01, 0.1, 1.0, 10, 100],
    estimator__normalize=[True, False],
    estimator__fit_intercept=[True, False],
)

# Earlier, disabled sketch of the grid search.
# NOTE(review): it refers to `rfecv`, which no visible cell defines.
# gscv = GridSearchCV(estimator=rfecv, param_grid=parameters, scoring=mae_score_function)
# gscv.fit(X_train, y_train)
# new_reg = gscv.best_estimator_
# print type(new_reg)
# pred_value = new_reg.predict(X_test)
# print 'RMSE {0}, MAE {1}'.format(root_mean_squared(y_test, pred_value), absolute_error(y_test, pred_value))



In [112]:
import operator

import pandas as pd

# Column labels of the feature-engineered table; a later cell pairs them with
# the RFECV rankings.
# NOTE(review): the pairing is positional, so it assumes these columns line up
# one-to-one with the columns of X_train -- confirm (e.g. that no target or
# index column is included here).
df_data = pd.read_csv("../feature_engineered_dataset.csv")
feature_names = df_data.columns

In [105]:
# Tune the Ridge hyperparameters around RFECV feature selection, once per
# metric.
# NOTE: this is a nested search.  `parameters` holds 5 * 2 * 2 = 20 grid
# points, each evaluated on 10 outer folds, and every one of those fits runs
# RFECV's own 10-fold elimination.  The recorded run of this cell ended in
# KeyboardInterrupt; consider a smaller grid, fewer folds or a larger `step`
# before re-running.
estimator = Ridge()
selector_mae = RFECV(estimator, step=1, cv=10, scoring=mae_score_function)
selector_rmse = RFECV(estimator, step=1, cv=10, scoring=rmse_score_function)
clf_mae = GridSearchCV(estimator=selector_mae, param_grid=parameters, cv=10, scoring=mae_score_function)
clf_rmse = GridSearchCV(estimator=selector_rmse, param_grid=parameters, cv=10, scoring=rmse_score_function)
clf_mae.fit(X_train, y_train)
clf_rmse.fit(X_train, y_train)

# best_estimator_ is the selector refit with the winning parameter combination.
new_reg_mae = clf_mae.best_estimator_
pred_value_mae = new_reg_mae.predict(X_test)

new_reg_rmse = clf_rmse.best_estimator_
pred_value_rmse = new_reg_rmse.predict(X_test)

# print(...) with a single argument behaves the same on Python 2 and also runs
# on Python 3, unlike the bare print statement.
print('MAE {0}'.format(absolute_error(y_test, pred_value_mae)))
print('RMSE {0}'.format(root_mean_squared(y_test, pred_value_rmse)))

KeyboardInterrupt: 

In [116]:
# RFECV around plain least squares, one selector per metric (10-fold CV, one
# feature removed per step).
estimator = LinearRegression()
selector_mae = RFECV(estimator, step=1, cv=10, scoring=mae_score_function)
selector_rmse = RFECV(estimator, step=1, cv=10, scoring=rmse_score_function)

selector_mae.fit(X_train, y_train)
selector_rmse.fit(X_train, y_train)

pred_value_mae = selector_mae.predict(X_test)
pred_value_rmse = selector_rmse.predict(X_test)

# print(...) with a single argument behaves the same on Python 2 and also runs
# on Python 3, unlike the bare print statement.
print('MAE {0}'.format(absolute_error(y_test, pred_value_mae)))
print('RMSE {0}'.format(root_mean_squared(y_test, pred_value_rmse)))

# Pair every feature name with its RFECV rank and list the highest rank numbers
# first (rank 1 = kept by the selector, so eliminated features lead the list).
# The pairs are sorted directly instead of going through a dict: the sort is
# stable, so features with equal rank keep their column order rather than an
# arbitrary hash order, and duplicate names are not silently collapsed.
# NOTE(review): zip() stops at the shorter input, so a mismatch between
# len(feature_names) and the number of columns in X_train would go unnoticed --
# confirm the two line up.
mae_feature_selection = sorted(
    zip(feature_names, selector_mae.ranking_),
    key=operator.itemgetter(1),
    reverse=True,
)

rmse_feature_selection = sorted(
    zip(feature_names, selector_rmse.ranking_),
    key=operator.itemgetter(1),
    reverse=True,
)


MAE 3.55857969394
RMSE 4.58429548429
[('personal_fouls_per_game', 3), ('games', 2), ('rating', 1), ('steal_percentage', 1), ('free_throw_attempts_per_game', 1), ('offensive_rebounds_per_game', 1), ('assist_percentage', 1), ('defensive_box_plus_minux', 1), ('defensive_win_shares', 1), ('3-point_goal_percentage', 1), ('total_rebounds_per_game', 1), ('field_goals_per_game', 1), ('turnover_percentage', 1), ('2-point_field_goals_per_game', 1), ('block_percentage', 1), ('true_shooting_percentage', 1), ('field_goal_percentage', 1), ('3-point_field_goal_attempted_per_game', 1), ('total_rebound_percentage', 1), ('position_SF', 1), ('minutes_played', 1), ('position_C', 1), ('games_started', 1), ('free_throw_percentage', 1), ('offensive_box_plus_minux', 1), ('win_shares_per_48_minutes', 1), ('assists_per_game', 1), ('value_over_replacement_player', 1), ('3_point_attempt_rate', 1), ('free_throws_per_game', 1), ('blocks_per_game', 1), ('player_efficiency_rating', 1), ('free_throw_attempt_rate', 1),

In [117]:
import pprint

In [119]:
# Show the MAE-driven and RMSE-driven feature rankings one after the other,
# separated by a divider line.
pprint.pprint(mae_feature_selection)

# print(...) with a single argument behaves the same on Python 2 and also runs
# on Python 3, unlike the bare print statement.
print("--------------------------")

pprint.pprint(rmse_feature_selection)

[('personal_fouls_per_game', 3),
 ('games', 2),
 ('rating', 1),
 ('steal_percentage', 1),
 ('free_throw_attempts_per_game', 1),
 ('offensive_rebounds_per_game', 1),
 ('assist_percentage', 1),
 ('defensive_box_plus_minux', 1),
 ('defensive_win_shares', 1),
 ('3-point_goal_percentage', 1),
 ('total_rebounds_per_game', 1),
 ('field_goals_per_game', 1),
 ('turnover_percentage', 1),
 ('2-point_field_goals_per_game', 1),
 ('block_percentage', 1),
 ('true_shooting_percentage', 1),
 ('field_goal_percentage', 1),
 ('3-point_field_goal_attempted_per_game', 1),
 ('total_rebound_percentage', 1),
 ('position_SF', 1),
 ('minutes_played', 1),
 ('position_C', 1),
 ('games_started', 1),
 ('free_throw_percentage', 1),
 ('offensive_box_plus_minux', 1),
 ('win_shares_per_48_minutes', 1),
 ('assists_per_game', 1),
 ('value_over_replacement_player', 1),
 ('3_point_attempt_rate', 1),
 ('free_throws_per_game', 1),
 ('blocks_per_game', 1),
 ('player_efficiency_rating', 1),
 ('free_throw_attempt_rate', 1),
 ('p