In [27]:
import numpy as np
import pandas as pd
import sklearn
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

In [32]:
import warnings
warnings.filterwarnings('ignore')

In [29]:
seed = 8

In [25]:
X = pd.read_excel('X.xlsx')
y = pd.read_excel('y.xlsx')
X_test = pd.read_excel('X_test.xlsx')
y_test = pd.read_excel('y_test.xlsx')
X.set_index('Unnamed: 0', inplace=True)
X_test.set_index('Unnamed: 0', inplace=True)
y.set_index('Unnamed: 0', inplace=True)
y_test.set_index('Unnamed: 0', inplace=True)

In [30]:
from sklearn.feature_selection import SelectKBest, f_regression
select = SelectKBest(f_regression, k=40)

best_features = select.fit_transform(X, y)
feature_scores = pd.concat([pd.Series(X.columns), pd.Series(select.scores_)], axis=1)
feature_scores.columns = ["features", "score"]
features = list(feature_scores.sort_values(by=["score"], ascending=False).head(25)['features'])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=seed)

X_train = X_train[features]
X_test = X_test[features]
X_val = X_val[features]
print(X_train.columns.values)

['points' 'made_field_goals' 'attempted_field_goals' 'MPG'
 'minutes_played' 'Wins Added' 'WS' 'made_free_throws'
 'attempted_free_throws' 'offensive_box_plus_minus' 'poss' 'win_shares'
 'value_over_replacement_player' 'box_plus_minus' 'games_started'
 'war_reg_season' 'O-LEBRON' 'war_total' 'turnovers' 'predator_offense'
 'offensive_win_shares' 'player_efficiency_rating' 'raptor_offense'
 'steals' 'ORPM']
  return f(*args, **kwargs)


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.preprocessing import Normalizer

In [46]:
from sklearn.model_selection import cross_val_score
def performance(model):
    p = make_pipeline(model)
    scores = cross_val_score(p, X_train, y_train, cv=8, scoring="explained_variance")
    print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
    return scores, p

## Ensemble

In [7]:
params = {'n_estimators': 688,
 'min_samples_split': 6,
 'min_samples_leaf': 5,
 'max_features': 'log2',
 'max_depth': None,
 'learning_rate': 0.025000000000000015,
 'alpha': 0.5500000000000002,
 'subsample': 0.95,
 'loss': 'quantile'}
gbr = GradientBoostingRegressor(**params)

In [8]:
rf = RandomForestRegressor(n_estimators=806,
                                  max_depth=234, min_samples_split=4, min_samples_leaf=2, 
                                  max_features='log2', 
                                  bootstrap=True, oob_score=True, n_jobs=-1, random_state = 8)

In [11]:
xgb = XGBRegressor(reg_alpha= 1e-05,
 n_estimators= 475,
 min_child_weight= 5,
 max_depth= 8,
 learning_rate= 0.11,
 gamma= 0.002,
 colsample_bytree= 0.86)

In [12]:
estimators = [
    ("gbr", gbr),
    ("rf", rf),
    ('xgb', xgb)
]

In [11]:
from sklearn.ensemble import VotingRegressor, StackingRegressor

In [59]:
voting = VotingRegressor(estimators=estimators, n_jobs=-1, weights=[0.13, 0.12, 0.75])
scores, p = performance(voting)

0.83 accuracy with a standard deviation of 0.07


In [60]:
p.fit(X_train, y_train)

Pipeline(steps=[('votingregressor',
                 VotingRegressor(estimators=[('gbr',
                                              GradientBoostingRegressor(alpha=0.5500000000000002,
                                                                        learning_rate=0.025000000000000015,
                                                                        loss='quantile',
                                                                        max_depth=None,
                                                                        max_features='log2',
                                                                        min_samples_leaf=5,
                                                                        min_samples_split=6,
                                                                        n_estimators=688,
                                                                        subsample=0.95)),
                                             ('rf',
                  

In [61]:
p.score(X_val, y_val)

0.8497247013953071

In [62]:
p.score(X_test, y_test)

0.8884933973141513

In [39]:
stack = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(n_estimators=806,
                                  max_depth=234, min_samples_split=4, min_samples_leaf=2, 
                                  max_features='log2', 
                                  bootstrap=True, oob_score=True, n_jobs=-1, random_state = 8), n_jobs=-1)
scores_stacking, p_stacking = performance(stack)

0.81 accuracy with a standard deviation of 0.07


In [43]:
p_stacking.fit(X_train, y_train)

Pipeline(steps=[('normalizer', Normalizer()),
                ('stackingregressor',
                 StackingRegressor(estimators=[('gbr',
                                                GradientBoostingRegressor(alpha=0.5500000000000002,
                                                                          learning_rate=0.025000000000000015,
                                                                          loss='quantile',
                                                                          max_depth=None,
                                                                          max_features='log2',
                                                                          min_samples_leaf=5,
                                                                          min_samples_split=6,
                                                                          n_estimators=688,
                                                                          subsample=0.95)),
  

In [44]:
p_stacking.score(X_val, y_val)

0.8349615619997149

In [45]:
p_stacking.score(X_test, y_test)

0.717790135335179

In [14]:
stack = StackingRegressor(estimators=estimators, final_estimator=ExtraTreesRegressor())
performance(stack)

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


0.57 accuracy with a standard deviation of 0.04


array([0.6223825 , 0.56008695, 0.58215641, 0.5107953 , 0.55020298])

In [15]:
stack = StackingRegressor(estimators=estimators, final_estimator=GradientBoostingRegressor())
performance(stack)

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


0.59 accuracy with a standard deviation of 0.07


array([0.65195003, 0.63162309, 0.63239501, 0.4730435 , 0.55254162])

In [16]:
stack = StackingRegressor(estimators=estimators, final_estimator=voting)
performance(stack)

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


0.62 accuracy with a standard deviation of 0.10


array([0.72432419, 0.63563556, 0.70060434, 0.45291745, 0.59728051])

In [19]:
stack = StackingRegressor(estimators=estimators, final_estimator=stack)
performance(stack)

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


0.57 accuracy with a standard deviation of 0.06


array([0.56713567, 0.6259441 , 0.63199989, 0.46984662, 0.56212234])

In [20]:
stack = StackingRegressor(estimators=estimators, final_estimator=stack)
performance(stack)

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


0.51 accuracy with a standard deviation of 0.12


array([0.6124694 , 0.49305645, 0.67400245, 0.36714067, 0.40269671])