In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [419]:
import smogn

df = pd.read_excel("../spring21_data/fa_stats_v3.xlsx")
df = df.select_dtypes(exclude=['object'])
df.drop(columns=["Unnamed: 0", "Minutes", "salary", "clean_season", "age", "end season_y", "season_end"], inplace=True)

X = df.drop(columns=["cap_space_usage"])
y = df["cap_space_usage"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=77)
both = pd.concat([X_train, y_train], axis=1).reset_index().drop(columns=["index"])
smote_data = smogn.smoter(data = both,
                           y = 'cap_space_usage',
                           rel_xtrm_type = 'high',
                           rel_thres = 0.9,
                           k = 13,
                           samp_method = 'extreme')
sm = smote_data.sample(frac=1) ## make sure to shuffle the rows otherwise the cross validation will be scuffed
y = sm[['cap_space_usage']]
X = sm.drop(columns=['cap_space_usage'])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=77)

# scale the numerical features only (this leaves out the dummy features from earlier)

from sklearn.feature_selection import SelectKBest, f_regression
select = SelectKBest(f_regression, k=20)

best_features = select.fit_transform(X, y)
feature_scores = pd.concat([pd.Series(X.columns), pd.Series(select.scores_)], axis=1)
feature_scores.columns = ["features", "score"]
features = list(feature_scores.sort_values(by=["score"], ascending=False).head(40)['features'])

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train[features])
X_test = scaler.transform(X_test[features])
X_val = scaler.transform(X_val[features])

from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_squared_log_error
def evaluate(y_pred, y_test):
    """ Prints out the evaluation metrics of the experiment
        parameters:
            y_pred
                DataFrame or array-like, the predictions
            y_test
                DataFrame or array-like, the actual y-values
                
            these two parameters must have the same dimensions
    """
    print("r^2: ", r2_score(y_test, y_pred))
    print("mse: ", mean_squared_error(y_test, y_pred))
    print("variance_score: ", explained_variance_score(y_test, y_pred))
    print("mse-log: ", mean_squared_log_error(y_test, y_pred))
    
def run(model, model_name, x_train, x_val):
    """ Fits the model and prints out results from calling evaluate on the training and validation set
        parameters:
            model
                the unfitted model, from sklearn (has to have a fit and predict method)
            model_name
                String, the name of the model
            x_train
                DataFrame, the training set
            x_val
                DataFrame, the validation set
                
        returns:
            nothing
        
    """
    model = model.fit(x_train, y_train)
    y_pred = np.clip(model.predict(x_train), 0, 1000) # since negative results are meaningless
    
    print("[{0}] These are the results for the training set.".format(model_name))
    
    evaluate(y_pred, y_train)
    print("--------------------------------------")
    
    print("[{0}] These are the results for the validation set.".format(model_name))
    y_pred = np.clip(model.predict(x_val), 0, 1000) # since negative results are meaningless
    
    evaluate(y_pred, y_val)
    
    print("_____________________________________________")

dist_matrix: 100%|##########| 30/30 [00:00<00:00, 33.64it/s]
synth_matrix: 100%|##########| 30/30 [00:02<00:00, 11.84it/s]
r_index: 100%|##########| 2/2 [00:00<00:00, 94.78it/s]
  return f(*args, **kwargs)


In [223]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in tree
n_estimators = range(25, 300, 10)

# Maximum number of levels in tree
max_depth = range(3, 10, 2)

min_child_weight = range(1,6,2)
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_child_weight': min_child_weight,
               'gamma':[i/10.0 for i in range(0,20)],
               'colsample_bytree': [i/100.0 for i in range(60, 90, 5)],
               'learning_rate': [i/100.0 for i in range(5, 20, 5)]}

In [224]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
xgb = XGBRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
xgb_random = RandomizedSearchCV(estimator = xgb, param_distributions = random_grid, n_iter = 200, cv = 6, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
xgb_random.fit(X_train, y_train)

Fitting 6 folds for each of 200 candidates, totalling 1200 fits


RandomizedSearchCV(cv=6,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n...
                                          validate_parameters=None,
                                          verbosity=None),
                   n_iter=200, n_jobs=-1,
                   pa

In [66]:
from sklearn.model_selection import GridSearchCV

In [72]:
xgb = XGBRegressor(n_estimators=75, max_depth=5, learning_rate=0.1, subsample=1, min_child_weight= 3, gamma=0.0, colsample_bytree=0.75,
scale_pos_weight = 1)

grid = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}

gsearch = GridSearchCV(estimator = xgb, 
                       param_grid = grid, n_jobs=-1, cv=5)
gsearch.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=0.75, gamma=0.0,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=5, min_child_weight=3,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=75, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=1, subsample=1,
                                    tree_method=None, validate_parameters=None,
  

In [73]:
gsearch.best_params_

{'reg_alpha': 0.1}

In [225]:
xgb_random.best_params_

{'n_estimators': 235,
 'min_child_weight': 5,
 'max_depth': 3,
 'learning_rate': 0.05,
 'gamma': 0.0,
 'colsample_bytree': 0.7}

In [462]:
xgb = XGBRegressor(n_estimators=275, max_depth=8, learning_rate=0.11, min_child_weight= 3, gamma=0, colsample_bytree=0.7, reg_alpha=0.1, seed=0)

run(xgb, "XGB", X_train, X_val)

[XGB] These are the results for the training set.
r^2:  0.9967286432613292
mse:  4.488002298115169e-05
variance_score:  0.9967292630558048
mse-log:  3.313981980353039e-05
--------------------------------------
[XGB] These are the results for the validation set.
r^2:  0.825271118041037
mse:  0.002021783528651452
variance_score:  0.8407784129919451
mse-log:  0.0015653554988806998
_____________________________________________


In [463]:
evaluate(xgb.predict(X_test), y_test)

r^2:  0.7959330572401353
mse:  0.0012979697072703425
variance_score:  0.8108885390479148
mse-log:  0.0010518812378606396
