In [72]:
import pandas as pd
import numpy as np
import sklearn

In [73]:
import warnings
warnings.filterwarnings('ignore')

In [170]:
from sklearn.model_selection import train_test_split
import smogn

df = pd.read_excel("../spring21_data/fa_stats_v3.xlsx")
df = df.select_dtypes(exclude=['object'])
df.drop(columns=["Unnamed: 0", "Minutes", "salary", "clean_season", "age", "end season_y", "season_end"], inplace=True)

X = df.drop(columns=["cap_space_usage"])
y = df["cap_space_usage"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=77)
both = pd.concat([X_train, y_train], axis=1).reset_index().drop(columns=["index"])
smote_data = smogn.smoter(data = both,
                           y = 'cap_space_usage',
                           rel_xtrm_type = 'both',
                           rel_thres = 0.04,
                           pert = 1,
                           k = 9,
                           drop_na_col = True,       ## boolean (True or False)
                           drop_na_row = True,
                           samp_method = 'extreme')
sm = smote_data.sample(frac=1) ## make sure to shuffle the rows otherwise the cross validation will be scuffed
y = sm[['cap_space_usage']]
X = sm.drop(columns=['cap_space_usage'])

dist_matrix: 100%|##########| 126/126 [00:15<00:00,  7.99it/s]
r_index: 100%|##########| 88/88 [00:00<00:00, 447.41it/s]


In [171]:
from sklearn.feature_selection import SelectKBest, f_regression
select = SelectKBest(f_regression, k=20)

best_features = select.fit_transform(X, y)
feature_scores = pd.concat([pd.Series(X.columns), pd.Series(select.scores_)], axis=1)
feature_scores.columns = ["features", "score"]
features = list(feature_scores.sort_values(by=["score"], ascending=False).head(10)['features'])

X = X[features]
X_test = X_test[features]
print(len(X.columns.values))

10


In [8]:
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_squared_log_error
def evaluate(y_pred, y_test):
    """ Prints out the evaluation metrics of the experiment
        parameters:
            y_pred
                DataFrame or array-like, the predictions
            y_test
                DataFrame or array-like, the actual y-values
                
            these two parameters must have the same dimensions
    """
    print("r^2: ", r2_score(y_test, y_pred))
    print("mse: ", mean_squared_error(y_test, y_pred))
    print("variance_score: ", explained_variance_score(y_test, y_pred))
    print("mse-log: ", mean_squared_log_error(y_test, y_pred))

In [9]:
def run(model, model_name, x_train, x_val):
    """ Fits the model and prints out results from calling evaluate on the training and validation set
        parameters:
            model
                the unfitted model, from sklearn (has to have a fit and predict method)
            model_name
                String, the name of the model
            x_train
                DataFrame, the training set
            x_val
                DataFrame, the validation set
                
        returns:
            nothing
        
    """
    model = model.fit(x_train, y_train)
    y_pred = np.clip(model.predict(x_train), 0, 1000) # since negative results are meaningless
    
    print("[{0}] These are the results for the training set.".format(model_name))
    
    evaluate(y_pred, y_train)
    print("--------------------------------------")
    
    print("[{0}] These are the results for the validation set.".format(model_name))
    y_pred = np.clip(model.predict(x_val), 0, 1000) # since negative results are meaningless
    
    evaluate(y_pred, y_val)
    
    print("_____________________________________________")

In [76]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

### Without train-validation-test split
**Since we have bootstrap in random forest, as it is based on the idea of bagging**, we can use oob_score as score for validation set and **save the trouble of using train-validation-test split.**

**Below is the experimentation**

In [172]:
rf = RandomForestRegressor(n_estimators=100, criterion='mse', 
                                  max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                  max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, 
                                  bootstrap=True, oob_score=True, n_jobs=-1, random_state=77, verbose=0, warm_start=False, 
                                  ccp_alpha=0.0, max_samples=None)
rf.fit(X, y)
evaluate(rf.predict(X), y)

r^2:  0.958371408365447
mse:  0.00031169116287500516
variance_score:  0.958371641838038
mse-log:  0.00024238121068357324


In [173]:
rf.oob_score_

0.6929884416713787

In [174]:
evaluate(rf.predict(X_test), y_test)

r^2:  0.665782048176867
mse:  0.002125796423592449
variance_score:  0.72536049803876
mse-log:  0.001771189610736128


In [94]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1100, num = 50)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 500, num = 25)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [i for i in range(1, 10)]
# Minimum number of samples required at each leaf node
min_samples_leaf = [i for i in range(1, 10)]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [95]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(warm_start=True, bootstrap=True, oob_score=True) # use warm_start to speed up
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 500, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, y)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


RandomizedSearchCV(cv=5,
                   estimator=RandomForestRegressor(oob_score=True,
                                                   warm_start=True),
                   n_iter=500, n_jobs=-1,
                   param_distributions={'max_depth': [10, 30, 50, 71, 91, 112,
                                                      132, 152, 173, 193, 214,
                                                      234, 255, 275, 295, 316,
                                                      336, 357, 377, 397, 418,
                                                      438, 459, 479, 500,
                                                      None],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9],
                                        'min_samples_spli

In [96]:
rf_random.best_params_

{'n_estimators': 385,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 152}

In [193]:
best = RandomForestRegressor(n_estimators=1200,
                                  max_depth=300, min_samples_split=3, min_samples_leaf=4, 
                                  max_features='log2', 
                                  bootstrap=True, oob_score=True, n_jobs=-1, random_state = 77)
best.fit(X, y)
evaluate(best.predict(X), y)

r^2:  0.8347087915095992
mse:  0.0012376063413258706
variance_score:  0.8347089598737016
mse-log:  0.0009500395168141767


In [194]:
best.oob_score_

0.7070142037639034

In [195]:
evaluate(best.predict(X_test[features]), y_test)

r^2:  0.6558999764678572
mse:  0.002188651433570531
variance_score:  0.7300602610555369
mse-log:  0.0018315830134589192
