# Combinations
Let's combine the different extra features that gave better results and see whether this improves the overall performance.

In [54]:
from util import get_wpm_train_test, get_label_columns

# Load the train/test split together with the per-row group labels.
# x_train_features_only=False presumably keeps the non-feature columns in train_x_full
# (later cells read train_x_full['Test']) — TODO confirm against util.
(train_x_full, train_y, test_x, test_y, groups) = get_wpm_train_test(
    include_groups=True,
    x_train_features_only=False,
)

# Initially only the manual labels; the feature cells below extend this list.
features = get_label_columns()

## Add features

In [55]:
from util import add_length_to_dataframe
# Register the feature name only once, so re-running this cell does not leave
# a duplicate "Length" entry in `features`.
if "Length" not in features:
    features.append("Length")

# Presumably adds the "Length" column to each frame — confirm in util.
train_x_full = add_length_to_dataframe(train_x_full)
test_x = add_length_to_dataframe(test_x)

In [56]:
from util import add_headline_embedding_to_dataframe
# The helper returns the updated frame plus a collection that is merged into
# `features` below (presumably the names of the embedding columns — confirm in util).
train_x_full, extra_features = add_headline_embedding_to_dataframe(train_x_full)
test_x, _ = add_headline_embedding_to_dataframe(test_x)

# Add only names that are not registered yet, so re-running this cell does not
# duplicate entries in `features`.
new_features = [name for name in extra_features if name not in features]
features.extend(new_features)

Batches:   0%|          | 0/57 [00:00<?, ?it/s]

  self[col] = igetitem(value, i)


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

In [57]:
from util import add_diff_length
# Register each feature name at most once, so re-running this cell does not
# leave duplicate entries in `features`.
for diff_feature in ('NumWordsDiff', 'AvgWordLengthDiff', 'MaxWordLengthDiff'):
    if diff_feature not in features:
        features.append(diff_feature)

# Presumably adds the three *Diff columns named above to each frame — confirm in util.
train_x_full = add_diff_length(train_x_full)
test_x = add_diff_length(test_x)

## Evaluate

In [40]:
from util import fit_predict_evaluate_extra_features, get_xgboost_importance
# Fit and evaluate on the combined feature list; only the second returned value
# is kept (as model_xgb), the first is discarded.
_, model_xgb = fit_predict_evaluate_extra_features(
    train_x_full,
    train_y,
    test_x,
    test_y,
    features,
    groups,
)

Accuracy: 56.59% (103/182)




Accuracy: 52.20% (95/182)


## Hyperparameter tuning

In [61]:
from util import get_winners_only, predict_wp, evaluate_wp
from xgboost import XGBClassifier
from sklearn.model_selection import GroupKFold
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# 5-fold splitter that keeps all rows sharing a group label in the same fold.
group_kfold = GroupKFold(n_splits=5)

# Replace the index of both frames with a fresh 0..n-1 range, in place.
# NOTE(review): presumably so features and labels stay aligned inside the helper
# functions used by objective_cv — confirm.
train_x_full.reset_index(drop=True, inplace=True)
train_y.reset_index(drop=True, inplace=True)


def objective_cv(space):
    """Hyperopt objective: grouped 5-fold cross-validated accuracy of one XGBoost configuration.

    Args:
        space: dict of keyword arguments for ``XGBClassifier``. A ``max_depth``
            entry, if present, is cast to ``int`` (``hp.quniform`` samples floats).
            The dict itself is not modified.

    Returns:
        dict with ``loss`` (negative mean fold accuracy, so that minimising the
        loss maximises accuracy), ``status`` (``STATUS_OK``) and
        ``loss_variance`` (sample variance of the fold accuracies, ``ddof=1``).
    """
    # Work on a copy so the dict handed in by the caller is never altered.
    params = dict(space)
    if 'max_depth' in params:
        params['max_depth'] = int(params['max_depth'])

    # NOTE(review): no random_state is set here, while the final models in this
    # notebook are built with random_state=42 — confirm this is intended.
    accuracies = []
    for train_index, val_index in group_kfold.split(train_x_full, train_y, groups=train_x_full['Test']):
        train_x_small, train_y_small = train_x_full.iloc[train_index], train_y.iloc[train_index]
        # Validation labels are reduced by get_winners_only before scoring.
        val_x, val_y = train_x_full.iloc[val_index], get_winners_only(train_y.iloc[val_index])

        # Fresh estimator per fold, so no fitted state is carried between folds.
        model = XGBClassifier(**params)
        model.fit(train_x_small[features], train_y_small['Winner'])

        predicted_winners = predict_wp(model, val_x, features=features)

        # One prediction is expected per entry of val_y.
        assert len(predicted_winners) == len(val_y)

        accuracies.append(evaluate_wp(val_y, predicted_winners))

    accuracy_mean = np.mean(accuracies)
    print(f"Accuracy mean: {accuracy_mean} for parameters {params}")

    # Added var based on this article https://www.databricks.com/blog/2021/04/15/how-not-to-tune-your-model-with-hyperopt.html
    return {'loss': -accuracy_mean, 'status': STATUS_OK, 'loss_variance': np.var(accuracies, ddof=1)}

In [59]:
# Search space handed to hyperopt's fmin; every entry becomes an XGBClassifier keyword
# argument inside objective_cv.
# hp.quniform samples floats rounded to multiples of the last argument, hp.uniform
# samples continuous values.
# NOTE(review): the 100-evaluation search recorded further down ended at reg_alpha=40.0
# and gamma≈1.01, i.e. at the lower edge of both ranges — consider widening them.
space = dict(
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    gamma=hp.uniform('gamma', 1, 9),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    max_depth=hp.quniform('max_depth', 2, 8, 1),
    n_estimators=500,  # fixed, not searched
)

In [None]:
# Opt-in switch for the expensive 100-evaluation search (the recorded run took about
# four minutes without the headline embedding and is far slower with it).
# A plain flag is used instead of raising: raising a string is itself a TypeError in
# Python 3 (only BaseException subclasses can be raised), and a raising cell aborts
# "Run All". The best parameters of the recorded run are hard-coded in the next cell.
RUN_FULL_SEARCH = False

if RUN_FULL_SEARCH:
    trials = Trials()

    best_params = fmin(fn=objective_cv,
                       space=space,
                       algo=tpe.suggest,
                       max_evals=100,
                       trials=trials)
    print(best_params)

Note: Without embedding

100%|██████████| 100/100 [04:04<00:00,  2.44s/trial, best loss: -0.5605668398677375]

{'colsample_bytree': 0.660947785503167,
 'gamma': 1.009721450628837,
 'learning_rate': 0.15887155111036247,
 'max_depth': 6.0,
 'reg_alpha': 40.0,
 'reg_lambda': 0.47894997446546944}

For some reason this takes much longer than when running it in chapter 2e, even though I think the only difference is the extra features I've added. But we added far more features, which explains why it takes so long. I'm first searching for hyperparameters without the embedded headline, since this is a lot faster.

In [52]:
# Best configuration reported by the 100-evaluation search without the headline embedding
# (see the output above), with max_depth written as an integer instead of the reported 6.0.
best_params = dict(
    colsample_bytree=0.660947785503167,
    gamma=1.009721450628837,
    learning_rate=0.15887155111036247,
    max_depth=6,
    reg_alpha=40.0,
    reg_lambda=0.47894997446546944,
)

In [53]:
from util import fit_predict_print_wp

# Build a classifier from the tuned parameters (fixed seed, 500 trees) and hand it,
# together with the train and test splits, to fit_predict_print_wp.
# NOTE(review): `features` is not passed here, unlike in objective_cv — confirm which
# columns fit_predict_print_wp trains on.
model = XGBClassifier(
    n_estimators=500,
    random_state=42,
    **best_params,
)
fit_predict_print_wp(
    model,
    train_x_full,
    train_y,
    test_x,
    test_y,
    groups=train_x_full['Test'],
)



Accuracy: 54.40% (98/182)


In [63]:
# Let's try with embedding.
# Only 10 evaluations here; the recorded run below took roughly 55 s per trial.
trials = Trials()

best_params = fmin(
    fn=objective_cv,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)
best_params

Accuracy mean: 0.35260273972602735 for parameters {'colsample_bytree': 0.7664073804072375, 'gamma': 4.400556660788374, 'learning_rate': 0.04599037810814008, 'max_depth': 2, 'n_estimators': 500, 'reg_alpha': 154.0, 'reg_lambda': 0.5808846315837922}
Accuracy mean: 0.35260273972602735 for parameters {'colsample_bytree': 0.9417813439836679, 'gamma': 1.3343461247034236, 'learning_rate': 0.12882358325375412, 'max_depth': 3, 'n_estimators': 500, 'reg_alpha': 174.0, 'reg_lambda': 0.6386074504849956}
Accuracy mean: 0.5633915918752953 for parameters {'colsample_bytree': 0.5246147262298059, 'gamma': 1.360691548360549, 'learning_rate': 0.023337422631523323, 'max_depth': 4, 'n_estimators': 500, 'reg_alpha': 52.0, 'reg_lambda': 0.9079866239641216}
Accuracy mean: 0.4406991025035428 for parameters {'colsample_bytree': 0.6713526584957731, 'gamma': 8.835077982448018, 'learning_rate': 0.17267689900847327, 'max_depth': 7, 'n_estimators': 500, 'reg_alpha': 101.0, 'reg_lambda': 0.8749143432884918}
Accuracy 

{'colsample_bytree': 0.5246147262298059,
 'gamma': 1.360691548360549,
 'learning_rate': 0.023337422631523323,
 'max_depth': 4.0,
 'reg_alpha': 52.0,
 'reg_lambda': 0.9079866239641216}

100%|██████████| 10/10 [09:11<00:00, 55.15s/trial, best loss: -0.5633915918752953]

{'colsample_bytree': 0.5246147262298059,
 'gamma': 1.360691548360549,
 'learning_rate': 0.023337422631523323,
 'max_depth': 4.0,
 'reg_alpha': 52.0,
 'reg_lambda': 0.9079866239641216}

In [66]:
# Best configuration reported by the 10-evaluation search with the headline embedding
# (see the output above), with max_depth written as an integer instead of the reported 4.0.
best_params = dict(
    colsample_bytree=0.5246147262298059,
    gamma=1.360691548360549,
    learning_rate=0.023337422631523323,
    max_depth=4,
    reg_alpha=52.0,
    reg_lambda=0.9079866239641216,
)

In [67]:
# Build a classifier from the parameters tuned with the embedding features (fixed seed,
# 500 trees) and hand it, together with the train and test splits, to fit_predict_print_wp.
# NOTE(review): `features` is not passed here, unlike in objective_cv — confirm which
# columns fit_predict_print_wp trains on.
model = XGBClassifier(
    n_estimators=500,
    random_state=42,
    **best_params,
)
fit_predict_print_wp(
    model,
    train_x_full,
    train_y,
    test_x,
    test_y,
    groups=train_x_full['Test'],
)



Accuracy: 56.04% (102/182)
