# Combinations
Let's combine the different extra features that gave better results and see whether this improves the overall performance.

In [36]:
from util import get_wpm_train_test, get_label_columns

# Load the train/test split and additionally request the group labels.
train_x_full, train_y, test_x, test_y, groups = get_wpm_train_test(
    x_train_features_only=False,
    include_groups=True,
)

# Initially only the manual labels; later cells append extra feature names.
features = get_label_columns()

## Add features

In [37]:
from util import add_length_to_dataframe
# Register the feature name only once, so re-running this cell cannot leave a
# duplicate "Length" entry in `features`.
if "Length" not in features:
    features.append("Length")

# The same helper is applied to both splits so train and test stay aligned.
train_x_full = add_length_to_dataframe(train_x_full)
test_x = add_length_to_dataframe(test_x)

In [38]:
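# Headline embeddings are left out for now: the hyperparameter search is a lot faster without them.
# from util import add_headline_embedding_to_dataframe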
# train_x_full, extra_features = add_headline_embedding_to_dataframe(train_x_full)
# test_x, _ = add_headline_embedding_to_dataframe(test_x)
# features += extra_features

In [39]:
from util import add_diff_length
# Register each difference feature at most once, so re-running this cell does
# not leave duplicate names in `features`.
for diff_feature in ('NumWordsDiff', 'AvgWordLengthDiff', 'MaxWordLengthDiff'):
    if diff_feature not in features:
        features.append(diff_feature)

# The same helper is applied to both splits so train and test stay aligned.
train_x_full = add_diff_length(train_x_full)
test_x = add_diff_length(test_x)

## Evaluate

In [40]:
from util import fit_predict_evaluate_extra_features, get_xgboost_importance
# Fit and evaluate using the combined feature list; the first return value is
# discarded and the second is kept as `model_xgb`.
_, model_xgb = fit_predict_evaluate_extra_features(
    train_x_full,
    train_y,
    test_x,
    test_y,
    features,
    groups,
)

Accuracy: 56.59% (103/182)




Accuracy: 52.20% (95/182)


## Hyperparameter tuning

In [41]:
from util import get_winners_only, predict_wp, evaluate_wp
from xgboost import XGBClassifier
from sklearn.model_selection import GroupKFold
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Five group-aware folds, shared by every evaluation of the tuning objective.
group_kfold = GroupKFold(n_splits=5)

# Drop the existing index of both training frames (in place) so that they are
# numbered 0..n-1.
for _frame in (train_x_full, train_y):
    _frame.reset_index(drop=True, inplace=True)


def objective_cv(space):
    """Hyperopt objective: mean grouped cross-validation accuracy of an XGBClassifier.

    The folds come from the module-level `group_kfold`, grouped by the 'Test'
    column of `train_x_full`. In every fold the model is fitted on the
    `features` columns against the 'Winner' column of `train_y`, and scored
    with `predict_wp` / `evaluate_wp` against `get_winners_only` of the
    validation labels.

    Args:
        space: dict of keyword arguments for XGBClassifier, as sampled by
            hyperopt. A 'max_depth' entry is cast to int because
            `hp.quniform` yields floats. The dict itself is not modified.

    Returns:
        dict with 'loss' (the negated mean accuracy, since hyperopt minimises)
        and 'status' set to STATUS_OK.
    """
    # Work on a copy so the caller's dict is left untouched.
    params = dict(space)
    if 'max_depth' in params:
        params['max_depth'] = int(params['max_depth'])

    # Default to the seed used for the final model later in the notebook
    # (random_state=42); an explicit 'random_state' in `space` takes precedence.
    model = XGBClassifier(**{'random_state': 42, **params})

    accuracies = []
    fold_groups = train_x_full['Test']
    for train_index, val_index in group_kfold.split(train_x_full, train_y, groups=fold_groups):
        train_x_small = train_x_full.iloc[train_index]
        train_y_small = train_y.iloc[train_index]
        val_x = train_x_full.iloc[val_index]
        val_y = get_winners_only(train_y.iloc[val_index])

        model.fit(train_x_small[features], train_y_small['Winner'])

        predicted_winners = predict_wp(model, val_x, features=features)

        # One predicted winner is expected per validation winner.
        assert len(predicted_winners) == len(val_y)

        accuracies.append(evaluate_wp(val_y, predicted_winners))

    accuracy_mean = np.mean(accuracies)
    print(f"Accuracy mean: {accuracy_mean} for parameters {params}")

    return {'loss': -accuracy_mean, 'status': STATUS_OK}

In [None]:
# Search space for the XGBClassifier keyword arguments passed to objective_cv.
# The hp.quniform entries are sampled as floats; objective_cv casts max_depth to int.
space = {
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'gamma': hp.uniform('gamma', 1, 9),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'n_estimators': 500,
}
trials = Trials()

# Seed the search so that repeated runs propose the same sequence of trials.
# hyperopt >= 0.2.7 expects a numpy Generator here; older releases need
# np.random.RandomState(42) instead.
best_params = fmin(fn=objective_cv,
                   space=space,
                   algo=tpe.suggest,
                   max_evals=100,
                   trials=trials,
                   rstate=np.random.default_rng(42))
best_params

In [None]:
# Sad noises: I accidentally removed the results of the previous run, which took a while. The accuracy was 55.9%, which is still not that good.

For some reason this takes way longer than when running it in chapter 2e, even though I think the only difference is the extra features I've added. But we added a lot more extra features, so it makes sense that it takes so long. I'm first searching for hyperparameters without the embedded headline, since this is a lot faster.

In [45]:
# best_params = {'colsample_bytree': 0.8376842762481432, # TODO: These are still the parameters from another chapter
#                'gamma': 1.0100982566020316,
#                'learning_rate': 0.10560571568287097,
#                'max_depth': 5,
#                'reg_alpha': 41.0,
#                'reg_lambda': 0.46191366472424383}

In [46]:
from util import fit_predict_print_wp

# Defaults go first so that values found by the search take precedence, and a
# key that also appears in best_params cannot raise a duplicate-keyword TypeError.
final_params = {'n_estimators': 500, 'random_state': 42, **best_params}

# fmin reports hp.quniform values as floats, but max_depth has to be an integer.
if 'max_depth' in final_params:
    final_params['max_depth'] = int(final_params['max_depth'])

model = XGBClassifier(**final_params)

# NOTE(review): unlike the CV objective, no feature list is passed here — confirm
# that fit_predict_print_wp restricts the frames to the intended feature columns.
fit_predict_print_wp(model, train_x_full, train_y, test_x, test_y, groups=train_x_full['Test'])



Accuracy: 51.65% (94/182)
