# Evaluation

In [107]:
# imports

from src.utils import utils
from src.modelling import training as train
import configparser
from sklearn.metrics import roc_auc_score, classification_report
import numpy as np
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [108]:
proj_root = utils.get_proj_root()

config = configparser.ConfigParser(interpolation=None)
config.read(proj_root.joinpath('config/data_config.ini'))

preprocessed_data_rel_path = config['data_paths']['preprocessed_data_path']
preprocessed_data_path=  proj_root.joinpath(preprocessed_data_rel_path)

preprocessed_data = train.get_training_data(file_path=preprocessed_data_path)


model_output_dir = proj_root.joinpath(config['modelling_paths']['model_output'])
model_name = 'xgboost'
model_output_path = model_output_dir.joinpath(model_name+'.pkl')
model = utils.load_value(model_output_path)
print(model)

Pipeline(steps=[('cat_to_ordinal_cols',
                 ColumnsOrdinalEncoder(col_names=['industry', 'symbol'])),
                ('select_optimal_cols',
                 OptimalColumnSelector(optimal_cols_path=PosixPath('/home/aroge/projects/dividend-cut-predictor/models/artifacts/optimal_features.pkl'))),
                ['model',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsa...
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None,
                               learning_rate=0.05845944217474756, max_bin=None,
                               max_cat_threshold=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=48,
                               max_leaves=None, min_child_weight=1, missing=nan,
                               monotone_constraints=None, multi_strategy=None,
     

## overall model evaluation

In [109]:
final_year = int(config['year_limits']['end_year'])
label_col_name = 'dps_change_next_year'
training_data, testing_data = train.train_test_split(df=preprocessed_data, final_year=final_year)
X_test, y_test = train.split_Xy(testing_data, label_col_name=label_col_name)

y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)
score = roc_auc_score(y_true=y_test, y_score=y_pred_prob)


print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       450
           1       0.64      0.35      0.45        26

    accuracy                           0.95       476
   macro avg       0.80      0.67      0.71       476
weighted avg       0.95      0.95      0.95       476



## Check Top Predictions

In [112]:
def get_top_n_idx(a, n=20, order='desc'):

    if order == 'asc':

        idx_top_n = np.argpartition(a, n)[:n]
        idx_top_n = idx_top_n[np.argsort(a[idx_top_n])]
    else:
        idx_top_n = np.argpartition(a, -n)[-n:]
        idx_top_n = idx_top_n[np.argsort(a[idx_top_n])][::-1]

    return idx_top_n

n = 10
top_n_pred_idxs = get_top_n_idx(y_pred_prob, n=n, order='desc')

y_pred_prob_top_n = y_pred_prob[top_n_pred_idxs]
y_pred_top_n = y_pred[top_n_pred_idxs]
y_true = y_test.values[top_n_pred_idxs]


print(f'top {n} probabilities of cutting dividends for {final_year}:')
df = X_test.iloc[top_n_pred_idxs, [1,2]]
df['pred. prob'] = y_pred_prob_top_n
df['pred. class'] = y_pred_top_n
df['true class'] = y_true
display(df)
print('roc_auc_score: \n', roc_auc_score(y_true=y_true, y_score=y_pred_prob_top_n))
print(f'classification report: \n',classification_report(y_true=y_true, y_pred=y_pred_top_n))


top 10 probabilities of cutting dividends for 2022:


Unnamed: 0,industry,symbol,pred. prob,pred. class,true class
4130,Aerospace & Defense,TDG,0.965635,1,0
3422,Oil & Gas E&P,PXD,0.964874,1,1
1173,Oil & Gas E&P,COP,0.924209,1,1
1593,Oil & Gas E&P,EOG,0.919763,1,1
1357,Oil & Gas E&P,DVN,0.916032,1,1
2702,Specialty Chemicals,LYB,0.904745,1,1
1371,Oil & Gas E&P,FANG,0.884634,1,1
1250,Oil & Gas E&P,CTRA,0.825482,1,1
2593,Diagnostics & Research,LH,0.818853,1,0
3542,REIT - Industrial,PSA,0.763117,1,1


roc_auc_score: 
 0.4375
classification report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.80      1.00      0.89         8

    accuracy                           0.80        10
   macro avg       0.40      0.50      0.44        10
weighted avg       0.64      0.80      0.71        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
