# Evaluation

In [171]:
# imports

from src.utils import utils
from src.modelling import training as train
import configparser
from sklearn.metrics import roc_auc_score, classification_report
import numpy as np
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [172]:
# Resolve every path against the project root so the notebook does not depend
# on the kernel's current working directory.
proj_root = utils.get_proj_root()

# interpolation=None keeps '%' characters in config values verbatim.
config = configparser.ConfigParser(interpolation=None)
config_path = proj_root.joinpath('config/data_config.ini')
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse; fail here with the offending path instead of hitting
# an opaque KeyError on the first section lookup below.
if not config.read(config_path):
    raise FileNotFoundError(f'Could not read config file: {config_path}')

# Preprocessed dataset used for both the train and test splits further down.
preprocessed_data_rel_path = config['data_paths']['preprocessed_data_path']
preprocessed_data_path = proj_root.joinpath(preprocessed_data_rel_path)

preprocessed_data = train.get_training_data(file_path=preprocessed_data_path)

# Fitted model artifact to evaluate; switch `model_name` to compare models.
model_output_dir = proj_root.joinpath(config['modelling_paths']['model_output'])
model_name = 'xgboost'
# model_name = 'random_forest'
model_output_path = model_output_dir.joinpath(model_name + '.pkl')
# NOTE(review): the '.pkl' suffix suggests load_value unpickles the file --
# confirm, and only load artifacts produced by this project.
model = utils.load_value(model_output_path)
print(model)

Pipeline(steps=[('select_optimal_cols',
                 OptimalColumnSelector(optimal_cols_path=PosixPath('/home/aroge/projects/dividend-cut-predictor/models/artifacts/optimal_features.pkl'))),
                ('cat_to_ordinal_cols', ColumnsOrdinalEncoder()),
                ['model',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytr...
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None,
                               learning_rate=0.026692793437760223, max_bin=None,
                               max_cat_threshold=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=50,
                               max_leaves=None, min_child_weight=6, missing=nan,
                               monotone_constraints=None, multi_strat

## Overall Model Evaluation

In [173]:
# Column holding the binary target that the model predicts.
label_col_name = 'dps_change_next_year'

# The test split is keyed on `final_year`, two years before the configured
# inference year.
# NOTE(review): the exact split rule lives in train.train_test_split -- confirm
# which years end up in the test set.
inf_year = int(config["year_limits"]["inf_year"])
final_year = inf_year - 2

training_data, testing_data = train.train_test_split(
    df=preprocessed_data,
    final_year=final_year,
)
X_test, y_test = train.split_Xy(testing_data, label_col_name=label_col_name)

# Column 1 of predict_proba is the probability of the positive class (1).
class_probabilities = model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]
y_pred = model.predict(X_test)

# ROC AUC uses the probabilities; the report uses the hard class predictions.
score = roc_auc_score(y_test, y_pred_prob)

print(score)
print(classification_report(y_test, y_pred))

0.8070872947277442
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       445
           1       0.75      0.12      0.20        26

    accuracy                           0.95       471
   macro avg       0.85      0.56      0.59       471
weighted avg       0.94      0.95      0.93       471



## Check Top Predictions

In [175]:
def get_top_n_idx(a, n=20, order='desc'):
    """Return the positions of the ``n`` most extreme values of ``a``, sorted.

    Parameters
    ----------
    a : array-like
        One-dimensional collection of comparable values. It is converted with
        ``np.asarray``, so lists work and a pandas Series is addressed by
        position rather than by index label.
    n : int, default 20
        How many positions to return. Values larger than ``len(a)`` are
        clamped to ``len(a)``; values ``<= 0`` give an empty result.
    order : str, default 'desc'
        ``'asc'`` returns the ``n`` smallest values, smallest first. Any other
        value returns the ``n`` largest values, largest first.

    Returns
    -------
    numpy.ndarray
        Integer positions into ``a``, ordered as described above. The relative
        order of tied values is not guaranteed.
    """
    values = np.asarray(a)

    # Clamp so that asking for more items than exist returns everything
    # instead of raising "kth out of bounds" inside argpartition.
    n = min(n, values.size)
    if n <= 0:
        # Also guards the `[-0:]` slice below, which would select *all* items.
        return np.empty(0, dtype=np.intp)

    if order == 'asc':
        # Partitioning at n-1 (not n) places the n smallest values in the
        # first n slots and stays in bounds when n == len(a).
        idx_top_n = np.argpartition(values, n - 1)[:n]
        idx_top_n = idx_top_n[np.argsort(values[idx_top_n])]
    else:
        # The n largest values land in the last n slots; sort ascending and
        # reverse to get largest-first.
        idx_top_n = np.argpartition(values, -n)[-n:]
        idx_top_n = idx_top_n[np.argsort(values[idx_top_n])][::-1]

    return idx_top_n

# Number of highest-probability test rows to inspect.
n = 10
top_n_pred_idxs = get_top_n_idx(y_pred_prob, n=n, order='desc')

# Predictions and ground truth for those rows, in descending-probability order.
y_pred_prob_top_n = y_pred_prob[top_n_pred_idxs]
y_pred_top_n = y_pred[top_n_pred_idxs]
y_true = y_test.values[top_n_pred_idxs]

print(f'top {n} probabilities of cutting dividends for {final_year+1}:')
# Positional columns 1 and 2 are the `industry` and `symbol` columns shown in
# the rendered table. `.assign` builds a new frame, so nothing is written into
# a selection of X_test (avoids SettingWithCopyWarning).
df = X_test.iloc[top_n_pred_idxs, [1, 2]].assign(**{
    'pred. prob': y_pred_prob_top_n,
    'pred. class': y_pred_top_n,
    'true class': y_true,
})
display(df)

# ROC AUC is undefined when every selected row has the same true class, which
# is exactly what a well-ranked top-n list can look like. Depending on the
# scikit-learn version the call then raises or returns NaN with a warning, so
# report the situation explicitly instead.
if np.unique(y_true).size < 2:
    print('roc_auc_score: \n', 'undefined (only one true class among the selected rows)')
else:
    print('roc_auc_score: \n', roc_auc_score(y_true=y_true, y_score=y_pred_prob_top_n))
print('classification report: \n', classification_report(y_true=y_true, y_pred=y_pred_top_n))


top 10 probabilities of cutting dividends for 2023:


Unnamed: 0,industry,symbol,pred. prob,pred. class,true class
3565,REIT - Industrial,PSA,0.72728,1,1
2715,Specialty Chemicals,LYB,0.599974,1,1
4153,Aerospace & Defense,TDG,0.575821,1,0
1372,Oil & Gas E&P,FANG,0.527459,1,1
1357,Oil & Gas E&P,DVN,0.484438,0,1
1170,Oil & Gas E&P,COP,0.477948,0,1
3444,Oil & Gas E&P,PXD,0.468074,0,1
1249,Oil & Gas E&P,CTRA,0.428168,0,1
450,Telecom Services,T,0.350418,0,1
3484,Utilities - Regulated Electric,PPL,0.32086,0,0


roc_auc_score: 
 0.625
classification report: 
               precision    recall  f1-score   support

           0       0.17      0.50      0.25         2
           1       0.75      0.38      0.50         8

    accuracy                           0.40        10
   macro avg       0.46      0.44      0.38        10
weighted avg       0.63      0.40      0.45        10

