# Evaluation

In [74]:
# imports

from src.utils import utils
from src.modelling import training as train
import configparser
from sklearn.metrics import roc_auc_score, classification_report
import numpy as np
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [78]:
# All paths below are built from the project root returned by the helper.
proj_root = utils.get_proj_root()

# interpolation=None disables value interpolation, so '%' in values stays literal.
config_path = proj_root.joinpath('config/data_config.ini')
config = configparser.ConfigParser(interpolation=None)
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail here rather than with an opaque KeyError
# on the first section lookup below.
if not config.read(config_path):
    raise FileNotFoundError(f'Could not read config file: {config_path}')

preprocessed_data_rel_path = config['data_paths']['preprocessed_data_path']
preprocessed_data_path = proj_root.joinpath(preprocessed_data_rel_path)

preprocessed_data = train.get_training_data(file_path=preprocessed_data_path)


model_output_dir = proj_root.joinpath(config['modelling_paths']['model_output'])
model_name = 'xgboost'
model_output_path = model_output_dir.joinpath(model_name + '.pkl')
# NOTE(review): the '.pkl' suffix suggests load_value unpickles this file --
# confirm, and only load model artifacts from a trusted location.
model = utils.load_value(model_output_path)
print(model)

Pipeline(steps=[('cat_to_ordinal_cols',
                 ColumnsOrdinalEncoder(col_names=['industry', 'symbol'])),
                ('select_optimal_cols',
                 OptimalColumnSelector(optimal_cols_path=PosixPath('/home/aroge/projects/dividend-cut-predictor/models/artifacts/optimal_features.pkl'))),
                ['model',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsa...
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None,
                               learning_rate=0.05845944217474756, max_bin=None,
                               max_cat_threshold=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=48,
                               max_leaves=None, min_child_weight=1, missing=nan,
                               monotone_constraints=None, multi_strategy=None,
     

In [79]:
# configparser returns strings, hence the int() conversion of the year.
final_year = int(config['year_limits']['end_year'])
label_col_name = 'dps_change_next_year'
# NOTE(review): train.train_test_split is the project helper from
# src.modelling.training, not scikit-learn's function of the same name;
# presumably it splits by year around final_year -- confirm.
training_data, testing_data = train.train_test_split(df=preprocessed_data, final_year=final_year)
X_test, y_test = train.split_Xy(testing_data, label_col_name=label_col_name)

# Column index 1 of predict_proba is used as the score for the positive class
# (assumes the model's classes are ordered [0, 1] -- TODO confirm).
y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)
# NOTE(review): score is computed here but never displayed by this cell.
score = roc_auc_score(y_true=y_test, y_score=y_pred_prob)


print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       450
           1       0.64      0.35      0.45        26

    accuracy                           0.95       476
   macro avg       0.80      0.67      0.71       476
weighted avg       0.95      0.95      0.95       476



In [86]:
def get_top_n_idx(a, n=20, order='desc'):
    """Return the indices of the ``n`` most extreme values of ``a``, sorted.

    Parameters
    ----------
    a : array-like
        One-dimensional collection of comparable values. It is converted with
        ``np.asarray``, so lists and pandas Series are indexed by position.
    n : int, default 20
        Number of indices to return. Values larger than ``len(a)`` are clamped
        to ``len(a)``; ``n <= 0`` yields an empty result.
    order : str, default 'desc'
        ``'asc'`` returns the indices of the ``n`` smallest values, smallest
        first. Any other value returns the indices of the ``n`` largest
        values, largest first.

    Returns
    -------
    numpy.ndarray
        Integer positions into ``a``. The relative order of tied values is
        not specified.
    """
    a = np.asarray(a)
    n = min(int(n), a.size)
    if n <= 0:
        # ``arr[-0:]`` is the whole array, so n == 0 has to be handled up
        # front instead of falling through to the slicing below.
        return np.empty(0, dtype=np.intp)

    if order == 'asc':
        # kth = n - 1 stays in bounds even when n == len(a).
        idx_top_n = np.argpartition(a, n - 1)[:n]
        idx_top_n = idx_top_n[np.argsort(a[idx_top_n])]
    else:
        idx_top_n = np.argpartition(a, -n)[-n:]
        idx_top_n = idx_top_n[np.argsort(a[idx_top_n])][::-1]

    return idx_top_n

# Number of highest-scored test rows to inspect.
n = 20
top_n_pred_idxs = get_top_n_idx(y_pred_prob, n=n, order='desc')

# Restrict scores, hard predictions and true labels to those rows. The indices
# are positions, hence .values on y_test before indexing.
y_pred_prob_top_n = y_pred_prob[top_n_pred_idxs]
y_pred_top_n = y_pred[top_n_pred_idxs]
y_true = y_test.values[top_n_pred_idxs]

print(y_pred_prob_top_n)
print(y_pred_top_n)
print(y_true)

# ROC AUC is not defined when the selected labels contain a single class
# (scikit-learn raises or warns, depending on the version), which can easily
# happen for a small top-n slice -- report that instead of failing the cell.
if np.unique(y_true).size > 1:
    print(roc_auc_score(y_true=y_true, y_score=y_pred_prob_top_n))
else:
    print('ROC AUC is undefined: only one class present in the top-n labels')

print(classification_report(y_true=y_true, y_pred=y_pred_top_n))


[0.9656353  0.9648743  0.9242086  0.91976327 0.91603214 0.9047447
 0.88463426 0.82548195 0.8188528  0.76311654 0.7497086  0.7311009
 0.640243   0.55834967 0.49577546 0.48457736 0.45673865 0.43656778
 0.3966486  0.3906465 ]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0]
0.8400000000000001
              precision    recall  f1-score   support

           0       0.83      0.50      0.62        10
           1       0.64      0.90      0.75        10

    accuracy                           0.70        20
   macro avg       0.74      0.70      0.69        20
weighted avg       0.74      0.70      0.69        20



In [84]:
# Replace X_test's index with a fresh default one (in place, old labels dropped).
X_test.reset_index(drop=True, inplace=True)

# Select the top-n rows and the columns at positions 1 and 2 in one step, then
# print one row per line in ranking order. In the recorded output these two
# columns hold the industry and the ticker symbol.
top_n_rows = X_test.iloc[top_n_pred_idxs, [1, 2]]
for row_values in top_n_rows.values:
    print(row_values)

['Aerospace & Defense' 'TDG']
['Oil & Gas E&P' 'PXD']
['Oil & Gas E&P' 'COP']
['Oil & Gas E&P' 'EOG']
['Oil & Gas E&P' 'DVN']
['Specialty Chemicals' 'LYB']
['Oil & Gas E&P' 'FANG']
['Oil & Gas E&P' 'CTRA']
['Diagnostics & Research' 'LH']
['REIT - Industrial' 'PSA']
['Asset Management' 'BX']
['Luxury Goods' 'TPR']
['Packaging & Containers' 'IP']
['Farm & Heavy Construction Machinery' 'PCAR']
['Capital Markets' 'RJF']
['Capital Markets' 'GS']
['Beverages - Brewers' 'TAP']
['Financial Data & Stock Exchanges' 'CME']
['REIT - Residential' 'ESS']
['Oil & Gas Midstream' 'TRGP']


In [64]:
# NOTE(review): this cell has the same code as the cell executed as In [84];
# the output recorded beneath it comes from an earlier execution (In [64]).
# Replace X_test's index with a fresh default one (in place, old labels dropped).
X_test.reset_index(drop=True, inplace=True)

# Select the top-n rows and the columns at positions 1 and 2 in one step, then
# print one row per line in ranking order.
top_n_rows = X_test.iloc[top_n_pred_idxs, [1, 2]]
for row_values in top_n_rows.values:
    print(row_values)

['Insurance—Property & Casualty' 'PGR']
['Beverages—Wineries & Distilleries' 'BF-B']
['Restaurants' 'DRI']
['Oil & Gas E&P' 'PXD']
['Credit Services' 'COF']
['Oil & Gas E&P' 'EOG']
['Asset Management' 'TROW']
['Gold' 'NEM']
['REIT—Retail' 'SPG']
['Healthcare Plans' 'CI']


In [65]:
from src.modelling import transforms

# Reload the preprocessed data and redo the train/test split.
preprocessed_data = train.get_training_data(file_path=preprocessed_data_path)
final_year = int(config['year_limits']['end_year'])

training_data, testing_data = train.train_test_split(df=preprocessed_data, final_year=final_year)


categorical_features = ['industry', 'symbol']
ord_col_transf = transforms.ColumnsOrdinalEncoder(col_names=categorical_features)
# A single fit_transform is enough: the previous separate fit() on the same
# frame was immediately repeated by fit_transform and has been dropped.
# NOTE(review): relies on ColumnsOrdinalEncoder following the scikit-learn
# fit_transform contract (it is used as a Pipeline step) -- confirm.
training_data_trans = ord_col_transf.fit_transform(training_data)
# Map the encoded columns back, to compare against the untouched frame.
training_data_inv = ord_col_transf.inverse_transform(training_data_trans)

training_data.head()

Unnamed: 0,year,industry,symbol,currentRatio,currentRatio_percentage_change,quickRatio,quickRatio_percentage_change,cashRatio,cashRatio_percentage_change,daysOfSalesOutstanding,...,dividendYield_percentage_change,enterpriseValueMultiple,enterpriseValueMultiple_percentage_change,priceFairValue,priceFairValue_percentage_change,interestRate,interestRate_percentage_change,adjDividend,dps_growth,dps_change_next_year
0,2012,Conglomerates,MMM,2.198387,-2.275946,1.385806,-0.090461,0.465,14.018251,49.567449,...,-5.500411,8.692139,5.900833,3.571431,-2.168468,0.14,40.0,2.36,7.272727,0
1,2013,Conglomerates,MMM,1.698186,-22.753086,1.01227,-26.95445,0.344225,-25.973091,50.284895,...,-28.717392,12.326051,41.806877,5.328531,49.198768,0.11,-21.428571,2.54,7.627119,0
2,2014,Conglomerates,MMM,1.961487,15.504836,1.127209,11.354593,0.316272,-8.120568,48.611609,...,14.836058,13.065263,5.997151,8.117223,52.335098,0.09,-18.181818,3.42,34.645669,0
3,2015,Conglomerates,MMM,1.543411,-21.314241,0.852768,-24.346986,0.252599,-20.132363,50.082909,...,30.819267,12.318266,-5.717433,8.022507,-1.166858,0.13,44.444444,4.1,19.883041,0
4,2016,Conglomerates,MMM,1.885512,22.16526,1.136839,33.311664,0.385593,52.650039,53.242552,...,-8.738086,13.479738,9.428861,10.440035,30.134325,0.4,207.692308,4.44,8.292683,0


In [50]:
import pandas as pd
def detect_categorical_columns(df, threshold=0.5):
    """List the object/categorical columns of ``df`` with few distinct values.

    A column qualifies when its dtype is ``object`` or categorical and the
    ratio ``nunique / number of rows`` is strictly below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    threshold : float, default 0.5
        Upper bound (exclusive) on the unique-value ratio.

    Returns
    -------
    list
        Qualifying column labels in the order they appear in ``df``; empty
        when ``df`` has no rows.
    """
    n_rows = len(df)
    if n_rows == 0:
        # No rows means no ratio to measure; this also avoids dividing by zero.
        return []

    categorical_columns = []

    for column in df.columns:
        dtype = df[column].dtype
        # The isinstance check replaces pd.api.types.is_categorical_dtype,
        # which is deprecated and emits a warning on recent pandas versions.
        if dtype == 'object' or isinstance(dtype, pd.CategoricalDtype):
            unique_ratio = df[column].nunique() / n_rows
            if unique_ratio < threshold:
                categorical_columns.append(column)

    return categorical_columns

# Run the detector on the training frame with the default threshold; the
# returned list is what the cell displays.
detect_categorical_columns(training_data)

  if df[column].dtype == 'object' or pd.api.types.is_categorical_dtype(df[column]):


['industry', 'symbol']

In [73]:
import yfinance as yf

# NOTE(review): the output recorded for this cell is an HTTPError (404 from the
# Yahoo quoteSummary endpoint), so the notebook does not run cleanly past here.
msft = yf.Ticker("MSFT")
# Bare expression: the value of .info is what the cell would display.
msft.info

HTTPError: 404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v6/finance/quoteSummary/MSFT?modules=financialData&modules=quoteType&modules=defaultKeyStatistics&modules=assetProfile&modules=summaryDetail&ssl=true

In [48]:
# drop() is called without inplace=True and its result is not assigned, so
# training_data itself is unchanged; the frame without 'year' is only shown.
# errors='ignore' means no error is raised if the column is absent.
training_data.drop(labels='year', axis=1, errors='ignore')

Unnamed: 0,industry,symbol,currentRatio,currentRatio_percentage_change,quickRatio,quickRatio_percentage_change,cashRatio,cashRatio_percentage_change,daysOfSalesOutstanding,daysOfSalesOutstanding_percentage_change,...,dividendYield_percentage_change,enterpriseValueMultiple,enterpriseValueMultiple_percentage_change,priceFairValue,priceFairValue_percentage_change,interestRate,interestRate_percentage_change,adjDividend,dps_growth,dps_change_next_year
0,Conglomerates,MMM,2.198387,-2.275946,1.385806,-0.090461,0.465000,14.018251,49.567449,3.987852,...,-5.500411,8.692139,5.900833,3.571431,-2.168468,0.14,40.000000,2.360,7.272727,0
1,Conglomerates,MMM,1.698186,-22.753086,1.012270,-26.954450,0.344225,-25.973091,50.284895,1.447414,...,-28.717392,12.326051,41.806877,5.328531,49.198768,0.11,-21.428571,2.540,7.627119,0
2,Conglomerates,MMM,1.961487,15.504836,1.127209,11.354593,0.316272,-8.120568,48.611609,-3.327613,...,14.836058,13.065263,5.997151,8.117223,52.335098,0.09,-18.181818,3.420,34.645669,0
3,Conglomerates,MMM,1.543411,-21.314241,0.852768,-24.346986,0.252599,-20.132363,50.082909,3.026645,...,30.819267,12.318266,-5.717433,8.022507,-1.166858,0.13,44.444444,4.100,19.883041,0
4,Conglomerates,MMM,1.885512,22.165260,1.136839,33.311664,0.385593,52.650039,53.242552,6.308824,...,-8.738086,13.479738,9.428861,10.440035,30.134325,0.40,207.692308,4.440,8.292683,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4561,Drug Manufacturers—Specialty & Generic,ZTS,3.034915,41.127507,1.468218,25.054856,0.650850,0.447550,68.176146,-5.013279,...,0.983753,20.402038,-22.551804,17.845073,-20.410212,0.40,207.692308,0.380,14.457831,0
4562,Drug Manufacturers—Specialty & Generic,ZTS,3.854662,27.010537,2.341865,59.503835,1.429616,119.653531,68.639533,0.679691,...,-17.616247,21.371653,4.752544,19.939939,11.739185,1.00,150.000000,0.420,10.526316,0
4563,Drug Manufacturers—Specialty & Generic,ZTS,3.596893,-6.687199,2.237939,-4.437713,1.309894,-8.374443,64.916738,-5.423689,...,0.754194,22.187030,3.815224,18.911308,-5.158646,1.83,83.000000,0.504,20.000000,0
4564,Drug Manufacturers—Specialty & Generic,ZTS,2.629014,-26.908738,1.671096,-25.328797,1.069767,-18.331736,63.321086,-2.457998,...,-15.617244,30.473334,37.347515,23.366520,23.558450,2.16,18.032787,0.656,30.158730,0


In [42]:
# First three rows of the encoder output; in the recorded output the industry
# and symbol columns appear as numeric codes.
training_data_trans.head(3)

Unnamed: 0,year,industry,symbol,currentRatio,currentRatio_percentage_change,quickRatio,quickRatio_percentage_change,cashRatio,cashRatio_percentage_change,daysOfSalesOutstanding,...,dividendYield_percentage_change,enterpriseValueMultiple,enterpriseValueMultiple_percentage_change,priceFairValue,priceFairValue_percentage_change,interestRate,interestRate_percentage_change,adjDividend,dps_growth,dps_change_next_year
0,2012,24.0,289.0,2.198387,-2.275946,1.385806,-0.090461,0.465,14.018251,49.567449,...,-5.500411,8.692139,5.900833,3.571431,-2.168468,0.14,40.0,2.36,7.272727,0
1,2013,24.0,289.0,1.698186,-22.753086,1.01227,-26.95445,0.344225,-25.973091,50.284895,...,-28.717392,12.326051,41.806877,5.328531,49.198768,0.11,-21.428571,2.54,7.627119,0
2,2014,24.0,289.0,1.961487,15.504836,1.127209,11.354593,0.316272,-8.120568,48.611609,...,14.836058,13.065263,5.997151,8.117223,52.335098,0.09,-18.181818,3.42,34.645669,0


In [43]:
# First two rows after inverse_transform; in the recorded output the industry
# and symbol columns show their original labels again.
training_data_inv.head(2)

Unnamed: 0,year,industry,symbol,currentRatio,currentRatio_percentage_change,quickRatio,quickRatio_percentage_change,cashRatio,cashRatio_percentage_change,daysOfSalesOutstanding,...,dividendYield_percentage_change,enterpriseValueMultiple,enterpriseValueMultiple_percentage_change,priceFairValue,priceFairValue_percentage_change,interestRate,interestRate_percentage_change,adjDividend,dps_growth,dps_change_next_year
0,2012,Conglomerates,MMM,2.198387,-2.275946,1.385806,-0.090461,0.465,14.018251,49.567449,...,-5.500411,8.692139,5.900833,3.571431,-2.168468,0.14,40.0,2.36,7.272727,0
1,2013,Conglomerates,MMM,1.698186,-22.753086,1.01227,-26.95445,0.344225,-25.973091,50.284895,...,-28.717392,12.326051,41.806877,5.328531,49.198768,0.11,-21.428571,2.54,7.627119,0
