# Predicting OSEBX returns using Random Forest


### Overview


In [26]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
print(sklearn.__version__)

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score
from sklearn import metrics

0.23.2


### Indices

In [73]:
indices = pd.read_excel('MASTER_A.xlsx',
                        sheet_name = 'Indices2',
                        usecols=['Date', 'OSEBX', 'S&P 500', 'NYSE', 'FTSE 100',
                                 'MSCI CANADA','MSCI AUSTRALIA', 'CAC 40',
                                 'HANG SENG', 'TOPIX',
                                 ],
                        index_col = 'Date')
indices = indices[1:]
indices_returns = indices.pct_change()[1:]
indices_returns.index = indices_returns.index.astype(str) 
indices_returns.head()

Unnamed: 0_level_0,OSEBX,S&P 500,NYSE,FTSE 100,MSCI CANADA,MSCI AUSTRALIA,CAC 40,HANG SENG,TOPIX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-03-02,0.001704,-0.003205,-0.006075,-0.003392,-0.00705,0.003848,0.000411,0.008203,0.007553
2012-03-05,-0.013377,-0.003813,-0.004118,-0.006144,-0.009166,-0.000987,-0.003894,-0.013772,-0.005917
2012-03-06,-0.029738,-0.015341,-0.02112,-0.018556,-0.018658,-0.013369,-0.035836,-0.021587,-0.006616
2012-03-07,0.01244,0.007205,0.007798,0.00637,0.002698,-0.013842,0.008853,-0.008578,-0.005588
2012-03-08,0.018467,0.009915,0.012935,0.011797,0.009545,0.006234,0.025361,0.013232,0.016347


### Factors

In [74]:
factor_returns = pd.read_excel('FACTOR_RETURNS.xlsx',
                               index_col='Date')
factor_returns.index = factor_returns.index.astype(str)
factor_returns = factor_returns[['WTI Crude Oil', 'Aluminium',
                                 'Natural Gas - RFV', '10Y NO',
                                 '10Y US', '10Y UK']]
factor_returns.head()

Unnamed: 0_level_0,WTI Crude Oil,Aluminium,Natural Gas - RFV,10Y NO,10Y US,10Y UK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-03-02,-0.018941,-0.011461,0.025806,0.0425,-0.4335,-0.5525
2012-03-05,0.000187,-0.018375,0.002096,-0.051,0.238,0.2125
2012-03-06,-0.018928,-0.023064,-0.014644,-0.0255,-0.4845,-0.3825
2012-03-07,0.013945,-0.012203,0.006369,-0.017,0.187,-0.0085
2012-03-08,0.003956,-0.001386,0.021097,0.272,0.374,0.119


### Concatenating

In [34]:
data = pd.concat([indices_returns, factor_returns], axis=1)
assert data.isnull().sum().sum() == 0, "nul values present in data"
data.reset_index(drop=True, inplace=True)
data = data.astype(float)
data.head()

Unnamed: 0,OSEBX,S&P 500,NYSE,FTSE 100,MSCI CANADA,MSCI AUSTRALIA,CAC 40,HANG SENG,TOPIX,WTI Crude Oil,Aluminium,Natural Gas - RFV,10Y NO,10Y US,10Y UK
0,0.001704,-0.003205,-0.006075,-0.003392,-0.00705,0.003848,0.000411,0.008203,0.007553,-0.018941,-0.011461,0.025806,0.0425,-0.4335,-0.5525
1,-0.013377,-0.003813,-0.004118,-0.006144,-0.009166,-0.000987,-0.003894,-0.013772,-0.005917,0.000187,-0.018375,0.002096,-0.051,0.238,0.2125
2,-0.029738,-0.015341,-0.02112,-0.018556,-0.018658,-0.013369,-0.035836,-0.021587,-0.006616,-0.018928,-0.023064,-0.014644,-0.0255,-0.4845,-0.3825
3,0.01244,0.007205,0.007798,0.00637,0.002698,-0.013842,0.008853,-0.008578,-0.005588,0.013945,-0.012203,0.006369,-0.017,0.187,-0.0085
4,0.018467,0.009915,0.012935,0.011797,0.009545,0.006234,0.025361,0.013232,0.016347,0.003956,-0.001386,0.021097,0.272,0.374,0.119


### Preparing data

In [46]:
X = data.iloc[:, 1:]
y = data['OSEBX']

y = y.map(lambda x : 'Up' if x > 0 else 'down') # categoriacal y

print(X.shape)
print(y.shape)

(2608, 14)
(2608,)


In [47]:
y

0         Up
1       down
2       down
3         Up
4         Up
        ... 
2603      Up
2604    down
2605      Up
2606      Up
2607      Up
Name: OSEBX, Length: 2608, dtype: object

#### Splitting

In [63]:
train_split = int(0.7 * int(data.shape[0]))
print(f'train split {train_split}')

X_train = X.iloc[:train_split]
y_train = y.iloc[:train_split]

X_test = X.iloc[:train_split][:5]
y_test = y.iloc[:train_split][:5]

X_train_sc = (X_train - X_train.mean()) / X_train.std()
X_test_sc = (X_test - X_train.mean()) / X_train.std()

train split 1825


In [64]:
### Checking dimensions -> appropriate shape

print(f'Shape of X_train scaled: {X_train_sc.shape}')
print(f'Shape of X_test scaled: {X_test_sc.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of y_test: {y_test.shape}')


Shape of X_train scaled: (1825, 14)
Shape of X_test scaled: (5, 14)
Shape of y_train: (1825,)
Shape of y_test: (5,)


### Assembling the model - fitting

In [65]:
y_train

0         Up
1       down
2       down
3         Up
4         Up
        ... 
1820      Up
1821      Up
1822    down
1823    down
1824    down
Name: OSEBX, Length: 1825, dtype: object

In [66]:
model = RandomForestClassifier(
  n_estimators = 200,
  bootstrap    = True,
  max_features = 'sqrt'
)

model.fit(X_train_sc, y_train)
    
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Predictions

In [67]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [68]:
rf_pred = model.predict(X_test)
rf_prob = model.predict_proba(X_test)[:, 1]

# roc_value_pred = roc_auc_score(y_test, rf_pred)
# roc_value_prob = roc_auc_score(y_test, rf_prob)

# print(f'Accuracy Score: {accuracy_score(y_test, rf_pred):>5.2f}')
# print(f'ROC-value: {roc_value_pred:>12.2%}')
# print(f'MAE is : {metrics.mean_absolute_error(y_test, rf_pred):>13.3f}')
# print(f'RMSE is : {np.sqrt(metrics.mean_squared_error(y_test, rf_pred)):>12.3f}')
# print(f'R2 is : {metrics.r2_score(y_test, rf_pred):>14.3f}',end='\n\n')
rf_pred

array(['down', 'down', 'down', 'down', 'down'], dtype=object)

In [72]:
model.predict(X_test)

array(['down', 'down', 'down', 'down', 'down'], dtype=object)

In [71]:
y_test

0      Up
1    down
2    down
3      Up
4      Up
Name: OSEBX, dtype: object

#### Confusion matrix

In [637]:
confusion_matrix(y_test, rf_pred)

array([[558,   4],
       [101,  12]], dtype=int64)

#### View the classification report for test data and predictions

In [638]:
print(classification_report(y_test, rf_pred))

              precision    recall  f1-score   support

           0       0.85      0.99      0.91       562
           1       0.75      0.11      0.19       113

    accuracy                           0.84       675
   macro avg       0.80      0.55      0.55       675
weighted avg       0.83      0.84      0.79       675



### Feature Importance

In [619]:
cm = sns.light_palette("blue", as_cmap=True)

FI = pd.DataFrame({'feature': list(X_train.columns),
                   'importance': model.feature_importances_}).\
                    sort_values('importance', ascending = False)\
                    .style.format({'importance': '{:.1%}'})\
                    .hide_index()\
                    .background_gradient()
FI

feature,importance
z_score,16.8%
Crude Oil WTI,11.6%
forex,11.4%
LME - Aluminium,11.0%
Baltic Dry (BDI),10.9%
Gold,10.1%
Brent Spot Europe,9.8%
Crude Oil Europe,9.3%
RFV Natural Gas1,9.0%


In [621]:
rf_prob.max()

0.845

## NOTES
* Hard to predict "Good Points" with a simple Random Forest model
    * The confusion matrix displays this well
* Lundin & Equinor most promising pair so far.
* Excluding the z-score does not give better results than 50% (equals random guessing)

Is there really any valid reason for developing the "Good points", that corresponds to the goal goal of the thesis?
* Foreigners on Oslo Børs