In [59]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,balanced_accuracy_score
from sklearn.metrics import roc_auc_score, log_loss

import pandas as pd
import numpy as np

from IPython.core.display import HTML


HTML("""
<style>
.container { width:100% !important; }
</style>
""")


### Drawbacks of Decision Trees
* Too small => Underfitting/high bias
* To large  => Overfitting/high variance.

### Some background info

##### Bootstrap Technique (Efron et al)

* Estimation Tool: Uses sampling with replacement to estimate population statistics (e.g., mean, variance) from a sample.
* Assumptions: Assumes data are independent and identically distributed (IID), the sample is representative, and sufficiently large.
* Purpose: Measures the variability of a statistic across multiple resamples to estimate its accuracy.

##### Bagging (Bootstrap Aggregating)
* Variance Reduction: An ensemble method that trains multiple models on bootstrap samples to reduce prediction variance, particularly effective for high-variance, overfit-prone models like decision trees.
* Assumptions: Assumes base models have high variance and perform better than random guessing. Aims to reduce overfitting by averaging model predictions.
* Outcome: Produces a more robust model by combining the strengths of multiple learners, leading to improved accuracy and stability.

Both bootstrap and bagging leverage resampling to achieve their goals—bootstrap for statistical estimation and bagging for enhancing model performance by reducing variance.








### Random Forest (By Breiman et Al)
* For dataset with $N$ examples and $F$ features

* For $i \in \text{NUM_TREES}$ 
    - Bootstrap Sample: Sample with replacement $N$ examples.
    - Random Subspace Method: Sample without replacement a set of $f<F$ features.
    - Fit Decision Tree on the sample on bootstrap sample using the $f$ selected features
* Take an average of the predictions for regression trees or probabilities for classifiaction trees. 
* For classification trees if we make classifications based on a "threshold" if threshold is $\frac{1}{\text{NUM_CLASSES}}$ then this is majority vote.

##### Random Forest Benfits:

* Lower Variance of Individual Trees => Lower Error do to Bias-Variance Decomposition
* Absuredely Parallel since trees are independent (in sckit-learn n_jobs=-1)

##### Resources

###### Books/Papers
* The Elements of Statistical Learning: Jerome H. Friedman, Robert Tibshirani, and Trevor Hastie


###### Sckit-Learn
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
* https://scikit-learn.org/stable/modules/classes.html
    

### Walk through!

In [60]:
NUM_TREES = 10
N = 10
F = 10
f = 3
for i in range(1,NUM_TREES+1):
    bootstrap_sample = np.random.choice(N,N,replace=True).tolist()
    random_features = np.random.choice(F,f,replace=False)
    print("Fit tree","{:2}".format(i),"with using training examples",bootstrap_sample,"using features",random_features)
print("* Average the tree's prediction to get final predictions")
print("    - For classification you average to probabilities and can make a classification based on the max")

Fit tree  1 with using training examples [5, 2, 8, 2, 5, 2, 1, 0, 0, 0] using features [7 9 6]
Fit tree  2 with using training examples [0, 6, 7, 5, 3, 4, 4, 7, 3, 3] using features [3 6 5]
Fit tree  3 with using training examples [6, 5, 2, 4, 5, 7, 2, 6, 5, 3] using features [6 3 8]
Fit tree  4 with using training examples [3, 5, 2, 2, 3, 3, 5, 4, 0, 7] using features [0 2 8]
Fit tree  5 with using training examples [2, 9, 7, 8, 0, 2, 5, 7, 7, 3] using features [4 1 2]
Fit tree  6 with using training examples [3, 8, 2, 5, 3, 0, 2, 2, 0, 5] using features [4 7 1]
Fit tree  7 with using training examples [6, 2, 9, 4, 5, 4, 1, 8, 5, 1] using features [6 7 4]
Fit tree  8 with using training examples [5, 2, 0, 0, 8, 8, 5, 2, 7, 7] using features [1 5 6]
Fit tree  9 with using training examples [6, 0, 7, 3, 1, 2, 2, 6, 9, 3] using features [4 3 5]
Fit tree 10 with using training examples [8, 8, 2, 1, 6, 7, 2, 2, 8, 6] using features [0 9 2]
* Average the tree's prediction to get final predi

* Notice not all training examples are used in every tree!

### For each example, the % of trees in which it is selected a $N \rightarrow \infty$ and $\text{NUM_TREEES}\rightarrow\infty$ goes to 
$$1-\frac{1}{e}$$ since the probability of not selecting example $i$ in a particular bootstrap sample is  is $\left(\frac{N-1}{N}\right)^N=\left(1-\frac{1}{N}\right)^N$.

* The limit of the above as $N\rightarrow \infty$ is $e$ by definition


In [61]:
NUM_TREES = 10000
N = NUM_TREES
cnt=np.zeros(N)
for i in range(NUM_TREES):
    bootstrap_sample = np.random.choice(N,N,replace=True).tolist()
    cnt[list(set(bootstrap_sample))]+=1
cnt/NUM_TREES

array([0.6324, 0.6363, 0.631 , ..., 0.6299, 0.6282, 0.627 ])

In [62]:
np.mean(np.abs((cnt/NUM_TREES-(1-1/np.exp(1)))/(1-1/np.exp(1))  ))

0.006070277658665454

### Out of Bag (OOB Predictions)
* Predictions made on the unselected examples

In [63]:
NUM_TREES = 10
N = NUM_TREES
F=10
f=3
for i in range(1,NUM_TREES+1):
    bootstrap_sample = np.random.choice(N,N).tolist()
    random_features = np.random.choice(F,f,replace=False)
    print("* Fit tree","{:2}".format(i),"with using training examples",bootstrap_sample,"using features",random_features)
    examples_not_selected = [i for i in range(N) if i not in bootstrap_sample]
    print("  Out of Bag Predictions using training examples",examples_not_selected)
    print()

* Fit tree  1 with using training examples [3, 6, 3, 5, 5, 8, 0, 0, 0, 7] using features [2 7 1]
  Out of Bag Predictions using training examples [1, 2, 4, 9]

* Fit tree  2 with using training examples [1, 2, 6, 2, 8, 0, 0, 2, 6, 6] using features [9 0 8]
  Out of Bag Predictions using training examples [3, 4, 5, 7, 9]

* Fit tree  3 with using training examples [8, 1, 2, 9, 2, 6, 0, 2, 6, 8] using features [4 9 0]
  Out of Bag Predictions using training examples [3, 4, 5, 7]

* Fit tree  4 with using training examples [9, 3, 2, 4, 1, 1, 2, 2, 1, 4] using features [1 7 3]
  Out of Bag Predictions using training examples [0, 5, 6, 7, 8]

* Fit tree  5 with using training examples [6, 7, 2, 0, 4, 6, 0, 4, 4, 3] using features [6 5 9]
  Out of Bag Predictions using training examples [1, 5, 8, 9]

* Fit tree  6 with using training examples [9, 1, 0, 2, 3, 6, 8, 5, 8, 6] using features [4 9 2]
  Out of Bag Predictions using training examples [4, 7]

* Fit tree  7 with using training exampl

### Clearly a huge problem with "correlated/overlapping" labels. 
* The tripple barrier labels are overlapping
* We must have a better way to deal with them since it screws up the effectiveness of the boostrap. We will talk about this in a future video.

### Sckit-Learn Classification Example
* Regression is pretty much the splitter/criterion + predict instead of predict_proba

### IMPORTANT PARAMS
* class_weight: dict, balanced, or balanced_subsample
* n_estimators
* criterion
* max_features
* max_depth
* min_samples_leaf
* min_samples_split
* min_weight_fraction_leaf
* min_weight_fraction_split
* max_samples (IMPORTANT FOR OVERLAP!)
* oob_score
* ramdom state
* n_jobs

### Load cusum filtered dataset with tripple barrier labels + sample weights from previous video


In [64]:
df = pd.read_csv("sample_data/tsla_dollar_bars_with_features_and_tripple_barrier_labels.csv",index_col=0,parse_dates=[0])

#### Split to train/test

In [65]:
train=df.loc[:'2022-12-31']
test=df.loc['2023-01-02':]

### Define RF for Classification

In [66]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=500,max_depth=9,class_weight='balanced_subsample',oob_score=True,max_features=0.1,min_samples_split=10)

###### When fitting the model you can specify sample_weight 

In [67]:
feature_cols=['RSI_14',
 'RSI_840',
 'RSI_70',
 'PPO_12_26_9',
 'PPO_40_200_12',
 'PPOh_40_200_12',
 'PPOs_40_200_12',
 'LOGRET_840',
 'LOGRET_1',
 'LOGRET_2',
 'LOGRET_3',
 'AROOND_14',
 'AROONU_14',
 'AROONOSC_14',
 'BOP',
 'MFI_14',
 'average_to_close',
 'serial_correlation_50_1']

#### Fit the model

In [68]:
clf.fit(train[feature_cols],train['label'],sample_weight=train['SAMPLE_WEIGHT'])

### Can get individual estimators

In [69]:
clf.estimators_[10]

### Get Params

In [70]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced_subsample',
 'criterion': 'gini',
 'max_depth': 9,
 'max_features': 0.1,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': None,
 'oob_score': True,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Can get feature importance

In [71]:
clf.feature_importances_

array([0.05593777, 0.07000343, 0.08823762, 0.05938594, 0.10309632,
       0.06477602, 0.09694173, 0.06280026, 0.0392871 , 0.05318268,
       0.05595548, 0.0228365 , 0.02545203, 0.03363586, 0.03909908,
       0.04561419, 0.0428354 , 0.04092259])

In [72]:
pd.DataFrame({'Name':clf.feature_names_in_,'Score':clf.feature_importances_}).sort_values('Score',ascending=False)

Unnamed: 0,Name,Score
4,PPO_40_200_12,0.103096
6,PPOs_40_200_12,0.096942
2,RSI_70,0.088238
1,RSI_840,0.070003
5,PPOh_40_200_12,0.064776
7,LOGRET_840,0.0628
3,PPO_12_26_9,0.059386
10,LOGRET_3,0.055955
0,RSI_14,0.055938
9,LOGRET_2,0.053183


### Can get OOB predictions
* Need o set oob_score = True

In [73]:
clf.oob_score_

0.5849353923731484

In [74]:
oob_probs=clf.oob_decision_function_[:,1]
oob_preds = np.where(oob_probs>=0.5,1,-1)

In [75]:
confusion_matrix(train['label'],oob_preds)

array([[3791, 1088],
       [2863, 1777]], dtype=int64)

In [76]:
log_loss(train['label'],oob_probs)

0.6703609066143639

In [77]:
roc_auc_score(train['label'],oob_preds)

0.579988811125796

###### TOO OPTIMISTIC DO TO OVERLAPPING LABELS ! USELESS IN THIS CASE!

#### Let's look at results on test set 

In [78]:
tests_preds = clf.predict(test[feature_cols])
tests_preds

array([ 1.,  1.,  1., ..., -1., -1., -1.])

In [79]:
test_probs = clf.predict_proba(test[feature_cols])[:,1]
test_probs

array([0.52265695, 0.55942637, 0.51246789, ..., 0.4407655 , 0.46101783,
       0.47021667])

In [80]:
accuracy_score(test['label'],tests_preds)

0.48661899897854954

In [81]:
confusion_matrix(test['label'],tests_preds)

array([[1594,  703],
       [1810,  788]], dtype=int64)

In [82]:
log_loss(test['label'],test_probs)

0.7037969051546367

In [83]:
roc_auc_score(test['label'],test_probs)

0.4938978880308117

#### Test set accuracy and confusion matrix way lower...

### Using "BaggingClassifier" Class
* need to define the base_estimator

In [84]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
 
base_estimator = DecisionTreeClassifier(max_depth=None, random_state=42, splitter='random')

# Create a BaggingClassifier with the decision tree as the base estimator
random_forest_via_bagging = BaggingClassifier(
    base_estimator=base_estimator,
    n_estimators=100, # Number of trees
    max_samples=0.8, # Fraction of samples to train each base estimator
    max_features=0.8, # Fraction of features to draw from the total features for training each base estimator
    bootstrap=True, # Samples are drawn with replacement (setting to False would be for Pasting)
    bootstrap_features=False, # Features are not drawn with replacement
    random_state=42
)


##### Can do Regression with RandomForestRegressor and BaggingRegressor for example to predict Simple Retuns $\frac{S_{t+1}-S_t}{S_t}$

In [85]:
df = pd.read_csv('sample_data/tsla_returns_prediction',index_col=0,parse_dates=['date'])
 
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,LABEL,PREV_RETURN,RSI_14,RSI_840,RSI_70,...,PVR,PVT,CHOP_14_1_100,RSX_14,RVGI_14_4,RVGIs_14_4,UO_7_14_28,TSI_13_25_13,TSIs_13_25_13,serial_correlation_50_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-02-03 12:55:00,286.4500,286.8633,285.8833,286.1267,459699.0,0.000769,-0.001443,38.589439,51.500601,49.275659,...,4.0,8.803861e+07,46.669668,43.677265,-0.034304,0.104102,29.863009,-3.577866,-2.195433,-0.020455
2021-02-03 13:00:00,286.1133,286.9533,286.1033,286.3467,255252.0,0.000012,0.000769,40.865114,51.518911,49.560990,...,2.0,8.805823e+07,45.906915,36.725154,-0.128016,0.019149,28.282276,-5.574056,-2.678093,-0.017953
2021-02-03 13:05:00,286.3000,286.6233,286.0333,286.3500,132573.0,0.000408,0.000012,40.900491,51.519186,49.565307,...,2.0,8.805838e+07,44.408598,31.507992,-0.184191,-0.071943,30.175838,-7.219708,-3.326895,-0.015336
2021-02-03 13:10:00,286.3633,286.7033,286.3433,286.4667,115692.0,-0.000477,0.000407,42.217004,51.528917,49.719704,...,2.0,8.806310e+07,42.641712,28.070207,-0.213231,-0.145325,29.110498,-8.288995,-4.035767,-0.018006
2021-02-03 13:15:00,286.5000,286.7267,286.0033,286.3300,135681.0,-0.000570,-0.000477,41.063086,51.516790,49.539482,...,3.0,8.805662e+07,43.298426,25.518553,-0.226796,-0.191609,32.049469,-9.509211,-4.817687,-0.031670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-22 15:30:00,233.0800,233.4300,233.0100,233.3100,691981.0,0.000600,0.000986,51.925258,50.569267,46.142918,...,1.0,9.422860e+08,54.969588,49.854528,-0.073070,-0.115703,60.141164,-4.119316,-5.710006,-0.179047
2023-11-22 15:35:00,233.3200,233.4900,233.2100,233.4500,600678.0,0.001028,0.000600,53.352797,50.586787,46.412396,...,2.0,9.423221e+08,53.069949,52.551986,-0.025758,-0.088811,59.429339,-2.570393,-5.261490,-0.181043
2023-11-22 15:40:00,233.4600,233.7000,233.2700,233.6900,973646.0,0.001455,0.001028,55.777095,50.616828,46.874686,...,1.0,9.424222e+08,48.674596,55.741677,0.025860,-0.047715,65.324600,-0.558022,-4.589566,-0.177164
2023-11-22 15:45:00,233.6900,234.1500,233.5800,234.0300,994516.0,-0.000342,0.001454,59.025889,50.659374,47.525290,...,1.0,9.425669e+08,39.471512,59.718936,0.105390,0.005421,71.625497,2.135027,-3.628910,-0.171414


##### Define Feature Columns

In [86]:
feature_cols=['RSI_14',
 'RSI_840',
 'RSI_70',
 'PPO_12_26_9',
 'PPO_40_200_12',
 'PPOh_40_200_12',
 'PPOs_40_200_12',
 'LOGRET_840',
 'LOGRET_1',
 'LOGRET_2',
 'LOGRET_3',
 'AROOND_14',
 'AROONU_14',
 'AROONOSC_14',
 'BOP',
 'MFI_14',
 'serial_correlation_50_1']

##### Split into training/test

In [87]:
train=df.loc[:'2022-12-31']
test=df.loc['2023-01-02':]

In [88]:
from sklearn.ensemble import RandomForestRegressor
reg = RandomForestRegressor(n_estimators=100,max_depth=5,criterion='absolute_error',oob_score=True,n_jobs=-1,max_features=0.1)
reg = reg.fit(train[feature_cols], train['LABEL'])
