In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.datasets import make_classification

from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Read the dataset from "data.csv" (path is relative to the working directory)
# into a DataFrame and preview its first five rows.
file_path = Path("data.csv")
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(n=5)

Unnamed: 0,date,totalArticles,daily_return,volume_change,spread,joy,fear,sadness,tentative,analytical,confident,anger,crossover_signal,vol_trend_signal,bollinger_signal,close,positive_return
0,2020-10-31,5.0,0.007717,0.165422,0.041145,0.0,0.0,0.0,0.668336,0.0,0.0,0.0,1.0,-1.0,-1.0,13567.0,1.0
1,2020-11-01,1.0,0.017845,-0.183336,0.048242,0.0,0.0,0.576203,0.0,0.0,0.0,0.0,1.0,1.0,-1.0,13809.1,0.0
2,2020-11-02,1.0,-0.003375,-0.564168,0.019277,0.0,0.0,0.0,0.672523,0.0,0.0,0.0,1.0,1.0,-1.0,13762.5,0.0
3,2020-11-03,2.0,-0.014292,1.237562,0.045173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.0,13565.8,1.0
4,2020-11-04,1.0,0.033614,-0.021746,0.055851,0.0,0.0,0.0,0.0,0.822162,0.0,0.0,1.0,-1.0,-1.0,14021.8,1.0


In [5]:
# Move the "date" column into the row index so it is not carried along as a
# feature column.
# The guard keeps this cell idempotent: after the first run "date" is no longer
# a column, so an unconditional set_index("date") raises KeyError when the cell
# is executed again on the same kernel.
if "date" in data.columns:
    data = data.set_index("date")
elif data.index.name != "date":
    # Neither a column nor the current index: fail the same way the
    # unguarded call would have.
    raise KeyError("date")
data.head()

Unnamed: 0_level_0,totalArticles,daily_return,volume_change,spread,joy,fear,sadness,tentative,analytical,confident,anger,crossover_signal,vol_trend_signal,bollinger_signal,close,positive_return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-10-31,5.0,0.007717,0.165422,0.041145,0.0,0.0,0.0,0.668336,0.0,0.0,0.0,1.0,-1.0,-1.0,13567.0,1.0
2020-11-01,1.0,0.017845,-0.183336,0.048242,0.0,0.0,0.576203,0.0,0.0,0.0,0.0,1.0,1.0,-1.0,13809.1,0.0
2020-11-02,1.0,-0.003375,-0.564168,0.019277,0.0,0.0,0.0,0.672523,0.0,0.0,0.0,1.0,1.0,-1.0,13762.5,0.0
2020-11-03,2.0,-0.014292,1.237562,0.045173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.0,13565.8,1.0
2020-11-04,1.0,0.033614,-0.021746,0.055851,0.0,0.0,0.0,0.0,0.822162,0.0,0.0,1.0,-1.0,-1.0,14021.8,1.0


In [7]:
# Target: the "positive_return" column.
y = data.loc[:, "positive_return"]

# Features: every remaining column of the frame.
X = data.drop(columns=["positive_return"])

In [8]:
from sklearn.model_selection import train_test_split

# Split features and target into train / test partitions. No test_size is
# given, so the library default applies; stratify=y keeps the class ratio of
# the target the same in both partitions and random_state fixes the shuffle.
# NOTE(review): the rows are indexed by date and a stratified split shuffles
# them, so test days can fall before training days — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
                                                   

In [9]:
# Standardising scaler; centring and unit-variance scaling are the library
# defaults and are spelled out here only for readability.
scaler = StandardScaler(with_mean=True, with_std=True)

In [10]:
# Fit the scaler on the training partition only, so no statistics from the
# test rows leak into the transformation.
scaler.fit(X_train)
X_scaler = scaler  # fit() returns the estimator itself; keep the alias used below

In [11]:
# Apply the scaling learned from the training partition to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [12]:
# Balanced random forest with 100 trees and a fixed seed, trained on the
# scaled training features.
# NOTE(review): some BalancedRandomForestClassifier defaults appear to differ
# between imbalanced-learn releases — confirm the installed version or pin the
# relevant parameters if results must be reproducible.
brf = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
)
brf_model = brf.fit(X_train_scaled, y_train)

In [13]:
# Predict labels for the held-out rows and report plain accuracy
# (the fraction of test rows labelled correctly) as a percentage.
predictions_brf = brf_model.predict(X_test_scaled)
acc_score_brf = accuracy_score(y_true=y_test, y_pred=predictions_brf)
print(f"Accuracy Score : {acc_score_brf *100}%")

Accuracy Score : 60.86956521739131%


In [14]:
# Confusion matrix of true vs. predicted labels on the test partition,
# wrapped in a DataFrame so rows (actual) and columns (predicted) are labelled.
# The hard-coded labels assume exactly two classes are present.
cm_brf = confusion_matrix(y_true=y_test, y_pred=predictions_brf)
cm_df_brf = pd.DataFrame(
    data=cm_brf,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

print("Confusion Matrix -->")
display(cm_df_brf)

Confusion Matrix -->


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4,5
Actual 1,4,10


In [15]:
# Per-class precision / recall / F1 for the test-set predictions, printed
# under a heading line.
print(
    "Classification Report -->",
    classification_report(y_true=y_test, y_pred=predictions_brf),
    sep="\n",
)

Classification Report -->
              precision    recall  f1-score   support

         0.0       0.50      0.44      0.47         9
         1.0       0.67      0.71      0.69        14

    accuracy                           0.61        23
   macro avg       0.58      0.58      0.58        23
weighted avg       0.60      0.61      0.60        23



In [16]:
# Pair each feature importance from the fitted forest with its column name and
# list the pairs from most to least important (ties fall back to comparing the
# column names, also in descending order).
importances = brf_model.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

[(0.14854834805611974, 'volume_change'),
 (0.1398797669383557, 'daily_return'),
 (0.1386346838803821, 'totalArticles'),
 (0.13827528578993875, 'spread'),
 (0.12024555909406914, 'close'),
 (0.08264820222051127, 'tentative'),
 (0.07739767439769599, 'joy'),
 (0.06010070361345401, 'sadness'),
 (0.043338356905703306, 'analytical'),
 (0.02499361161231822, 'bollinger_signal'),
 (0.018192923716115257, 'vol_trend_signal'),
 (0.007744883775336504, 'fear'),
 (0.0, 'crossover_signal'),
 (0.0, 'confident'),
 (0.0, 'anger')]