## Import Libraries

In [4]:
import numpy as np 
import pandas as pd

import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import scale, StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, mean_squared_error, r2_score, roc_auc_score, roc_curve

#import shap
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

from IPython.display import Image
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
%matplotlib inline
import torch
import os
import matplotlib.pyplot as plt



## Load Dataset

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data_folder = os.path.abspath('./local_data')
embeddings_names = ["all-MiniLM-L6-v2","bge-large-en-v1.5","finBERT"]
model_name = "all-MiniLM-L6-v2"
model_name_to_png = "bge-large-en-v1"
model_name_parquet = 'embeded_final_' + model_name + '.parquet'
file_name = os.path.join(data_folder, 'data_for_supervised', model_name_parquet)
data = pd.read_parquet(file_name)
data['movement_label'] = data['movement_label'].fillna(2)
len_x = len(data.columns) - 1
X = np.vstack(data.values[:,1:len_x] ).astype(np.float32)
y = data["movement_label"].astype(int).values
X_train, X_temp, y_train, y_temp = train_test_split(
X, y, test_size=0.30, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)
# ====== 3) נרמול/סטנדרטיזציה (למרות ש-embeddings לרוב מנורמלים) ======
scaler = StandardScaler(with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)

# Logistic Regresssion model
Log_model = LogisticRegression()
Log_model.fit(X_train, y_train)

Log_y_preds = Log_model.predict(X_test)

Log_acc = accuracy_score(y_test, Log_y_preds)
 

print(f"Accuracy: {Log_acc * 100:.3f}%")


Log_conf_mat = confusion_matrix(y_test, Log_y_preds)
plt.figure(figsize=(5, 3))
sns.heatmap(Log_conf_mat, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title(f'Confusion Matrix: {Log_acc:.3f}% Accuracy')
png_path = 'local_data/pictures/LogisticRegression-' + model_name_to_png
plt.savefig(png_path)

# KNN model
KNN_model = KNeighborsClassifier()
KNN_model.fit(X_train, y_train)

KNN_y_preds = KNN_model.predict(X_test)

KNN_acc = accuracy_score(y_test, KNN_y_preds)
  

print(f"Accuracy: {KNN_acc * 100:.3f}%")


KNN_conf_mat = confusion_matrix(y_test, KNN_y_preds)
plt.figure(figsize=(5, 3))
sns.heatmap(KNN_conf_mat, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title(f'Confusion Matrix: {KNN_acc:.3f}% Accuracy')
png_path = 'local_data/pictures/KNeighborsClassifier-' + model_name_to_png
plt.savefig(png_path)

# Decision Tree model

DT_model = DecisionTreeClassifier()
DT_model.fit(X_train, y_train)

DT_y_preds = DT_model.predict(X_test)

DT_acc = accuracy_score(y_test, DT_y_preds)
 

print(f"Accuracy: {DT_acc * 100:.3f}%")


DT_conf_mat = confusion_matrix(y_test, DT_y_preds)
plt.figure(figsize=(5, 3))
sns.heatmap(DT_conf_mat, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title(f'Confusion Matrix: {DT_acc:.3f}% Accuracy')
png_path = 'local_data/pictures/DecisionTreeClassifier-' + model_name_to_png
plt.savefig(png_path)

# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)

rf_y_preds = rf_model.predict(X_test)

rf_acc = accuracy_score(y_test, rf_y_preds)


print(f"Accuracy: {rf_acc * 100:.3f}%")


rf_conf_mat = confusion_matrix(y_test, rf_y_preds)
plt.figure(figsize=(5, 3))
sns.heatmap(rf_conf_mat, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title(f'Confusion Matrix: {rf_acc:.3f}% Accuracy')
png_path = 'local_data/pictures/RandomForestClassifier-' + model_name_to_png
plt.savefig(png_path)

# XGBoost model
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
xgb_y_preds = xgb_model.predict(X_test)

xgb_acc = accuracy_score(y_test, xgb_y_preds)


print(f"Accuracy: {xgb_acc * 100:.3f}%")


xgb_conf_mat = confusion_matrix(y_test, xgb_y_preds)
plt.figure(figsize=(5, 3))
sns.heatmap(xgb_conf_mat, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title(f'Confusion Matrix: {xgb_acc:.3f}% Accuracy')
png_path = 'local_data/pictures/XGBClassifier-' + model_name_to_png
plt.savefig(png_path)

# SVC model
svc = SVC(probability=True)
svc_model = SVC(C=10, gamma = 0.01, probability=True)
svc_model.fit(X_train, y_train)
svc_y_preds = svc_model.predict(X_test)

svc_acc = accuracy_score(y_test, svc_y_preds)


print(f"Accuracy: {svc_acc * 100:.3f}%")


svc_conf_mat = confusion_matrix(y_test, svc_y_preds)
plt.figure(figsize=(5, 3))
sns.heatmap(svc_conf_mat, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title(f'Confusion Matrix: {svc_acc:.3f}% Accuracy')
png_path = 'local_data/pictures/SVC-' + model_name_to_png
plt.savefig(png_path)

# Neural Network model

NN_model = MLPClassifier(random_state=42, max_iter=300).fit(X_train, y_train)
NN_y_preds = NN_model.predict(X_test)

NN_acc = accuracy_score(y_test, NN_y_preds)


print(f"Accuracy: {NN_acc * 100:.3f}%")


NN_conf_mat = confusion_matrix(y_test, NN_y_preds)
plt.figure(figsize=(5, 3))
sns.heatmap(NN_conf_mat, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title(f'Confusion Matrix: {NN_acc:.3f}% Accuracy')
png_path = 'local_data/pictures/MLPClassifier-' + model_name_to_png
plt.savefig(png_path)

# Model comparison

models_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'KNN', 'SVC', 'Decision Tree Classifier', 'Random Forest Classifier', 'XGBoost'],
    'Score': [100*round(Log_acc,4), 100*round(KNN_acc,4), 100*round(svc_acc,4), 100*round(DT_acc,4), 100*round(rf_acc,4),
              100*round(xgb_acc,4)]
})
models_df.sort_values(by = 'Score', ascending = False)
path = 'local_data/' + 'results-' + model_name_to_png
models_df.to_csv(path)




Accuracy: 69.805%
Accuracy: 65.666%
Accuracy: 55.925%
Accuracy: 71.266%
Accuracy: 70.860%
Accuracy: 64.448%
Accuracy: 62.906%


### Print the reports that predicted 1-UP but its 0-DOWN

In [12]:
models_df

Unnamed: 0,Model,Score
0,Logistic Regression,69.81
1,KNN,65.67
2,SVM,64.45
3,Decision Tree Classifier,55.93
4,Random Forest Classifier,71.27
5,XGBoost,70.86


In [None]:
#PRINT y_test WHERE THE VALUE IS 0 AND THE VALUE IN rf_y_preds in the same index is 1

cv can also get simple numeric value, for example: cv=5

Read more about DecisionTreeClassifier hyperparameters: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

## DT representation
Read more: https://mljar.com/blog/visualize-decision-tree/