<a href="https://colab.research.google.com/github/beatricebianchi3/FINTECH-ML---Driven-early-warning-system/blob/main/DecisionTree_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### DECISION TREE AND RANDOM FOREST

A decision tree for classification detects warnings (indicated by 1) by recursively splitting the data based on feature values to separate warnings from non-warnings (0), ultimately predicting the class at the leaf nodes. While simple and interpretable, decision trees can overfit the data. In contrast, a random forest, which is an ensemble of multiple decision trees, addresses this by training each tree on different subsets of the data and features, then aggregating their predictions through majority voting.



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Load the third sheet (sheet_name=2, zero-based) of the FinancialMarketData workbook
df = pd.read_excel('FinancialMarketData.xlsx', sheet_name=2)

# The first column is the target: 1 marks a market crash / warning, 0 otherwise
y = df.iloc[:, 0]

# Keep only the covariates: remove the first column (target) and the second (dates)
df = df.drop(columns=df.columns[:2])

df.head()


Unnamed: 0,XAU BGNL,ECSURPUS,BDIY,CRY,DXY,JPY,GBP,Cl1,VIX,USGG30YR,...,LP01TREU,EMUSTRUU,LF94TRUU,MXUS,MXEU,MXJP,MXBR,MXRU,MXIN,MXCN
0,283.25,0.077,1388,157.26,100.56,105.86,1.646,25.77,22.5,6.671,...,116.4635,230.5267,123.7616,1416.12,127.75,990.59,856.76,224.33,217.34,34.3
1,287.65,0.043,1405,165.01,101.86,105.47,1.6383,28.85,21.5,6.747,...,117.2674,231.377,123.7616,1428.79,129.5,993.98,925.22,234.37,227.08,32.74
2,287.15,0.135,1368,167.24,102.41,106.04,1.6496,28.28,23.02,6.634,...,117.9946,232.3895,123.7616,1385.93,126.48,974.83,886.93,216.82,233.0,32.46
3,282.75,0.191,1311,166.85,104.92,107.85,1.6106,28.22,23.45,6.423,...,120.51,231.9417,122.3281,1385.31,129.19,1007.12,842.6,201.89,237.48,31.29
4,298.4,0.312,1277,165.43,104.22,109.3,1.6108,28.02,21.25,6.231,...,118.7914,237.8117,122.3281,1411.95,134.67,1034.58,945.15,218.0,258.02,31.32


In [None]:
# Count the missing values in every feature column
nan_column = df.isna().sum()
print("Number of NaN in each column:")
print(nan_column)

# Drop the rows that contain NaN values. The target y was extracted before
# this step, so it is re-aligned to the surviving rows; otherwise features and
# labels would have different lengths as soon as a single row is dropped.
df = df.dropna()
y = y.loc[df.index]
# (the message below is Italian for "DataFrame without rows containing NaN")
print("\nDataFrame senza righe contenenti NaN:")
print(df)



Number of NaN in each column:
XAU BGNL     0
ECSURPUS     0
BDIY         0
CRY          0
DXY          0
JPY          0
GBP          0
Cl1          0
VIX          0
USGG30YR     0
GT10         0
USGG2YR      0
USGG3M       0
US0001M      0
GTDEM30Y     0
GTDEM10Y     0
GTDEM2Y      0
EONIA        0
GTITL30YR    0
GTITL10YR    0
GTITL2YR     0
GTJPY30YR    0
GTJPY10YR    0
GTJPY2YR     0
GTGBP30Y     0
GTGBP20Y     0
GTGBP2Y      0
LUMSTRUU     0
LMBITR       0
LUACTRUU     0
LF98TRUU     0
LG30TRUU     0
LP01TREU     0
EMUSTRUU     0
LF94TRUU     0
MXUS         0
MXEU         0
MXJP         0
MXBR         0
MXRU         0
MXIN         0
MXCN         0
dtype: int64

DataFrame senza righe contenenti NaN:
      XAU BGNL  ECSURPUS  BDIY       CRY      DXY     JPY     GBP    Cl1  \
0       283.25     0.077  1388  157.2600  100.560  105.86  1.6460  25.77   
1       287.65     0.043  1405  165.0100  101.860  105.47  1.6383  28.85   
2       287.15     0.135  1368  167.2400  102.410  106.04  1

In [None]:
# Standardize every feature column (StandardScaler: zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/validation/test split performed later in the notebook, so statistics
# of the held-out rows influence the scaling. Consider fitting on the training
# split only and applying transform() to the other splits.
scaler = StandardScaler()
df_standardized = scaler.fit_transform(df)
column_names = df.columns.tolist()

# Rebuild a DataFrame from the scaled array. The original index is passed
# explicitly so the row labels stay aligned with the target y even when rows
# were removed from df earlier (the default would be a fresh 0..n-1 index).
df_standardized_pd = pd.DataFrame(df_standardized, columns=column_names, index=df.index)
df_standardized_pd


Unnamed: 0,XAU BGNL,ECSURPUS,BDIY,CRY,DXY,JPY,GBP,Cl1,VIX,USGG30YR,...,LP01TREU,EMUSTRUU,LF94TRUU,MXUS,MXEU,MXJP,MXBR,MXRU,MXIN,MXCN
0,-1.424377,0.116941,-0.432277,-1.289246,0.846232,-0.058102,0.359879,-1.349527,0.290316,2.345104,...,-1.052593,-1.529484,-1.782485,-0.273397,0.956041,1.028789,-1.061289,-1.419419,-1.173857,-0.793990
1,-1.415478,0.019602,-0.423848,-1.175689,0.960528,-0.088487,0.323600,-1.230558,0.174552,2.409617,...,-1.045276,-1.526739,-1.782485,-0.255250,1.040230,1.045740,-0.991858,-1.385836,-1.149828,-0.857078
2,-1.416489,0.282989,-0.442193,-1.143014,1.008884,-0.044078,0.376841,-1.252575,0.350514,2.313697,...,-1.038657,-1.523471,-1.782485,-0.316639,0.894944,0.949983,-1.030691,-1.444539,-1.135224,-0.868402
3,-1.425389,0.443312,-0.470454,-1.148728,1.229563,0.096943,0.193090,-1.254892,0.400292,2.134589,...,-1.015762,-1.524916,-1.801371,-0.317527,1.025317,1.111444,-1.075649,-1.494479,-1.124171,-0.915718
4,-1.393734,0.789723,-0.487312,-1.169535,1.168019,0.209916,0.194033,-1.262618,0.145611,1.971609,...,-1.031405,-1.505970,-1.801371,-0.279370,1.288948,1.248754,-0.971646,-1.440592,-1.073498,-0.914505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106,1.497764,0.867021,0.005523,-0.890078,0.123178,0.164727,-0.912241,-0.113874,0.035636,-1.342820,...,1.781110,1.759206,1.723279,3.142187,1.560758,2.073760,-0.234767,0.108805,2.473954,2.300980
1107,1.412004,0.763956,-0.077773,-0.884629,0.207669,0.287049,-0.931558,-0.006107,-0.044241,-1.306913,...,1.788943,1.736163,1.670111,3.192590,1.678141,2.107362,-0.271440,0.149513,2.461125,2.218884
1108,1.528751,1.156174,-0.083227,-0.870486,0.123090,0.255105,-0.883029,-0.053231,-0.216730,-1.345706,...,1.807212,1.761959,1.734240,3.363323,1.744049,2.049008,-0.216624,0.082013,2.457967,2.345061
1109,1.527558,1.233472,-0.059428,-0.836822,0.080624,0.201346,-0.923077,-0.020399,-0.386903,-1.370407,...,1.804880,1.769406,1.730325,3.466292,1.748859,2.061209,-0.225549,0.105527,2.415978,2.202708


In [None]:
# Split the dataset into a temporary training part (80%) and the test set (20%)
X_train_temp, X_test, y_train_temp, y_test = train_test_split(df_standardized_pd, y, test_size=0.2, random_state=42)

# Split the temporary part into training (75%) and validation (25%),
# i.e. 60% / 20% / 20% of the full dataset overall
X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=0.25, random_state=42)

# The first count is the size of the whole dataset, not of the training set
print("Number of instances in full dataset:", df_standardized_pd.shape[0])
print("Number of instances in training set:", X_train.shape[0])
print("Number of instances in validation set:", X_val.shape[0])
print("Number of instances in test set:", X_test.shape[0])

Number of instances in training set: 1111
Number of instances in training set: 666
Number of instances in validation set: 222
Number of instances in test set: 223


### Buckets for the covariates

In [None]:
# Positional column selectors for each covariate bucket of the training features
bond_positions = slice(9, 35)
equity_positions = slice(35, 42)
commodity_positions = [0, 3, 7]
index_positions = [1, 2, 4, 5, 6, 8]

bond_train = X_train.iloc[:, bond_positions]
equity_train = X_train.iloc[:, equity_positions]
commodities_train = X_train.iloc[:, commodity_positions]
indexes_train = X_train.iloc[:, index_positions]

# Show which columns ended up in each bucket as a sanity check
for label, bucket in (
    ("Bonds: ", bond_train),
    ("Equity: ", equity_train),
    ("Commodities: ", commodities_train),
    ("Indexes: ", indexes_train),
):
    print(label, bucket.columns.tolist())

Bonds:  ['USGG30YR', 'GT10', 'USGG2YR', 'USGG3M', 'US0001M', 'GTDEM30Y', 'GTDEM10Y', 'GTDEM2Y', 'EONIA', 'GTITL30YR', 'GTITL10YR', 'GTITL2YR', 'GTJPY30YR', 'GTJPY10YR', 'GTJPY2YR', 'GTGBP30Y', 'GTGBP20Y', 'GTGBP2Y', 'LUMSTRUU', 'LMBITR', 'LUACTRUU', 'LF98TRUU', 'LG30TRUU', 'LP01TREU', 'EMUSTRUU', 'LF94TRUU']
Equity:  ['MXUS', 'MXEU', 'MXJP', 'MXBR', 'MXRU', 'MXIN', 'MXCN']
Commodities:  ['XAU BGNL', 'CRY', 'Cl1']
Indexes:  ['ECSURPUS', 'BDIY', 'DXY', 'JPY', 'GBP', 'VIX']


In [None]:
# Validation-set buckets, selected by column position in the order:
# bonds, equity, commodities, indexes
val_bucket_positions = (slice(9, 35), slice(35, 42), [0, 3, 7], [1, 2, 4, 5, 6, 8])
bond_val, equity_val, commodities_val, indexes_val = (
    X_val.iloc[:, positions] for positions in val_bucket_positions
)

In [None]:
# Test-set buckets: contiguous column ranges for bonds and equity,
# explicit column positions for commodities and indexes
test_bond_range, test_equity_range = slice(9, 35), slice(35, 42)
test_commodity_cols, test_index_cols = [0, 3, 7], [1, 2, 4, 5, 6, 8]

bond_test = X_test.iloc[:, test_bond_range]
equity_test = X_test.iloc[:, test_equity_range]
commodities_test = X_test.iloc[:, test_commodity_cols]
indexes_test = X_test.iloc[:, test_index_cols]

For each bucket, we fit a decision tree and a random forest on the training set and evaluate them on the validation set,
reporting accuracy, precision, recall and F1 score.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier


# Decision tree on the bond bucket. random_state is fixed so that the fitted
# tree, and therefore the report printed below, is reproducible across runs
# (the random forest below already fixes its seed).
clf = DecisionTreeClassifier(random_state=42)
clf.fit(bond_train, y_train)
y_pred_bond = clf.predict(bond_val)

# Random forest with 100 trees on the same bucket
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(bond_train, y_train)
y_pred_bond_rf = rf_clf.predict(bond_val)

# Summary metrics for the decision tree on the validation set
# (precision/recall/F1 use average='weighted'); not printed, kept for inspection
accuracy = accuracy_score(y_val, y_pred_bond)
precision = precision_score(y_val, y_pred_bond, average='weighted')
recall = recall_score(y_val, y_pred_bond, average='weighted')
f1 = f1_score(y_val, y_pred_bond, average='weighted')

print("Decision tree report for bond:")

report = classification_report(y_val, y_pred_bond)

print(report)

# Same summary metrics for the random forest
accuracy_rf = accuracy_score(y_val, y_pred_bond_rf)
precision_rf = precision_score(y_val, y_pred_bond_rf, average='weighted')
recall_rf = recall_score(y_val, y_pred_bond_rf, average='weighted')
f1_rf = f1_score(y_val, y_pred_bond_rf, average='weighted')

report = classification_report(y_val, y_pred_bond_rf)

print("Random forest report for bond:")

# Print the per-class report of the random forest
print(report)


Decision tree report for bond:
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       179
           1       0.61      0.77      0.68        43

    accuracy                           0.86       222
   macro avg       0.78      0.83      0.80       222
weighted avg       0.88      0.86      0.87       222

Random forest report for bond:
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       179
           1       0.79      0.70      0.74        43

    accuracy                           0.91       222
   macro avg       0.86      0.83      0.84       222
weighted avg       0.90      0.91      0.90       222



In [None]:
# Re-fit the decision tree (clf) and the random forest (rf_clf) defined earlier
# in the notebook on the equity bucket and predict on the validation set
clf.fit(equity_train, y_train)
y_pred_equity = clf.predict(equity_val)

rf_clf.fit(equity_train, y_train)
y_pred_equity_rf = rf_clf.predict(equity_val)

# Summary metrics for the decision tree
# (precision/recall/F1 use average='weighted'); not printed, kept for inspection
accuracy = accuracy_score(y_val, y_pred_equity)
precision = precision_score(y_val, y_pred_equity, average='weighted')
recall = recall_score(y_val, y_pred_equity, average='weighted')
f1 = f1_score(y_val, y_pred_equity, average='weighted')

# Same summary metrics for the random forest
accuracy_rf = accuracy_score(y_val, y_pred_equity_rf)
precision_rf = precision_score(y_val, y_pred_equity_rf, average='weighted')
recall_rf = recall_score(y_val, y_pred_equity_rf, average='weighted')
f1_rf = f1_score(y_val, y_pred_equity_rf, average='weighted')

print("Decision tree report for equity:")

report = classification_report(y_val, y_pred_equity)

print(report)

report = classification_report(y_val, y_pred_equity_rf)

print("Random forest report for equity:")

# Print the per-class report of the random forest
print(report)

Decision tree report for equity:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       179
           1       0.56      0.56      0.56        43

    accuracy                           0.83       222
   macro avg       0.73      0.73      0.73       222
weighted avg       0.83      0.83      0.83       222

Random forest report for equity:
              precision    recall  f1-score   support

           0       0.91      0.95      0.93       179
           1       0.74      0.60      0.67        43

    accuracy                           0.88       222
   macro avg       0.83      0.78      0.80       222
weighted avg       0.88      0.88      0.88       222



In [None]:
# Re-fit the decision tree (clf) and the random forest (rf_clf) defined earlier
# in the notebook on the commodities bucket and predict on the validation set
clf.fit(commodities_train, y_train)
y_pred_commodities = clf.predict(commodities_val)

rf_clf.fit(commodities_train, y_train)
y_pred_commodities_rf = rf_clf.predict(commodities_val)

# Summary metrics for the decision tree
# (precision/recall/F1 use average='weighted'); not printed, kept for inspection
accuracy = accuracy_score(y_val, y_pred_commodities)
precision = precision_score(y_val, y_pred_commodities, average='weighted')
recall = recall_score(y_val, y_pred_commodities, average='weighted')
f1 = f1_score(y_val, y_pred_commodities, average='weighted')

# Same summary metrics for the random forest
accuracy_rf = accuracy_score(y_val, y_pred_commodities_rf)
precision_rf = precision_score(y_val, y_pred_commodities_rf, average='weighted')
recall_rf = recall_score(y_val, y_pred_commodities_rf, average='weighted')
f1_rf = f1_score(y_val, y_pred_commodities_rf, average='weighted')

print("Decision tree report for commodities:")

report = classification_report(y_val, y_pred_commodities)

print(report)

print("Random forest report for commodities:")

report = classification_report(y_val, y_pred_commodities_rf)

# Print the per-class report of the random forest
print(report)

Decision tree report for commodities:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       179
           1       0.56      0.56      0.56        43

    accuracy                           0.83       222
   macro avg       0.73      0.73      0.73       222
weighted avg       0.83      0.83      0.83       222

Random forest report for commodities:
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       179
           1       0.72      0.67      0.70        43

    accuracy                           0.89       222
   macro avg       0.82      0.81      0.81       222
weighted avg       0.88      0.89      0.89       222



In [None]:
# Re-fit the decision tree (clf) and the random forest (rf_clf) defined earlier
# in the notebook on the indexes bucket and predict on the validation set
clf.fit(indexes_train, y_train)
y_pred_indexes = clf.predict(indexes_val)

rf_clf.fit(indexes_train, y_train)
y_pred_indexes_rf = rf_clf.predict(indexes_val)

# Summary metrics for the decision tree
# (precision/recall/F1 use average='weighted'); not printed, kept for inspection
accuracy = accuracy_score(y_val, y_pred_indexes)
precision = precision_score(y_val, y_pred_indexes, average='weighted')
recall = recall_score(y_val, y_pred_indexes, average='weighted')
f1 = f1_score(y_val, y_pred_indexes, average='weighted')

# Same summary metrics for the random forest
accuracy_rf = accuracy_score(y_val, y_pred_indexes_rf)
precision_rf = precision_score(y_val, y_pred_indexes_rf, average='weighted')
recall_rf = recall_score(y_val, y_pred_indexes_rf, average='weighted')
f1_rf = f1_score(y_val, y_pred_indexes_rf, average='weighted')


print("Decision tree report for indexes:")

report = classification_report(y_val, y_pred_indexes)

print(report)

report = classification_report(y_val, y_pred_indexes_rf)

# Header typo ("forerst") corrected
print("Random forest report for indexes:")

# Print the per-class report of the random forest
print(report)

Decision tree report for indexes:
              precision    recall  f1-score   support

           0       0.93      0.88      0.91       179
           1       0.60      0.74      0.67        43

    accuracy                           0.86       222
   macro avg       0.77      0.81      0.79       222
weighted avg       0.87      0.86      0.86       222

Random forerst report for indexes:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       179
           1       0.77      0.70      0.73        43

    accuracy                           0.90       222
   macro avg       0.85      0.82      0.84       222
weighted avg       0.90      0.90      0.90       222



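In the validation results above, the random forest achieves higher accuracy and higher precision on the warning class (1) than the single decision tree in every bucket, which is consistent with its reduced overfitting and better overall performance. Recall on the warning class, which is crucial for our goal of detecting warnings, is more mixed: the random forest identifies more actual warnings for equity and commodities, but fewer for bonds and indexes, so part of its precision gain comes at the cost of some missed warnings.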