Copyright (c) 2023 Bronte Sihan Li

License: MIT License

# Project 5: Going further with Deep Learning

## 2. Implement two ML methods for 1-day binary prediction
* Random Forest
* ANN

### Data Preprocessing

In [10]:
import pandas as pd

df = pd.read_csv('data/day1prediction.csv', index_col=0)
df.shape

(1984, 416)

In [11]:
print(df.columns)

Index(['Close', 'Volume', 'mom', 'mom1', 'mom2', 'mom3', 'ROC_5', 'ROC_10',
       'ROC_15', 'ROC_20',
       ...
       'Dollar index.4', 'wheat-F.4', 'XAG.4', 'XAU.4', 'DJI_label',
       'NASDAQ_label', 'NYSE_label', 'RUSSELL_label', 'S&P_label',
       'train_test'],
      dtype='object', length=416)


In [12]:
from sklearn.model_selection import train_test_split

LABEL_COLS = 'DJI_label,NASDAQ_label,NYSE_label,RUSSELL_label,S&P_label'.split(',')
TRAIN_TEST_COL = 'train_test'

# Split data into train, validation and test
train_df = df[df[TRAIN_TEST_COL] == 0]
test_df = df[df[TRAIN_TEST_COL] == 1]

X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(columns=LABEL_COLS + [TRAIN_TEST_COL]), train_df[LABEL_COLS], test_size=0.2)
X_test = test_df.drop(columns=LABEL_COLS + [TRAIN_TEST_COL])
y_test = test_df[LABEL_COLS]



In [13]:
X_train.shape, X_val.shape, X_test.shape

((1410, 410), (353, 410), (221, 410))

In [14]:
X_train.to_csv('data/day1/train.csv')
y_train.to_csv('data/day1/train_labels.csv')
X_val.to_csv('data/day1/val.csv')
y_val.to_csv('data/day1/val_labels.csv')
X_test.to_csv('data/day1/test.csv')
y_test.to_csv('data/day1/test_labels.csv')

### Getting a baseline with Random Forest Classifier
* Normalize data
* PCA
* Train random forest classifier

In [16]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Create pipeline
pipeline = Pipeline([
    ('pca', PCA(n_components='mle', whiten=True, random_state=0)),
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0))
])
pipeline.fit(X_train, y_train)
pipeline.score(X_val, y_val)

0.5439093484419264

In [7]:
print(X_train.shape)

(1410, 410)


In [7]:
from sklearn.metrics import classification_report, multilabel_confusion_matrix, accuracy_score

print(accuracy_score(y_val, pipeline.predict(X_val)))
print(classification_report(y_val, pipeline.predict(X_val), target_names=LABEL_COLS))

0.5977337110481586
               precision    recall  f1-score   support

    DJI_label       0.78      0.66      0.72       164
 NASDAQ_label       0.85      0.69      0.76       159
   NYSE_label       0.80      0.68      0.73       166
RUSSELL_label       0.82      0.68      0.75       165
    S&P_label       0.82      0.68      0.74       161

    micro avg       0.81      0.68      0.74       815
    macro avg       0.81      0.68      0.74       815
 weighted avg       0.81      0.68      0.74       815
  samples avg       0.34      0.35      0.33       815



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

multi_confusion_matrix = multilabel_confusion_matrix(y_val, pipeline.predict(X_val))

# Plot confusion matrix for each label
fig = make_subplots(rows=2, cols=3, subplot_titles=LABEL_COLS)
for i, label in enumerate(LABEL_COLS):
    fig.add_trace(
        go.Heatmap(
            z=multi_confusion_matrix[i],
            x=['Predicted Down', 'Predicted Up'],
            y=['Actual Down', 'Actual Up'],
            colorscale='Viridis',
            showscale=False,
            name=label,
            text = multi_confusion_matrix[i],
            texttemplate = '%{text}',
        ),
        row=i // 3 + 1,
        col=i % 3 + 1,
    )
fig.update_layout(height=800, width=800, title_text='Confusion Matrix for Each Label')
fig.show()


## 3. Implement an ML method that uses more than one day to predict each market index

### Preprocessing

To analyze the data with a 2D convolutional neural network using the last 10 days of data, we will do the following preprocessing steps:
* Split the into training data into training and validation sets by data
* Perform PCA on the training data to reduce the dimensionality of the data, this is done in `part3_4.py`.

In [9]:
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(columns=LABEL_COLS + [TRAIN_TEST_COL]), train_df[LABEL_COLS], test_size=0.2,
    shuffle=False, stratify=None)

X_train.to_csv('data/day10/train.csv')
y_train.to_csv('data/day10/train_labels.csv')
X_val.to_csv('data/day10/val.csv')
y_val.to_csv('data/day10/val_labels.csv')


In [None]:
X_test = test_df.drop(columns=LABEL_COLS + [TRAIN_TEST_COL])
y_test = test_df[LABEL_COLS]
X_test.to_csv('data/day10/test.csv')
y_test.to_csv('data/day10/test_labels.csv')

In [2]:
from utils.net import StockCNN

# Create model
model = StockCNN(10, 20)
print(model)

StockCNN(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1), padding=same)
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (conv3): Conv2d(20, 40, kernel_size=(5, 5), stride=(1, 1))
  (conv3_drop): Dropout2d(p=0.3, inplace=False)
  (fc1): LazyLinear(in_features=0, out_features=10, bias=True)
  (activation): ReLU()
  (fc2): Linear(in_features=10, out_features=20, bias=True)
  (fc3): Linear(in_features=20, out_features=5, bias=True)
  (sigmoid): Sigmoid()
)


