In [1]:
import yfinance as yf

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas_ta as ta
from multiprocessing.pool import Pool
from sklearn.manifold import TSNE
import pandas as pd
import pickle
import numpy as np
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.neural_network import MLPClassifier

In [3]:
tickers = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
print(tickers.head())

  Symbol     Security             GICS Sector               GICS Sub-Industry  \
0    MMM           3M             Industrials        Industrial Conglomerates   
1    AOS  A. O. Smith             Industrials               Building Products   
2    ABT       Abbott             Health Care           Health Care Equipment   
3   ABBV       AbbVie             Health Care                   Biotechnology   
4    ACN    Accenture  Information Technology  IT Consulting & Other Services   

     Headquarters Location  Date added      CIK      Founded  
0    Saint Paul, Minnesota  1957-03-04    66740         1902  
1     Milwaukee, Wisconsin  2017-07-26    91142         1916  
2  North Chicago, Illinois  1957-03-04     1800         1888  
3  North Chicago, Illinois  2012-12-31  1551152  2013 (1888)  
4          Dublin, Ireland  2011-07-06  1467373         1989  


In [4]:
%%time
with open("hist_data_2018-1-1_2023-7-12.pkl","rb") as f:
    data = pickle.load(f)

CPU times: total: 0 ns
Wall time: 34 ms


In [5]:
columns = list(data["Open"].columns)

In [6]:
coli_with_na = np.where(np.sum(pd.isna(data["Close"]))>0)[0]
for coli in coli_with_na:
    print(columns[coli])

ABNB
BF.B
BRK.B
CARR
CDAY
CEG
CTVA
DOW
FOX
FOXA
GEHC
KVUE
MRNA
OTIS
UBER
VLTO


In [7]:
opens= data["Open"].drop(columns=[columns[coli] for coli in coli_with_na])
closes = data["Close"].drop(columns=[columns[coli] for coli in coli_with_na])
# Open	High	Low	Close	Volume
highs = data["High"].drop(columns=[columns[coli] for coli in coli_with_na])
lows = data["Low"].drop(columns=[columns[coli] for coli in coli_with_na])
volumes = data["Volume"].drop(columns=[columns[coli] for coli in coli_with_na])

In [8]:
opens.shape,closes.shape,highs.shape,lows.shape,volumes.shape

((1389, 487), (1389, 487), (1389, 487), (1389, 487), (1389, 487))

In [9]:
ticker="TSLA"
training_X=[]
training_Y=[]
correlation_Y = []
temp_df_full = pd.DataFrame(data={"Open":opens[ticker],
                         "High":highs[ticker],
                         "Low":lows[ticker],
                         "Close":closes[ticker],
                         "Volume":volumes[ticker],
                        })

In [10]:
def clamp(df,w_min,w_max):
    return (df-w_min)/(w_max-w_min)

In [11]:
%%time
with open("hist_data_2018-1-1_2023-7-12_3_0_2.pkl","rb") as f:
    [training_X,training_Y,correlation_Y] = pickle.load(f)

CPU times: total: 15.6 ms
Wall time: 12 ms


In [12]:
training_X.shape,training_Y.shape,correlation_Y.shape

((66232, 14), (66232,), (66232,))

In [13]:
threshold = 0.02
train_data_y_discrete = np.asarray(list(map(lambda x: 2 if (x>threshold) else (1 if x<-threshold else 0),training_Y)))
Counter(train_data_y_discrete)

Counter({0: 52931, 1: 7168, 2: 6133})

In [14]:
sel = np.hstack([np.where(train_data_y_discrete!=0)[0],np.where(train_data_y_discrete==0)[0][::8]])

In [15]:
Counter(train_data_y_discrete[sel])

Counter({1: 7168, 2: 6133, 0: 6617})

In [62]:
#X_train, X_test, y_train, y_test = train_test_split(training_X[sel], train_data_y_discrete[sel], test_size=0.3,random_state=109,shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(training_X, train_data_y_discrete, test_size=0.3,random_state=109,shuffle=True)

In [63]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((46362, 14), (19870, 14), (46362,), (19870,))

In [64]:
import joblib

# use pytorch

In [65]:
import torch
import torch.nn as nn
import torch.optim as optim

class MLPClassifierPyTorch(nn.Module):
    def __init__(self, input_size, num_classes=3):
        super(MLPClassifierPyTorch, self).__init__()
        self.fc1 = nn.Linear(input_size, 150)
        self.fc2 = nn.Linear(150, 150)
        self.fc3 = nn.Linear(150, 50)
        self.fc4 = nn.Linear(50, 10)
        self.fc5 = nn.Linear(10, 50)
        self.fc6 = nn.Linear(50, 150)
        self.fc7 = nn.Linear(150, num_classes)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.relu(self.fc4(x))
        x = torch.relu(self.fc5(x))
        x = torch.relu(self.fc6(x))
        x = self.fc7(x)  # No activation here, CrossEntropyLoss will apply softmax
        return x

In [67]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import precision_score, recall_score

In [82]:
len(train_data_y_discrete)

66232

In [74]:
# Assuming 'X' and 'y' are your features and labels with shapes (1000, 14) and (1000,)
# Convert them to PyTorch tensors
#X = torch.tensor(training_X[sel], dtype=torch.float32)
#y = torch.tensor(train_data_y_discrete[sel], dtype=torch.long)
X = torch.tensor(training_X, dtype=torch.float32)
y = torch.tensor(train_data_y_discrete, dtype=torch.long)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15,shuffle=True)

# Create DataLoader for training and testing sets
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

batch_size = 5000#128*4
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)


input_size = 14
model = MLPClassifierPyTorch(input_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [75]:
%%time
# Training loop
loss_history = []
for epoch in range(250):
    print(epoch,end=" ")
    avg_loss = []
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        avg_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    loss_history.append(np.mean(avg_loss))
    print(f" {loss.item():.2f},",end=" ")

0  0.90, 1  0.66, 2  0.63, 3  0.64, 4  0.63, 5  0.62, 6  0.63, 7  0.64, 8  0.64, 9  0.63, 10  0.64, 11  0.66, 12  0.64, 13  0.67, 14  0.65, 15  0.67, 16  0.62, 17  0.62, 18  0.63, 19  0.65, 20  0.65, 21  0.64, 22  0.67, 23  0.67, 24  0.66, 25  0.65, 26  0.62, 27  0.64, 28  0.64, 29  0.62, 30  0.65, 31  0.64, 32  0.63, 33  0.65, 34  0.63, 35  0.64, 36  0.64, 37  0.65, 38  0.63, 39  0.64, 40  0.64, 41  0.61, 42  0.62, 43  0.64, 44  0.66, 45  0.63, 46  0.63, 47  0.63, 48  0.62, 49  0.63, 50  0.62, 51  0.63, 52  0.61, 53  0.65, 54  0.62, 55  0.62, 56  0.65, 57  0.62, 58  0.65, 59  0.64, 60  0.60, 61  0.61, 62  0.63, 63  0.61, 64  0.62, 65  0.63, 66  0.60, 67  0.66, 68  0.61, 69  0.62, 70  0.64, 71  0.62, 72  0.62, 73  0.63, 74  0.63, 75  0.63, 76  0.65, 77  0.65, 78  0.65, 79  0.61, 80  0.65, 81  0.60, 82  0.64, 83  0.62, 84  0.64, 85  0.62, 86  0.64, 87  0.63, 88  0.62, 89  0.64, 90  0.66, 91  0.64, 92  0.63, 93  0.62, 94  0.64, 95  0.63, 96  0.65, 97  0.63, 98  0.60, 99  0.67, 100  0.64,

In [76]:
%%time
# Evaluation
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate precision and recall for each class
precision = precision_score(all_labels, all_predictions, average=None)
recall = recall_score(all_labels, all_predictions, average=None)

# Print precision and recall for each class
for i in range(len(precision)):
    print(f'Class {i} - Precision: {precision[i]:.2f}, Recall: {recall[i]:.2f}')

Class 0 - Precision: 0.80, Recall: 1.00
Class 1 - Precision: 0.00, Recall: 0.00
Class 2 - Precision: 0.00, Recall: 0.00
CPU times: total: 484 ms
Wall time: 613 ms


  _warn_prf(average, modifier, msg_start, len(result))


In [77]:
torch.cuda.is_available()

True

## running on gpu

In [86]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X = torch.tensor(training_X, dtype=torch.float32).to(device)
y = torch.tensor(train_data_y_discrete, dtype=torch.long).to(device)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, shuffle=True)

# Create DataLoader for training and testing sets
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

batch_size = 2000
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Initialize the model and move it to the selected device
input_size = 14
model = MLPClassifierPyTorch(input_size).to(device)

criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 1.0, 30.0]).to(device))
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [84]:
%%time
# Training loop
loss_history = []
for epoch in range(250):
    print(epoch, end=" ")
    avg_loss = []
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        avg_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    loss_history.append(np.mean(avg_loss))
    print(f" {loss.item():.2f},", end=" ")

0  0.68, 1  0.66, 2  0.64, 3  0.63, 4  0.65, 5  0.63, 6  0.58, 7  0.69, 8  0.67, 9  0.62, 10  0.60, 11  0.62, 12  0.64, 13  0.69, 14  0.69, 15  0.60, 16  0.67, 17  0.60, 18  0.68, 19  0.67, 20  0.70, 21  0.61, 22  0.66, 23  0.64, 24  0.66, 25  0.61, 26  0.62, 27  0.61, 28  0.58, 29  0.61, 30  0.60, 31  0.66, 32  0.68, 33  0.69, 34  0.68, 35  0.62, 36  0.61, 37  0.71, 38  0.63, 39  0.62, 40  0.68, 41  0.63, 42  0.61, 43  0.61, 44  0.69, 45  0.69, 46  0.67, 47  0.65, 48  0.61, 49  0.68, 50  0.62, 51  0.56, 52  0.71, 53  0.64, 54  0.63, 55  0.61, 56  0.64, 57  0.61, 58  0.63, 59  0.67, 60  0.61, 61  0.61, 62  0.62, 63  0.63, 64  0.65, 65  0.61, 66  0.68, 67  0.64, 68  0.59, 69  0.63, 70  0.62, 71  0.67, 72  0.60, 73  0.67, 74  0.65, 75  0.65, 76  0.66, 77  0.63, 78  0.60, 79  0.61, 80  0.63, 81  0.66, 82  0.65, 83  0.65, 84  0.65, 85  0.63, 86  0.66, 87  0.60, 88  0.62, 89  0.63, 90  0.64, 91  0.65, 92  0.61, 93  0.64, 94  0.65, 95  0.60, 96  0.64, 97  0.59, 98  0.68, 99  0.62, 100  0.57,

In [85]:
# Evaluation
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate precision and recall for each class
precision = precision_score(all_labels, all_predictions, average=None)
recall = recall_score(all_labels, all_predictions, average=None)

# Print precision and recall for each class
for i in range(len(precision)):
    print(f'Class {i} - Precision: {precision[i]:.2f}, Recall: {recall[i]:.2f}')

Class 0 - Precision: 0.88, Recall: 0.00
Class 1 - Precision: 0.00, Recall: 0.00
Class 2 - Precision: 0.09, Recall: 1.00


  _warn_prf(average, modifier, msg_start, len(result))


# custom loss

In [107]:
import torch.nn.functional as F

class CombinedLoss(nn.Module):
    def __init__(self, alpha=0.5, beta=0.5):
        super(CombinedLoss, self).__init__()
        self.alpha = alpha  # Interpolation factor for combining losses
        self.beta = beta  # Balancing factor between false positives and false negatives for class 2
        self.cross_entropy_loss = nn.CrossEntropyLoss()

    def forward(self, outputs, targets):
        # Standard CrossEntropyLoss
        ce_loss = self.cross_entropy_loss(outputs, targets)

        # Custom loss focusing on label 2 (both false positives and false negatives)
        probabilities = F.softmax(outputs, dim=1)
        label_2_probs = probabilities[:, 2]

        # Penalize false positives for label 2
        non_label_2_mask = (targets != 2)
        false_positives = label_2_probs * non_label_2_mask.float()

        # Penalize false negatives for label 2
        label_2_mask = (targets == 2)
        false_negatives = (1 - label_2_probs) * label_2_mask.float()

        # Combined custom loss for label 2
        custom_loss = self.beta * false_positives.sum() + (1 - self.beta) * false_negatives.sum()

        # Combined loss
        combined_loss = (1 - self.alpha) * ce_loss + self.alpha * custom_loss
        return combined_loss

In [111]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X = torch.tensor(training_X, dtype=torch.float32).to(device)
y = torch.tensor(train_data_y_discrete, dtype=torch.long).to(device)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, shuffle=True)

# Create DataLoader for training and testing sets
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

batch_size = 2000
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Initialize the model and move it to the selected device
input_size = 14
model = MLPClassifierPyTorch(input_size).to(device)

criterion = CombinedLoss(alpha=0.5,beta=0.5)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [112]:
%%time
# Training loop
loss_history = []
for epoch in range(50):
    if epoch%10==0:
        print(epoch, end=" ")
    avg_loss = []
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        avg_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    loss_history.append(np.mean(avg_loss))
    print(f" {loss.item():.2f},", end=" ")
    if epoch%10==0:
        print()

0  27.41, 
 21.67,  24.16,  22.40,  22.10,  22.33,  22.85,  19.55,  26.41,  25.13, 10  19.03, 
 19.54,  23.34,  25.63,  24.36,  22.59,  20.83,  23.11,  28.18,  22.85, 20  18.01, 
 28.69,  23.88,  22.09,  25.90,  21.55,  24.87,  22.83,  23.35,  26.16, 30  25.38, 
 24.13,  25.88,  25.90,  27.16,  20.57,  23.60,  24.62,  26.42,  23.60, 40  23.36, 
 23.12,  22.84,  23.33,  23.10,  28.18,  22.31,  20.57,  23.61,  25.65, CPU times: total: 38.9 s
Wall time: 1min 26s


In [113]:
# Evaluation
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate precision and recall for each class
precision = precision_score(all_labels, all_predictions, average=None,zero_division=0)
recall = recall_score(all_labels, all_predictions, average=None,zero_division=0)

# Print precision and recall for each class
for i in range(len(precision)):
    print(f'Class {i} - Precision: {precision[i]:.2f}, Recall: {recall[i]:.2f}')

Class 0 - Precision: 0.80, Recall: 1.00
Class 1 - Precision: 0.00, Recall: 0.00
Class 2 - Precision: 0.00, Recall: 0.00
