In [87]:
# !pip install psycopg2-binary
# !pip install yfinance
# !pip install torch
# !pip install scikit-learn

In [86]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import torch 
import torch.nn as nn 
import torch.optim as optim
import torch.functional as F
import psycopg2
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta
from torch.utils.data import TensorDataset, DataLoader
import warnings
warnings.filterwarnings('ignore')

In [11]:
def stock_ingestion(ticker_list):
    end_date = datetime.now()
    start_date = end_date - timedelta(days=1825)
    
    stock_data = {}
    
    for ticker in ticker_list:
        try:
            stock = yf.Ticker(ticker)
            hist_data = stock.history(
                start=start_date.strftime('%Y-%m-%d'),
                end=end_date.strftime('%Y-%m-%d'),
                interval='1d'
            )
            hist_data['Ticker'] = ticker
            
            stock_data[ticker] = hist_data
        except Exception as e:
            print(f'could not rertieve info for {ticker}: {str(e)}')
    
    combined_data = pd.concat(stock_data.values())
    combined_data.reset_index(inplace=True)
    combined_data = combined_data[['Ticker', 'Date'] + [col for col in combined_data.columns if col not in ['Ticker', 'Date']]]
    
    
    return combined_data
    
ticker_list = ['MSFT', 'AAPL', 'GOOGL', 'AMZN', 'META', 'NVDA', 'TSLA', 'JPM', 'V', 'MA', 'BAC', 
               'GS', 'JNJ', 'UNH', 'PFE', 'ABBV', 'WMT', 'PG','KO', 'PEP', 'CAT', 'BA', 'HON', 'DIS', 
               'NFLX', 'INTC', 'AMD', 'QCOM', 'XOM', 'CVX']
stock_data = stock_ingestion(ticker_list)

In [85]:
stock_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37710 entries, 0 to 37709
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype                           
---  ------        --------------  -----                           
 0   Ticker        37710 non-null  object                          
 1   Date          37710 non-null  datetime64[ns, America/New_York]
 2   Open          37710 non-null  float64                         
 3   High          37710 non-null  float64                         
 4   Low           37710 non-null  float64                         
 5   Close         37710 non-null  float64                         
 6   Volume        37710 non-null  int64                           
 7   Dividends     37710 non-null  float64                         
 8   Stock Splits  37710 non-null  float64                         
 9   year          37710 non-null  int32                           
 10  month         37710 non-null  int32                           
 11  da

In [14]:
stock_data.shape

(37710, 9)

In [15]:
stock_data.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
count,37710.0,37710.0,37710.0,37710.0,37710.0,37710.0,37710.0
mean,168.698106,170.687184,166.687501,168.720364,39364830.0,0.009442,0.001858
std,124.412268,125.785261,123.017337,124.427389,90823010.0,0.104442,0.160879
min,4.984939,5.174166,4.501143,4.892763,460100.0,0.0,0.0
25%,79.991919,80.953674,78.86212,79.878531,5450325.0,0.0,0.0
50%,144.849906,146.307168,143.279384,144.765503,12385400.0,0.0,0.0
75%,208.986257,211.448502,206.471741,209.126301,33908080.0,0.0,0.0
max,833.669983,841.0,830.02002,837.26001,1543911000.0,3.0,20.0


In [16]:
print(len(stock_data))

37710


In [98]:
print(stock_data['Close'].min())
print(stock_data['Close'].max())

4.892763137817383
837.260009765625


In [92]:
def temporal_preprocessing(df, sequence_length=10, test_size=0.3):
    df['Date'] = pd.to_datetime(df['Date'])
    df['year'] = df['Date'].dt.year
    df['month'] = df['Date'].dt.month
    df['day'] = df['Date'].dt.day 
    df['day_of_week'] = df['Date'].dt.dayofweek 
    
    features = ['Open', 'High', 'Low', 'Close', 'Volume', 
               'year', 'month', 'day', 'day_of_week']
    data = df[features].values
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data)
    
    X, y = [], []
    
    for i in range(len(data_scaled) - sequence_length):
        X.append(data_scaled[i:(i + sequence_length)])
        y.append(data_scaled[i + sequence_length, 3]) #3 = index for close price (dependent variable)
    
    X, y = np.array(X), np.array(y)
    
    train_size = int(len(X) * (1 - test_size))
    
    X_train = X[:train_size]
    X_test = X[train_size:]
    y_train = y[:train_size]
    y_test = y[train_size:]
    
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
    
    dataset_train = TensorDataset(X_train_tensor, y_train_tensor)
    dataloader_train = DataLoader(dataset=dataset, batch_size=32, 
                            shuffle=True, pin_memory=torch.cuda.is_available)
    
    dataset_test = TensorDataset(X_test_tensor, y_test_tensor)
    dataloader_test = DataLoader(dataset=dataset, batch_size=32, 
                                shuffle=True, pin_memory=torch.cuda.is_available)

    return dataloader_train, dataloader_test

dataloader_train, dataloader_test = temporal_preprocessing(stock_data)

In [94]:
input_size = 9 #num features 
hidden_size = 64
num_layers = 2
output_size = 1

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, 
                            hidden_size=hidden_size, 
                            num_layers=num_layers,
                            batch_first=True, 
                            dropout=0.2
                           )
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        #initialize hidden and cell states with zeroes
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        
        return out 

model = LSTM(input_size, hidden_size, num_layers)
floss = nn.MSELoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.0001, weight_decay=0)
epochs = 20 

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in dataloader_train:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = floss(outputs, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        
    epoch_loss /= len(dataloader_train)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

    
model.eval()
test_loss = 0 
with torch.no_grad():
    for X_test, y_test in dataloader_test:
        outputs = model(X_test)
        loss = floss(outputs, y_test)
        test_loss += loss.item()

rmse = np.sqrt(test_loss)
print(rmse)

Epoch 1/20, Loss: 0.0635
Epoch 2/20, Loss: 0.0482
Epoch 3/20, Loss: 0.0385
Epoch 4/20, Loss: 0.0323
Epoch 5/20, Loss: 0.0285
Epoch 6/20, Loss: 0.0260
Epoch 7/20, Loss: 0.0244
Epoch 8/20, Loss: 0.0234
Epoch 9/20, Loss: 0.0228
Epoch 10/20, Loss: 0.0224
Epoch 11/20, Loss: 0.0222
Epoch 12/20, Loss: 0.0220
Epoch 13/20, Loss: 0.0219
Epoch 14/20, Loss: 0.0219
Epoch 15/20, Loss: 0.0218
Epoch 16/20, Loss: 0.0218
Epoch 17/20, Loss: 0.0218
Epoch 18/20, Loss: 0.0218
Epoch 19/20, Loss: 0.0218
Epoch 20/20, Loss: 0.0218
4.237044861582323


In [99]:
#to do: implement early stopping 
#push and consume data from postgres db 
#improve model - regularization, try better optimizers. likely adam