<a href="https://colab.research.google.com/github/aidanjmaldonado/penny-stock-lstm/blob/main/penny_stock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
import sqlite3
import requests
import sys
from library.DataSetProcessor import DataSetProcessor
import matplotlib.pyplot as plt

# Create database

In [371]:
# Download the SQLite database from the GitHub repository
historical_url = "https://raw.githubusercontent.com/CSE-115-UCSC/penny-stock-lstm/main/historicaldata.db"

try:
  # Issue the request exactly once, inside the try, so that connection errors
  # are reported below instead of aborting the cell. The timeout keeps a
  # stalled server from hanging the kernel indefinitely.
  scrape_request = requests.get(historical_url, timeout=60)
  scrape_request.raise_for_status()

  # Store the payload locally as 'historical.db'
  with open("historical.db", "wb") as db_file:
    db_file.write(scrape_request.content)

  print("Request to download database succeeded")

except (requests.RequestException, OSError) as err:
  # Report network/HTTP failures and local write failures, including the cause
  sys.stderr.write(f"Request to download database failed: {err}\n")

In [None]:
# Connect to the SQLite database
db = 'historical.db'

try:
    # NOTE: sqlite3.connect creates an empty database file when `db` does not
    # exist, so a successful connection does not prove the download succeeded.
    sqliteConnection = sqlite3.connect(db)
    cursor = sqliteConnection.cursor()
    print(f'SQlite connected with {db}')

except sqlite3.Error as err:
    # Only database errors are handled here; anything else should surface
    sys.stderr.write(f"Failed to connect to database: {err}\n")

In [None]:
# Load every row of the 'all_historical' table into a pandas DataFrame
query = "SELECT * FROM all_historical;"

try:
    # Run the query once; the original probed with the cursor and then ran
    # the same query a second time through pandas.
    all_historical = pd.read_sql_query(query, sqliteConnection)
    if all_historical.empty:
        raise ValueError("No results")

    # Bind `data` only after the result is known to be non-empty
    data = all_historical
    print("Success querying all historical")

except (sqlite3.Error, pd.errors.DatabaseError, ValueError) as err:
    sys.stderr.write(f"Failed to select all historical: {err}\n")

In [None]:
# Ticker symbol of every row, kept alongside the timestamps
tickers = data['ticker']

# Convert the millisecond 'time' column into timezone-naive US/Pacific timestamps
dates = (
    pd.to_datetime(data['time'], unit='ms')
    .dt.tz_localize('UTC')        # mark the parsed values as UTC
    .dt.tz_convert('US/Pacific')  # shift to Pacific time
    .dt.tz_localize(None)         # drop the timezone, keeping the Pacific wall-clock values
)

# Dataset Normalization

In [None]:
# Empty frame with the two feature columns
feature_columns = ['close', 'volume']
normalized_data = pd.DataFrame(columns=feature_columns)

In [None]:
# Split the rows by ticker and add 'closeNorm': each close divided by that ticker's maximum close
dataByTicker = {}
for ticker in data['ticker'].unique():
    ticker_rows = data.loc[data['ticker'] == ticker].copy()
    peak_close = ticker_rows['close'].max()
    ticker_rows['closeNorm'] = ticker_rows['close'] / peak_close
    dataByTicker[ticker] = ticker_rows

In [None]:
# Assemble the model features: each ticker's normalized close plus its volume.
# The frame is rebuilt from scratch with a single concat, so re-running this
# cell cannot append duplicate rows, and no empty frame takes part in the
# concat (pandas deprecates that case because it affects the result dtypes).
ticker_frames = [
    pd.DataFrame({'close': frame['closeNorm'], 'volume': frame['volume']})
    for frame in dataByTicker.values()
]

if ticker_frames:
    # ignore_index=True already yields a fresh 0..n-1 index; no reset_index needed
    normalized_data = pd.concat(ticker_frames, ignore_index=True)
else:
    # pd.concat rejects an empty list, so fall back to an empty feature frame
    normalized_data = pd.DataFrame(columns=['close', 'volume'])

# NOTE(review): rows are grouped by ticker in dict order here, while
# create_sequences looks up the global `dates`/`tickers` by row position.
# That only lines up if `data` is already grouped by ticker - confirm.

# Build training sequences from all historical stock data

In [None]:

# Function to generate sequenced arrays based on sequence and prediction lengths
def create_sequences(data, SEQUENCE_LENGTH, PREDICTION_LENGTH, dates=None, tickers=None):
    """Slice ``data`` into (input window, target window) pairs.

    Rows are scanned by position. A window starting at row ``i`` is kept when
    row ``i + SEQUENCE_LENGTH`` has the same calendar date and the same ticker
    as row ``i``; the scan then advances by ``SEQUENCE_LENGTH`` rows. Otherwise
    the remainder of the current (ticker, date) run is skipped.

    Args:
        data: DataFrame of features; column 0 is used as the prediction target.
        SEQUENCE_LENGTH: number of rows in each input window (must be >= 1).
        PREDICTION_LENGTH: number of rows in each target window (must be >= 1).
        dates: optional datetime Series, one entry per row of ``data``.
            Defaults to the notebook-level ``dates``.
        tickers: optional sequence of ticker symbols, one per row of ``data``.
            Defaults to the notebook-level ``tickers``.

    Returns:
        Tuple ``(x, y)`` of NumPy arrays shaped
        ``(n, SEQUENCE_LENGTH, n_features)`` and ``(n, PREDICTION_LENGTH)``.
        With no valid window, ``n`` is 0 and the arrays are empty but keep
        those shapes.

    Raises:
        ValueError: if a length is not positive, or if ``dates``/``tickers``
            do not have one entry per row of ``data``.
    """
    if SEQUENCE_LENGTH < 1 or PREDICTION_LENGTH < 1:
        # A zero sequence length would never advance the scan below
        raise ValueError("SEQUENCE_LENGTH and PREDICTION_LENGTH must be positive")

    # Fall back to the notebook globals so existing three-argument calls keep working
    if dates is None:
        dates = globals()["dates"]
    if tickers is None:
        tickers = globals()["tickers"]

    # Positional arrays: one date conversion up front instead of one per lookup
    day_of_row = pd.Series(dates).dt.date.to_numpy()
    ticker_of_row = np.asarray(tickers)
    n_rows = len(data)
    if len(day_of_row) != n_rows or len(ticker_of_row) != n_rows:
        raise ValueError("dates and tickers must have one entry per row of data")

    features = data.to_numpy()
    targets = data.iloc[:, 0].to_numpy()  # only the first column is predicted

    xs, ys = [], []
    count = 0
    index = 0
    last_start = n_rows - SEQUENCE_LENGTH - PREDICTION_LENGTH
    while index <= last_start:
        window_end = index + SEQUENCE_LENGTH
        # NOTE(review): only the first target row (window_end) is compared with
        # the window start; the remaining PREDICTION_LENGTH - 1 target rows may
        # fall on a later date or another ticker - confirm this is intended.
        same_day = day_of_row[index] == day_of_row[window_end]
        same_ticker = ticker_of_row[index] == ticker_of_row[window_end]
        if same_day and same_ticker:
            xs.append(features[index:window_end])  # Use past data for features
            ys.append(targets[window_end:window_end + PREDICTION_LENGTH])
            index = window_end
            count += 1
        else:
            # Discard the rest of this (ticker, date) run. The ticker check keeps
            # the skip from running into the next ticker when it shares the date,
            # and the bound check keeps it from reading past the last row.
            run_day, run_ticker = day_of_row[index], ticker_of_row[index]
            index += 1
            while (index < n_rows
                   and day_of_row[index] == run_day
                   and ticker_of_row[index] == run_ticker):
                index += 1

    print("Valid days:", count)
    if not xs:
        # Keep the documented rank even when nothing qualified
        return (np.empty((0, SEQUENCE_LENGTH, data.shape[1])),
                np.empty((0, PREDICTION_LENGTH)))
    return np.array(xs), np.array(ys)

# Window sizes in rows (original note: one day sequence length)
SEQUENCE_LENGTH = 78
PREDICTION_LENGTH = 78

# Create the input windows (x) and the ground-truth targets (y)
x, y = create_sequences(
    data=normalized_data,
    SEQUENCE_LENGTH=SEQUENCE_LENGTH,
    PREDICTION_LENGTH=PREDICTION_LENGTH,
)



# Training Pipeline

In [None]:

# Fail early with a clear message instead of an IndexError on x_train.shape[2]
if x.ndim != 3 or len(x) == 0:
    raise ValueError("No training windows available; check the output of create_sequences")

# Tunable training settings
TRAIN_FRACTION = 0.8
EPOCHS = 100
# Patience must be smaller than EPOCHS or early stopping can never trigger
# (the original used patience=100 with epochs=100).
PATIENCE = 10

# Ordered 80%-20% split (no shuffling) into train and test sets
train_size = int(len(x) * TRAIN_FRACTION)
x_train, x_test = x[:train_size], x[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Type adjustment, applied to both splits so the held-out arrays match the training dtype
x_train = x_train.astype(np.float32)
y_train = y_train.astype(np.float32)
x_test = x_test.astype(np.float32)
y_test = y_test.astype(np.float32)

# Build LSTM model
model = Sequential()
# First LSTM returns the full sequence so that it can feed the second LSTM layer
model.add(LSTM(50, return_sequences=True, input_shape=(SEQUENCE_LENGTH, x_train.shape[2])))
# Second LSTM returns only its final output, giving the Dense layers one vector per sample
model.add(LSTM(50))
model.add(Dense(40, activation='relu'))
model.add(Dropout(0.1))  # Regularization against overfitting
model.add(Dense(PREDICTION_LENGTH))  # One output per predicted step
model.compile(optimizer='adam', loss='mse')  # Adam optimizer, mean squared error loss

# Train the model; validation_split holds out 10% of the training samples for val_loss
early_stopping = EarlyStopping(monitor='val_loss', patience=PATIENCE, restore_best_weights=True)
history = model.fit(
    x_train,
    y_train,
    epochs=EPOCHS,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Persist the trained model (Model.save stores the whole model, not just its weights)
model.save(filepath='model.h5')