<a href="https://colab.research.google.com/github/carlosvalenciano/Project4/blob/main/Stock_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stock price prediction using LSTM neural network and Tensorflow
What do we need here:
1. Load data
2. Scale data for machine learning model
3. Setup neural network
4. Compile model
5. Teach neural netowk and fit this
6. Use the model for prediction
7. Draw the results chart

In [38]:
# Requirements
!pip install yahoo_fin



In [63]:
import numpy as np
import time as tm
import datetime as dt
import tensorflow as tf
import pandas as pd

# Data preparation
from yahoo_fin import stock_info as yf
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from collections import deque

# AI
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout

# For time stamps
from datetime import datetime



# Graphics library
import matplotlib.pyplot as plt

In [89]:
# SETTINGS

# Window size or the sequence length, 7 (1 week)
N_STEPS = 7

# Lookup steps, 1 is the next day, 3 = after tomorrow
LOOKUP_STEPS = [1, 2, 3]

# Stock ticker, GOOGL
stocks = 'GOOGL'


# Current date
date_now = tm.strftime('%Y-%m-%d')
date_3_years_back = (dt.date.today() - dt.timedelta(days=1104)).strftime('%Y-%m-%d')

In [65]:
# LOAD DATA
# from yahoo_fin
# for 1104 bars with interval = 1d (one day)
init_df = yf.get_data(
    stocks,
    start_date=date_3_years_back,
    end_date=date_now,
    interval='1d')

In [66]:
init_df

Unnamed: 0,open,high,low,close,adjclose,volume,ticker
2020-12-01,88.333000,91.085999,88.151497,89.767998,89.767998,37350000,GOOGL
2020-12-02,89.767998,91.637001,89.258499,91.248497,91.248497,29424000,GOOGL
2020-12-03,91.027000,92.191498,90.849998,91.092003,91.092003,24728000,GOOGL
2020-12-04,91.011002,91.474998,90.679497,91.188004,91.188004,20544000,GOOGL
2020-12-07,90.777496,91.464500,90.152000,90.851501,90.851501,22288000,GOOGL
...,...,...,...,...,...,...,...
2023-12-04,129.880005,130.029999,127.900002,129.270004,129.270004,36669900,GOOGL
2023-12-05,128.949997,132.139999,128.250000,130.990005,130.990005,27384800,GOOGL
2023-12-06,131.440002,131.839996,129.880005,130.020004,130.020004,23576200,GOOGL
2023-12-07,135.039993,138.559998,134.699997,136.929993,136.929993,56767100,GOOGL


In [68]:
df_filter = init_df["adjclose"].pct_change() * 100
df_filter = df_filter.rename("Today")
df_filter = df_filter.reset_index()
df_filter["Volume"] = init_df["volume"].shift(1).values / 1000_000_000

In [7]:
df_filter

Unnamed: 0,index,Today,Volume
0,2020-12-01,,
1,2020-12-02,1.649251,0.037350
2,2020-12-03,-0.171503,0.029424
3,2020-12-04,0.105389,0.024728
4,2020-12-07,-0.369020,0.020544
...,...,...,...
756,2023-12-04,-1.964202,0.031431
757,2023-12-05,1.330549,0.036670
758,2023-12-06,-0.740515,0.027385
759,2023-12-07,5.314558,0.023576


In [69]:
for i in range(1,6):
    df_filter["Lag " + str(i)] = df_filter["Today"].shift(i)

df_filter

Unnamed: 0,index,Today,Volume,Lag 1,Lag 2,Lag 3,Lag 4,Lag 5
0,2020-12-01,,,,,,,
1,2020-12-02,1.649251,0.037350,,,,,
2,2020-12-03,-0.171503,0.029424,1.649251,,,,
3,2020-12-04,0.105389,0.024728,-0.171503,1.649251,,,
4,2020-12-07,-0.369020,0.020544,0.105389,-0.171503,1.649251,,
...,...,...,...,...,...,...,...,...
756,2023-12-04,-1.964202,0.031431,-0.505545,-1.822362,-1.610781,0.579131,-0.204842
757,2023-12-05,1.330549,0.036670,-1.964202,-0.505545,-1.822362,-1.610781,0.579131
758,2023-12-06,-0.740515,0.027385,1.330549,-1.964202,-0.505545,-1.822362,-1.610781
759,2023-12-07,5.314558,0.023576,-0.740515,1.330549,-1.964202,-0.505545,-1.822362


In [70]:
df_filter = df_filter.dropna()
df_filter

Unnamed: 0,index,Today,Volume,Lag 1,Lag 2,Lag 3,Lag 4,Lag 5
6,2020-12-09,-1.847814,0.019936,-0.313703,-0.369020,0.105389,-0.171503,1.649251
7,2020-12-10,-0.574284,0.031728,-1.847814,-0.313703,-0.369020,0.105389,-0.171503
8,2020-12-11,0.404490,0.028688,-0.574284,-1.847814,-0.313703,-0.369020,0.105389
9,2020-12-14,-1.270001,0.018628,0.404490,-0.574284,-1.847814,-0.313703,-0.369020
10,2020-12-15,0.503352,0.033050,-1.270001,0.404490,-0.574284,-1.847814,-0.313703
...,...,...,...,...,...,...,...,...
756,2023-12-04,-1.964202,0.031431,-0.505545,-1.822362,-1.610781,0.579131,-0.204842
757,2023-12-05,1.330549,0.036670,-1.964202,-0.505545,-1.822362,-1.610781,0.579131
758,2023-12-06,-0.740515,0.027385,1.330549,-1.964202,-0.505545,-1.822362,-1.610781
759,2023-12-07,5.314558,0.023576,-0.740515,1.330549,-1.964202,-0.505545,-1.822362


In [71]:
df_filter["Direction"] = [1 if i > 0 else 0 for i in df_filter["Today"]]
df_filter

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filter["Direction"] = [1 if i > 0 else 0 for i in df_filter["Today"]]


Unnamed: 0,index,Today,Volume,Lag 1,Lag 2,Lag 3,Lag 4,Lag 5,Direction
6,2020-12-09,-1.847814,0.019936,-0.313703,-0.369020,0.105389,-0.171503,1.649251,0
7,2020-12-10,-0.574284,0.031728,-1.847814,-0.313703,-0.369020,0.105389,-0.171503,0
8,2020-12-11,0.404490,0.028688,-0.574284,-1.847814,-0.313703,-0.369020,0.105389,1
9,2020-12-14,-1.270001,0.018628,0.404490,-0.574284,-1.847814,-0.313703,-0.369020,0
10,2020-12-15,0.503352,0.033050,-1.270001,0.404490,-0.574284,-1.847814,-0.313703,1
...,...,...,...,...,...,...,...,...,...
756,2023-12-04,-1.964202,0.031431,-0.505545,-1.822362,-1.610781,0.579131,-0.204842,0
757,2023-12-05,1.330549,0.036670,-1.964202,-0.505545,-1.822362,-1.610781,0.579131,1
758,2023-12-06,-0.740515,0.027385,1.330549,-1.964202,-0.505545,-1.822362,-1.610781,0
759,2023-12-07,5.314558,0.023576,-0.740515,1.330549,-1.964202,-0.505545,-1.822362,1


In [72]:
X = df_filter[["Lag 1", "Lag 2", "Lag 3", "Lag 4", "Lag 5", "Volume"]]
y = df_filter[["Direction"]]

X_train = X[0:550]
X_test = X[550:]

y_train = y[0:550]
y_test = y[550:]

clf = LogisticRegression().fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [73]:
clf.score(X_test, y_test)

0.5317073170731708

In [74]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df_filter['Direction']

# Separate the X variable, the features
X = df_filter.drop(columns=['Direction', 'index', 'Today'])

In [75]:
# Review the y variable Series
y.head()

6     0
7     0
8     1
9     0
10    1
Name: Direction, dtype: int64

In [76]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,Volume,Lag 1,Lag 2,Lag 3,Lag 4,Lag 5
6,0.019936,-0.313703,-0.36902,0.105389,-0.171503,1.649251
7,0.031728,-1.847814,-0.313703,-0.36902,0.105389,-0.171503
8,0.028688,-0.574284,-1.847814,-0.313703,-0.36902,0.105389
9,0.018628,0.40449,-0.574284,-1.847814,-0.313703,-0.36902
10,0.03305,-1.270001,0.40449,-0.574284,-1.847814,-0.313703


In [77]:
# Check the balance of our target values
y.value_counts()

1    392
0    363
Name: Direction, dtype: int64

In [78]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [79]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
lr_model = LogisticRegression(random_state=1)


# Fit the model using training data
lr_model.fit(X_train, y_train)

In [80]:
# Make a prediction using the testing data
testing_predictions = lr_model.predict(X_test)

In [81]:
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, testing_predictions)

0.5161290322580645

In [82]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, testing_predictions)

array([[34, 59],
       [32, 64]])

In [101]:
# Print the classification report for the model
target_names = ["Down", "Up"]
testing_report = classification_report(y_test, testing_predictions, target_names=target_names)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

        Down       0.52      0.37      0.43        93
          Up       0.52      0.67      0.58        96

    accuracy                           0.52       189
   macro avg       0.52      0.52      0.51       189
weighted avg       0.52      0.52      0.51       189

