In [2]:
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np 
import seaborn as sns
import os
import glob 

In [3]:
%matplotlib inline

In [4]:
# Stock to be used 
# MSFT - Microsoft 
# AAPL - Apple
# AMZN - Amazon

# Get CSV file 
files = os.listdir('FinancialDatasets/stocks')
print("Files: ", files)

print("\n")
# Stock Data for Apple 
AAPL_File_Path = os.path.join('FinancialDatasets/stocks', 'AAPL.csv')
AAPLStock = pd.read_csv(AAPL_File_Path, header=0, index_col=0, parse_dates=True)

#Clean data and remove old data pror to 2022 
AAPLStock.index = pd.to_datetime(AAPLStock.index, utc=True)
AAPLStock = AAPLStock.drop(AAPLStock[AAPLStock.index < '2022'].index)

print("Apple Stock Data :\n", AAPLStock)
print("\n")

# Stock Data for Amazon 
AMZN_File_Path = os.path.join('FinancialDatasets/stocks', 'AMZN.csv')
AMZNStock = pd.read_csv(AMZN_File_Path, header=0, index_col=0, parse_dates=True)

#Clean data and remove old data pror to 2022 
AMZNStock.index = pd.to_datetime(AMZNStock.index, utc=True)
AMZNStock = AMZNStock.drop(AMZNStock[AMZNStock.index < '2022'].index)
print("Amazon Stock Data :\n", AMZNStock)


# Stock Data for Microsoft
MSFT_File_Path = os.path.join('FinancialDatasets/stocks', 'MSFT.csv')
MSFTStock = pd.read_csv(MSFT_File_Path, header = 0, index_col=0, parse_dates=True)

# Now we will convert the date column and drop all dates prior to 2022
MSFTStock.index = pd.to_datetime(MSFTStock.index, utc=True)
MSFTStock = MSFTStock.drop(MSFTStock[MSFTStock.index < '2022'].index)

print("Microsoft Data: ", MSFTStock)

# Using 3 csv files containing stock market data of Apple, Microsoft and Amazon
# Each dataset will be used to predict future prices using the AutoRegressive Integreated Moving Average Model.

Files:  ['AAPL.csv', 'AMZN.csv', 'archive', 'MSFT.csv']


Apple Stock Data :
                                  Open        High         Low       Close  \
Date                                                                        
2022-01-03 05:00:00+00:00  176.052761  181.052294  175.933965  180.190979   
2022-01-04 05:00:00+00:00  180.804778  181.111677  177.329848  177.904053   
2022-01-05 05:00:00+00:00  177.814944  178.369344  172.894614  173.171814   
2022-01-06 05:00:00+00:00  170.973992  173.548013  169.924588  170.280991   
2022-01-07 05:00:00+00:00  171.162116  172.399623  169.320704  170.449310   
...                               ...         ...         ...         ...   
2023-09-15 04:00:00+00:00  176.479996  176.500000  173.820007  175.009995   
2023-09-18 04:00:00+00:00  176.479996  179.380005  176.169998  177.970001   
2023-09-19 04:00:00+00:00  177.520004  179.630005  177.130005  179.070007   
2023-09-20 04:00:00+00:00  179.259995  179.699997  175.399994  175.490005  

In [5]:
AMZNStock.info()
print("\n")
AAPLStock.info()
print("\n")
MSFTStock.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 432 entries, 2022-01-03 05:00:00+00:00 to 2023-09-21 04:00:00+00:00
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Open          432 non-null    float64
 1   High          432 non-null    float64
 2   Low           432 non-null    float64
 3   Close         432 non-null    float64
 4   Volume        432 non-null    int64  
 5   Dividends     432 non-null    float64
 6   Stock Splits  432 non-null    float64
dtypes: float64(6), int64(1)
memory usage: 27.0 KB


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 432 entries, 2022-01-03 05:00:00+00:00 to 2023-09-21 04:00:00+00:00
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Open          432 non-null    float64
 1   High          432 non-null    float64
 2   Low           432 non-null    float64
 3   Close         432 non-null    floa

In [6]:
# Create our predictable variables 
AAPLStock['Open-Close'] = AAPLStock.Open - AAPLStock.Close
AAPLStock['High-Low'] = AAPLStock.High - AAPLStock.Low

#Store our predictor variables 
x = AAPLStock[['Open-Close', 'High-Low']]
print(x.head)

<bound method NDFrame.head of                            Open-Close  High-Low
Date                                           
2022-01-03 05:00:00+00:00   -4.138218  5.118329
2022-01-04 05:00:00+00:00    2.900725  3.781830
2022-01-05 05:00:00+00:00    4.643130  5.474731
2022-01-06 05:00:00+00:00    0.693001  3.623425
2022-01-07 05:00:00+00:00    0.712805  3.078919
...                               ...       ...
2023-09-15 04:00:00+00:00    1.470001  2.679993
2023-09-18 04:00:00+00:00   -1.490005  3.210007
2023-09-19 04:00:00+00:00   -1.550003  2.500000
2023-09-20 04:00:00+00:00    3.769989  4.300003
2023-09-21 04:00:00+00:00   -0.565002  1.801300

[432 rows x 2 columns]>


In [7]:
# Target variables: Outcome of the machine learning model 
# Will predict based on the explanatory variables. 

# 1 - This will tell us to buy 
# 0 - tells us where to sell or not buy
y = np.where(AAPLStock['Close'].shift(-1) > AAPLStock['Close'], 1, 0)
print(y)

[0 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 1
 1 0 1 0 0 0 0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 0 1 0 0 1 1 0 0 1 0
 0 0 1 0 0 1 0 1 1 1 0 1 0 1 0 0 1 0 1 0 0 1 1 0 1 1 1 0 0 1 0 1 1 0 0 0 0
 1 1 0 1 1 0 1 1 0 0 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 0 0 1 1 1 0 0 1 0 0
 0 1 1 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0 0 0 1 1 0
 0 0 1 1 1 0 0 1 0 0 1 0 1 1 1 0 1 1 1 0 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 1
 0 1 1 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1 1 1 0 1
 1 0 1 1 1 1 0 1 1 0 1 1 1 1 0 1 0 0 1 1 0 1 0 0 0 1 1 0 1 0 0 1 1 1 0 1 0
 0 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 1 0 0 1 0 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0 0
 0 0 1 0 0 1 1 0 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 1 1 0 1 1 0 1 0 1 0 0
 1 1 1 1 0 0 1 0 0 0 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 1 1 0 0 0
 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 0 0 1 0 1 1 0 0 0]


In [8]:
# When training our Machine Learning models we need to 
# split our dataset into training and testing sets. 
# This will be split 80/20 (80% Training, 20% Testing)
split_percent = 0.8
split = int(split_percent*len(AAPLStock))

# Train Dataset 
x_train = x[:split]
y_train = y[:split]

# Get our Test dataset 
x_test = x[split:]
y_test = y[split:]