# Step 1: Importing the libraries

In [69]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, confusion_matrix
#from xgboost import XGBClassifier
from matplotlib import dates
from datetime import datetime

import warnings
# Install a blanket "ignore" filter: warnings of every category are silenced from here on.
# NOTE(review): this also hides library deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [70]:
# Read the combined price history (parsing 'Date' as datetimes) and order the rows oldest-first
data = pd.read_csv(
    '15 Years Stock Data of NVDA AAPL MSFT GOOGL and AMZN.csv',
    parse_dates=['Date'],
).sort_values(by='Date', ascending=True)
#data.set_index('Date', inplace=True)
#data['id'] = data.index
data

Unnamed: 0,Date,Close_AAPL,Close_AMZN,Close_GOOGL,Close_MSFT,Close_NVDA,High_AAPL,High_AMZN,High_GOOGL,High_MSFT,...,Open_AAPL,Open_AMZN,Open_GOOGL,Open_MSFT,Open_NVDA,Volume_AAPL,Volume_AMZN,Volume_GOOGL,Volume_MSFT,Volume_NVDA
0,2010-01-04,6.440330,6.695000,15.609805,23.254051,0.423884,6.455075,6.830500,15.678546,23.366752,...,6.422875,6.812500,15.614786,23.006108,0.424342,493729600,151998000,78169752,38409100,800204000
1,2010-01-05,6.451465,6.734500,15.541064,23.261557,0.430073,6.487878,6.774000,15.636953,23.366746,...,6.458086,6.671500,15.620515,23.178910,0.422279,601904800,177038000,120067812,49749600,728648000
2,2010-01-06,6.348847,6.612500,15.149294,23.118809,0.432824,6.477046,6.736500,15.587638,23.351725,...,6.451466,6.730000,15.587638,23.201455,0.429844,552160000,143576000,158988852,58182400,649168000
3,2010-01-07,6.337109,6.500000,14.796624,22.878376,0.424342,6.379842,6.616000,15.192630,23.066212,...,6.372319,6.600500,15.177685,23.013616,0.430532,477131200,220604000,256315428,50559700,547792000
4,2010-01-08,6.379242,6.676000,14.993881,23.036165,0.425259,6.379844,6.684000,15.024515,23.201460,...,6.328685,6.528000,14.744323,22.750656,0.420903,447610800,196610000,188783028,51197400,478168000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3769,2024-12-24,257.916443,229.050003,195.884399,438.450836,140.207108,257.926411,229.139999,195.884399,438.720315,...,255.209412,226.940002,194.615856,433.780209,139.987127,23234700,15007500,10403300,7164500,105157000
3770,2024-12-26,258.735504,227.050003,195.375000,437.233276,139.917130,259.814335,228.500000,196.523671,440.057630,...,257.906429,228.500000,194.925505,438.201337,139.687155,27237100,16146700,12046600,8194200,116205600
3771,2024-12-27,255.309296,223.750000,192.538254,429.668457,136.997391,258.415896,226.029999,195.095322,434.349074,...,257.546826,225.600006,194.725737,433.730320,138.537258,42355300,27367100,18891400,18117700,170582600
3772,2024-12-30,251.923019,221.300003,191.020004,423.979858,137.477356,253.221595,223.000000,192.328495,426.694417,...,251.952985,220.059998,189.581658,425.207408,134.817597,35557500,28321200,14264700,13158700,167734700


In [71]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3774 entries, 0 to 3773
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          3774 non-null   datetime64[ns]
 1   Close_AAPL    3774 non-null   float64       
 2   Close_AMZN    3774 non-null   float64       
 3   Close_GOOGL   3774 non-null   float64       
 4   Close_MSFT    3774 non-null   float64       
 5   Close_NVDA    3774 non-null   float64       
 6   High_AAPL     3774 non-null   float64       
 7   High_AMZN     3774 non-null   float64       
 8   High_GOOGL    3774 non-null   float64       
 9   High_MSFT     3774 non-null   float64       
 10  High_NVDA     3774 non-null   float64       
 11  Low_AAPL      3774 non-null   float64       
 12  Low_AMZN      3774 non-null   float64       
 13  Low_GOOGL     3774 non-null   float64       
 14  Low_MSFT      3774 non-null   float64       
 15  Low_NVDA      3774 non-null   float64 

In [72]:
# Count the missing values in each column
data.isna().sum()

Date            0
Close_AAPL      0
Close_AMZN      0
Close_GOOGL     0
Close_MSFT      0
Close_NVDA      0
High_AAPL       0
High_AMZN       0
High_GOOGL      0
High_MSFT       0
High_NVDA       0
Low_AAPL        0
Low_AMZN        0
Low_GOOGL       0
Low_MSFT        0
Low_NVDA        0
Open_AAPL       0
Open_AMZN       0
Open_GOOGL      0
Open_MSFT       0
Open_NVDA       0
Volume_AAPL     0
Volume_AMZN     0
Volume_GOOGL    0
Volume_MSFT     0
Volume_NVDA     0
dtype: int64

# Step 3: Feature Engineering
Add technical indicators as new features, which should make the model more accurate and the results more insightful.
The technical indicators are:
* The daily return (relative change in the closing price from the previous trading day)
* The average closing price over the last n days (Simple Moving Average, SMA)
* The exponentially weighted average price (Exponential Moving Average, EMA), which gives more weight to recent prices and so reacts faster than the SMA
* The volatility, which is the rolling standard deviation of the daily return over the last n days
* The momentum, which is the difference between today's price and the price from n days ago
* The target, a binary label added for classification, which tells the model what to learn


In [73]:
# Daily return per ticker: relative change of the close versus the previous row
for ticker in ('AAPL', 'AMZN', 'GOOGL', 'MSFT', 'NVDA'):
    data[f'Return_{ticker}'] = data[f'Close_{ticker}'].pct_change()
data

Unnamed: 0,Date,Close_AAPL,Close_AMZN,Close_GOOGL,Close_MSFT,Close_NVDA,High_AAPL,High_AMZN,High_GOOGL,High_MSFT,...,Volume_AAPL,Volume_AMZN,Volume_GOOGL,Volume_MSFT,Volume_NVDA,Return_AAPL,Return_AMZN,Return_GOOGL,Return_MSFT,Return_NVDA
0,2010-01-04,6.440330,6.695000,15.609805,23.254051,0.423884,6.455075,6.830500,15.678546,23.366752,...,493729600,151998000,78169752,38409100,800204000,,,,,
1,2010-01-05,6.451465,6.734500,15.541064,23.261557,0.430073,6.487878,6.774000,15.636953,23.366746,...,601904800,177038000,120067812,49749600,728648000,0.001729,0.005900,-0.004404,0.000323,0.014603
2,2010-01-06,6.348847,6.612500,15.149294,23.118809,0.432824,6.477046,6.736500,15.587638,23.351725,...,552160000,143576000,158988852,58182400,649168000,-0.015906,-0.018116,-0.025209,-0.006137,0.006396
3,2010-01-07,6.337109,6.500000,14.796624,22.878376,0.424342,6.379842,6.616000,15.192630,23.066212,...,477131200,220604000,256315428,50559700,547792000,-0.001849,-0.017013,-0.023280,-0.010400,-0.019597
4,2010-01-08,6.379242,6.676000,14.993881,23.036165,0.425259,6.379844,6.684000,15.024515,23.201460,...,447610800,196610000,188783028,51197400,478168000,0.006649,0.027077,0.013331,0.006897,0.002161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3769,2024-12-24,257.916443,229.050003,195.884399,438.450836,140.207108,257.926411,229.139999,195.884399,438.720315,...,23234700,15007500,10403300,7164500,105157000,0.011478,0.017729,0.007604,0.009374,0.003938
3770,2024-12-26,258.735504,227.050003,195.375000,437.233276,139.917130,259.814335,228.500000,196.523671,440.057630,...,27237100,16146700,12046600,8194200,116205600,0.003176,-0.008732,-0.002601,-0.002777,-0.002068
3771,2024-12-27,255.309296,223.750000,192.538254,429.668457,136.997391,258.415896,226.029999,195.095322,434.349074,...,42355300,27367100,18891400,18117700,170582600,-0.013242,-0.014534,-0.014519,-0.017302,-0.020868
3772,2024-12-30,251.923019,221.300003,191.020004,423.979858,137.477356,253.221595,223.000000,192.328495,426.694417,...,35557500,28321200,14264700,13158700,167734700,-0.013263,-0.010950,-0.007885,-0.013240,0.003503


In [74]:
# Simple moving averages of the close over 10-row and 50-row windows, per ticker
for ticker in ('AAPL', 'AMZN', 'GOOGL', 'MSFT', 'NVDA'):
    close = data[f'Close_{ticker}']
    for window in (10, 50):
        data[f'SMA_{window}_{ticker}'] = close.rolling(window=window).mean()

In [75]:
# Exponential moving average of the close with span=20, per ticker
for ticker in ('AAPL', 'AMZN', 'GOOGL', 'MSFT', 'NVDA'):
    data[f'EMA_20_{ticker}'] = data[f'Close_{ticker}'].ewm(span=20).mean()

# Show the rows for which the 50-row AAPL moving average is still undefined
print(data[data['SMA_50_AAPL'].isna()])

         Date  Close_AAPL  Close_AMZN  Close_GOOGL  Close_MSFT  Close_NVDA  \
0  2010-01-04    6.440330      6.6950    15.609805   23.254051    0.423884   
1  2010-01-05    6.451465      6.7345    15.541064   23.261557    0.430073   
2  2010-01-06    6.348847      6.6125    15.149294   23.118809    0.432824   
3  2010-01-07    6.337109      6.5000    14.796624   22.878376    0.424342   
4  2010-01-08    6.379242      6.6760    14.993881   23.036165    0.425259   
5  2010-01-11    6.322966      6.5155    14.971216   22.743141    0.419299   
6  2010-01-12    6.251042      6.3675    14.706467   22.592865    0.405085   
7  2010-01-13    6.339215      6.4555    14.622035   22.803244    0.410587   
8  2010-01-14    6.302503      6.3675    14.690776   23.261557    0.404168   
9  2010-01-15    6.197175      6.3570    14.445452   23.186430    0.392247   
10 2010-01-19    6.471327      6.3805    14.635235   23.366755    0.399583   
11 2010-01-20    6.371718      6.2890    14.455662   22.983562  

In [79]:
# Volatility per ticker: standard deviation of the daily return over a 10-row rolling window
for ticker in ('AAPL', 'AMZN', 'GOOGL', 'MSFT', 'NVDA'):
    returns = data[f'Return_{ticker}']
    data[f'Volatility_{ticker}'] = returns.rolling(window=10).std()

# Show the rows for which the GOOGL volatility is still undefined
print(data[data['Volatility_GOOGL'].isna()])

        Date  Close_AAPL  Close_AMZN  Close_GOOGL  Close_MSFT  Close_NVDA  \
0 2010-01-04    6.440330      6.6950    15.609805   23.254051    0.423884   
1 2010-01-05    6.451465      6.7345    15.541064   23.261557    0.430073   
2 2010-01-06    6.348847      6.6125    15.149294   23.118809    0.432824   
3 2010-01-07    6.337109      6.5000    14.796624   22.878376    0.424342   
4 2010-01-08    6.379242      6.6760    14.993881   23.036165    0.425259   
5 2010-01-11    6.322966      6.5155    14.971216   22.743141    0.419299   
6 2010-01-12    6.251042      6.3675    14.706467   22.592865    0.405085   
7 2010-01-13    6.339215      6.4555    14.622035   22.803244    0.410587   
8 2010-01-14    6.302503      6.3675    14.690776   23.261557    0.404168   
9 2010-01-15    6.197175      6.3570    14.445452   23.186430    0.392247   

   High_AAPL  High_AMZN  High_GOOGL  High_MSFT  ...  EMA_20_AAPL  EMA_20_AMZN  \
0   6.455075     6.8305   15.678546  23.366752  ...     6.440330     6.

In [None]:
# Momentum per ticker: the close minus the close from 10 rows earlier
for ticker in ('AAPL', 'AMZN', 'GOOGL', 'MSFT', 'NVDA'):
    data[f'Momentum_{ticker}'] = data[f'Close_{ticker}'].diff(periods=10)
