In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'vanguard-500-index-fund-voo:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3970846%2F6914911%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240513%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240513T022123Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D72affd18962b315096d24d6a9eeb0c337feeb3b75ab2ee3b6dddb8ec16cd2fbccc4680ca2a4eab889907a85416a4a28f5d0f26ddbfa79b438b0fc96b83ba16deb09fbaea6d83530fa5c2d8b7bfc8e15493b7038531efc0807ef317dc5de1f2b9fff96e11a824962d8ca460f284bc24a33c04871a23bf3b14219d58a27e40bce5533355f6baecfa6e9dc2dc94cd3bcc91f9f80aef571d536d3361a1d28c56e429b70ea8c658f64819df4b907275dcff50325142b42eec50261377b391a5f2e43634b96bcbdcf2762e59b861400c350f7801988f573d72b263f96832481082915006a766119e7f847a06b37cd2a7b1a902fbccbdc3d51bf13de015d49f38d3cd72'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded signed URL>" entry and unpack the
# archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray colon in the URL part cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing; in that case the byte
            # counter is still shown but the progress bar stays empty.
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Do not bind the archive to the name `tarfile`: that would
                # shadow the module and break the next tar download.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/vanguard-500-index-fund-voo/VOO Stock Data.csv


# Install the required packages

In [None]:
!pip install numpy pandas matplotlib scikit-learn tensorflow

  pid, fd = os.forkpty()




# Read the data

In [None]:
import pandas as pd
# Load the VOO price history and print a first overview: the first ten rows,
# summary statistics and the column dtypes.
# (A bare `df.head(10)` in the middle of a cell displays nothing, so only the
# explicit prints are kept.)
df = pd.read_csv('/kaggle/input/vanguard-500-index-fund-voo/VOO Stock Data.csv')
print(df.head(10))
print(df.describe())
print(df.dtypes)

       Date    Open      High      Low   Close     Volume
0  11/07/23  400.26  402.0400  399.230  401.34  3352630.0
1  11/06/23  400.12  400.7330  398.460  400.21  4007105.0
2  11/03/23  397.92  400.8500  397.840  399.44  5981435.0
3  11/02/23  391.93  395.9096  391.920  395.78  4752164.0
4  11/01/23  385.09  389.0800  384.620  388.40  5549159.0
5  10/31/23  382.35  384.4800  380.560  384.17  4523160.0
6  10/30/23  379.91  382.8000  378.710  381.86  5795761.0
7  10/27/23  380.53  380.8900  375.945  377.32  5638752.0
8  10/26/23  382.63  383.3971  378.150  379.00  6959401.0
9  10/25/23  387.60  387.6800  383.120  383.71  5471214.0
              Open         High          Low        Close        Volume
count  3314.000000  3314.000000  3314.000000  3314.000000  3.314000e+03
mean    237.215078   238.458956   235.841214   237.242470  2.546310e+06
std      94.659605    95.271351    94.002811    94.668877  2.360743e+06
min      99.140000   101.860000    98.240000   100.340000  8.638000e+03
25

In [None]:
# Remove leading/trailing whitespace from every column label.
df.columns = df.columns.map(str.strip)

In [None]:
# The raw dates are month/day/two-digit-year strings (e.g. "11/07/23").
# Passing the format explicitly avoids pandas' per-element fallback parsing
# (and the warning it emits) and makes the interpretation unambiguous.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y')
# The file is ordered newest-first; put it in chronological order.
df = df.sort_values('Date')

  df['Date'] = pd.to_datetime(df['Date'])


In [None]:
# Verify the column types after the date conversion.
df.dtypes

Date      datetime64[ns]
Open             float64
High             float64
Low              float64
Close            float64
Volume           float64
dtype: object

In [None]:
# Count the missing values in each column.
print(df.isnull().sum())

Date      0
Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64


In [None]:
# Drop any row that contains a missing value (modifies df in place).
df.dropna(inplace=True)

In [None]:
# Numeric time axis for the models: whole days elapsed since the earliest date
# in the data set.
df['Time'] = df['Date'].sub(df['Date'].min()).dt.days

In [None]:
# Preview the first ten rows, now including the derived 'Time' column.
df.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Time
3313,2010-09-09,102.5,102.5,101.14,101.32,26513.0,0
3312,2010-09-10,101.68,101.86,101.296,101.78,8638.0,1
3311,2010-09-13,102.96,103.14,102.5,103.06,33752.5,4
3310,2010-09-14,102.84,103.48,102.38,103.038,59420.0,5
3309,2010-09-15,102.62,103.38,102.4,103.3,9283.0,6
3308,2010-09-16,103.02,103.32,102.7,103.26,59580.5,7
3307,2010-09-17,103.88,103.88,103.02,103.36,49365.0,8
3306,2010-09-20,103.74,105.04,103.46,105.04,19006.0,11
3305,2010-09-21,105.02,105.44,104.28,104.7198,19286.5,12
3304,2010-09-22,104.56,105.04,103.96,104.16,18148.0,13


In [None]:
# Preview the last ten rows. (tail(-10) would instead return every row except
# the first ten.)
df.tail(10)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Time
3303,2010-09-23,103.320,104.3400,103.120,103.3062,33358.0,14
3302,2010-09-24,103.940,104.9000,103.868,104.8400,24126.5,15
3301,2010-09-27,104.980,104.9800,104.340,104.3400,15044.5,18
3300,2010-09-28,104.318,105.0200,103.420,105.0200,49853.0,19
3299,2010-09-29,104.580,104.9100,104.220,104.5200,15890.0,20
...,...,...,...,...,...,...,...
4,2023-11-01,385.090,389.0800,384.620,388.4000,5549159.0,4801
3,2023-11-02,391.930,395.9096,391.920,395.7800,4752164.0,4802
2,2023-11-03,397.920,400.8500,397.840,399.4400,5981435.0,4803
1,2023-11-06,400.120,400.7330,398.460,400.2100,4007105.0,4806


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


# Feature: days since the first observation; target: closing price.
# Both are reshaped to column vectors because scikit-learn expects 2-D input.
X = np.array(df['Time']).reshape(-1, 1)
y = np.array(df['Close']).reshape(-1, 1)

print("Minimum value in y:", np.min(y))
print("Maximum value in y:", np.max(y))

print(X)
print(y)

# Split BEFORE fitting the scalers so that the min/max of the held-out rows
# cannot leak into the preprocessing. The row partition depends only on the
# number of samples and random_state, so it matches a split of scaled arrays.
# NOTE(review): this is a shuffled split of a time series, so the test score
# measures interpolation between known days, not forecasting ability.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize the features
scaler_x = MinMaxScaler(feature_range=(0, 1))
print(scaler_x)
scaler_y = MinMaxScaler(feature_range=(0, 1))
print(scaler_y)

# Fit on the training rows only, then apply the same mapping everywhere.
X_train = scaler_x.fit_transform(X_train_raw)
X_test = scaler_x.transform(X_test_raw)
y_train = scaler_y.fit_transform(y_train_raw)
y_test = scaler_y.transform(y_test_raw)

# Scaled copies of the full series, kept for inspection. Values may fall
# slightly outside [0, 1] where the test rows exceed the training range.
X_scaled = scaler_x.transform(X)
print(X_scaled)
y_scaled = scaler_y.transform(y)
print(y_scaled)
print(y_scaled[0])
print(y_scaled[len(y_scaled)-1])

print("Scaled minimum:", np.min(y_scaled))
print("Scaled maximum:", np.max(y_scaled))

print(X_train)


Minimum value in y: 100.34
Maximum value in y: 439.25
[[   0]
 [   1]
 [   4]
 ...
 [4803]
 [4806]
 [4807]]
[[101.32]
 [101.78]
 [103.06]
 ...
 [399.44]
 [400.21]
 [401.34]]
MinMaxScaler()
MinMaxScaler()
[[0.00000000e+00]
 [2.08029956e-04]
 [8.32119825e-04]
 ...
 [9.99167880e-01]
 [9.99791970e-01]
 [1.00000000e+00]]
[[0.00289162]
 [0.00424892]
 [0.00802573]
 ...
 [0.88253519]
 [0.88480718]
 [0.88814139]]
[0.00289162]
[0.88814139]
Scaled minimum: 0.0
Scaled maximum: 1.0000000000000002
[[0.44060745]
 [0.16101519]
 [0.51716247]
 ...
 [0.3902642 ]
 [0.26003745]
 [0.95818598]]


#

# Multilayer Perceptron (MLP) Simple Dense Predictor

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Build the model
# Fully connected regressor: one input (scaled time), hidden layers of 50 and
# 20 ReLU units, and a single linear output (scaled closing price).
model = Sequential()
model.add(Dense(50, input_dim=1, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(1, activation='linear'))

# Train with the Adam optimizer on mean squared error.
model.compile(optimizer='adam', loss='mean_squared_error')


In [None]:
# Train for 50 epochs in mini-batches of 10; the last 20% of the training
# arrays is held out by Keras for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=10,
    verbose=1,
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
from sklearn.metrics import mean_squared_error

# Make predictions
# Predict on the held-out rows (still in scaled units).
y_pred_scaled = model.predict(X_test)

# Map predictions and targets back to prices so the error is in price units.
y_pred, y_actual = (
    scaler_y.inverse_transform(scaled) for scaled in (y_pred_scaled, y_test)
)

# Root mean squared error on the original price scale.
rmse = np.sqrt(mean_squared_error(y_actual, y_pred))
print("RMSE: ", rmse)

RMSE:  14.248296640758788


In [None]:
print(X.max())
# Ten query points spaced 365 days apart, starting 365 days after the last
# observed day.
future_times = np.array([X.max() + 365 * step for step in range(1, 11)]).reshape(-1, 1)

# Scale the inputs, predict, and convert the outputs back to prices.
future_times_scaled = scaler_x.transform(future_times)
future_predictions_scaled = model.predict(future_times_scaled)
future_predictions = scaler_y.inverse_transform(future_predictions_scaled)

print("Future Predictions for the next 10 years:")
print(future_predictions)

4807
Future Predictions for the next 10 years:
[[359.04218]
 [349.22757]
 [339.41296]
 [329.59842]
 [319.7838 ]
 [309.96924]
 [299.4777 ]
 [289.0683 ]
 [278.6192 ]
 [268.17007]]


# MLP with Leaky ReLU activations

In [None]:
from tensorflow.keras.layers import LeakyReLU, Dense
from tensorflow.keras.models import Sequential

# Build the model
# Dense 50-20-1 network in which each hidden Dense layer has no built-in
# activation and is followed by a LeakyReLU layer (slope 0.01 for negative
# inputs).
model = Sequential()
model.add(Dense(50, input_dim=1))
model.add(LeakyReLU(alpha=0.01))
model.add(Dense(20))
model.add(LeakyReLU(alpha=0.01))
model.add(Dense(1, activation='linear'))

model.compile(loss='mean_squared_error', optimizer='adam')
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=10,
    verbose=1,
)
from sklearn.metrics import mean_squared_error

# Predict on the held-out rows and convert both predictions and targets back
# to price units.
y_pred_scaled = model.predict(X_test)
y_pred, y_actual = (
    scaler_y.inverse_transform(scaled) for scaled in (y_pred_scaled, y_test)
)

# Root mean squared error on the original price scale.
rmse = np.sqrt(mean_squared_error(y_actual, y_pred))
print("RMSE: ", rmse)
print(X.max())

# Ten query points spaced 365 days apart after the last observed day.
future_times = np.array([X.max() + 365 * step for step in range(1, 11)]).reshape(-1, 1)
future_times_scaled = scaler_x.transform(future_times)
future_predictions_scaled = model.predict(future_times_scaled)
future_predictions = scaler_y.inverse_transform(future_predictions_scaled)

print("Future Predictions for the next 10 years:")
print(future_predictions)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
RMSE:  15.720786758934489
4807
Future Predictions for the next 10 years:
[[400.10913]
 [404.79523]
 [409.64243]
 [414.12088]
 [417.55313]
 [418.7763 ]
 [419.99957]
 [421.22275]
 [422.44595]
 [423.6692 ]]


# LSTM recurrent neural network training

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error
import numpy as np
# Two stacked LSTM layers (50 units each) followed by a single-unit Dense
# output. The first LSTM returns the full sequence so the second can consume it.
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(1, 1)),
    LSTM(50),
    Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error')

# Keras LSTMs expect (samples, timesteps, features); each sample becomes a
# one-step sequence holding the single scaled time feature.
# NOTE(review): with one timestep per sample the recurrent layers never see
# any temporal context - confirm this is intended.
X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Keep the History object (as the other training cells do) instead of letting
# its repr become the cell output.
history = model.fit(X_train_lstm, y_train, epochs=50, batch_size=10, verbose=1, validation_split=0.2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7cc20cfd7f10>

# Evaluation of the model

In [None]:
# Run the LSTM on the held-out, 3-D reshaped test inputs.
y_pred_scaled = model.predict(X_test_lstm)

# Bring predictions and targets back to price units; the targets are passed
# as a column vector, which inverse_transform requires.
y_actual = scaler_y.inverse_transform(y_test.reshape(-1, 1))
y_pred = scaler_y.inverse_transform(y_pred_scaled)

# Root mean squared error on the original price scale.
squared_error = mean_squared_error(y_actual, y_pred)
rmse = np.sqrt(squared_error)
print("RMSE on Test Data: ", rmse)

RMSE on Test Data:  21.727497487365362


# Prediction of the model

In [None]:
# Scale the future query days defined earlier in the notebook, then insert a
# length-1 timestep axis so the array is (samples, 1, features) for the LSTM.
future_times_scaled = scaler_x.transform(future_times.reshape(-1, 1))
future_times_scaled_lstm = np.expand_dims(future_times_scaled, axis=1)

# Predict and convert the outputs back to price units.
future_predictions_scaled = model.predict(future_times_scaled_lstm)
future_predictions = scaler_y.inverse_transform(future_predictions_scaled)

print("Future Predictions for the next 10 years:")
print(future_predictions)


Future Predictions for the next 10 years:
[[449.26926]
 [487.81937]
 [527.89124]
 [569.3219 ]
 [611.932  ]
 [655.5306 ]
 [699.9181 ]
 [744.8916 ]
 [790.2486 ]
 [835.7903 ]]


In [None]:
import pandas as pd

df['Date'] = pd.to_datetime(df['Date'])  # Convert Date column to datetime if not already

# 1 January of the year BEFORE the latest observation; one year is added back
# below, so the labels start at the calendar year of the latest observation.
max_date = df['Date'].max()
start_date = pd.Timestamp(year=max_date.year - 1, month=1, day=1)

# One year-end date per prediction. An explicit YearEnd offset replaces the
# deprecated 'Y' alias, and the count follows the number of predictions so the
# two columns always have equal length.
# NOTE(review): the forecasts were generated for 365, 730, ... days AFTER the
# last observation, so the first row arguably belongs to max_date.year + 1.
# The labels are left unchanged here because the CAGR cell assumes this start
# year - confirm and adjust both together.
years = pd.date_range(
    start=start_date + pd.DateOffset(years=1),
    periods=len(future_predictions),
    freq=pd.offsets.YearEnd(),
)
future_df = pd.DataFrame({
    'Year': years.year,
    'Predicted Close': [x[0] for x in future_predictions]
})

# Display the DataFrame
print(future_df)


   Year  Predicted Close
0  2023       449.269257
1  2024       487.819366
2  2025       527.891235
3  2026       569.321899
4  2027       611.932007
5  2028       655.530579
6  2029       699.918091
7  2030       744.891602
8  2031       790.248596
9  2032       835.790283


  years = pd.date_range(start=start_date + pd.DateOffset(years=1), periods=10, freq='Y')


In [None]:
# Year-over-year change of the predicted close, in percent (the first row has
# no predecessor and is NaN).
future_df['Percentage Increase'] = future_df['Predicted Close'].pct_change().mul(100)

In [None]:
# Compound annual growth rate of each predicted close relative to the first
# forecast row. The reference year and price are read from the table itself
# rather than hard-coded, so the result stays correct if the labels change.
start_year = future_df.loc[0, 'Year']
start_price = future_df.loc[0, 'Predicted Close']

years_elapsed = future_df['Year'] - start_year
growth_ratio = future_df['Predicted Close'] / start_price
cagr_percent = (growth_ratio ** (1 / years_elapsed) - 1) * 100

# No growth rate is defined for the reference year itself (zero years
# elapsed), so that row is masked to NaN.
future_df['CAGR from Start (%)'] = cagr_percent.where(years_elapsed > 0)

In [None]:
print(future_df)

   Year  Predicted Close  Percentage Increase  CAGR from Start (%)
0  2023       449.269257                  NaN                  NaN
1  2024       487.819366             8.580626             8.580625
2  2025       527.891235             8.214485             8.397404
3  2026       569.321899             7.848334             8.214071
4  2027       611.932007             7.484365             8.031178
5  2028       655.530579             7.124746             7.849281
6  2029       699.918091             6.771231             7.668853
7  2030       744.891602             6.425536             7.490351
8  2031       790.248596             6.089067             7.314184
9  2032       835.790283             5.762959             7.140709
