# Create a stock predictor to be deployed on Streamlit

In [29]:
# import

import yfinance as yf

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

from prophet import Prophet
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
import pmdarima as pm

from scipy.signal import find_peaks, periodogram

import sqlite3

  from .autonotebook import tqdm as notebook_tqdm


# Define Functions

### Function: get_data()

In [6]:
# define a function to get the data

def get_data(ticker, years=3):
  """Download daily closing prices for `ticker` on a business-day index.

  Parameters
  ----------
  ticker : str
      Symbol passed straight to `yf.download`.
  years : int, default 3
      Length of the look-back window, ending today.

  Returns
  -------
  pandas.DataFrame
      A single 'Close' column indexed by 'Date' at business-day ('B')
      frequency. Business days with no quote are forward-filled from the
      previous close.

  Raises
  ------
  ValueError
      If the download returns no rows for `ticker`.
  """
  # download the most recent `years` years of prices from yfinance
  end_date = datetime.date.today()
  start_date = end_date - relativedelta(years=years)
  data = yf.download(ticker, start=start_date, end=end_date)

  if data is None or data.empty:
    raise ValueError(f"No price data returned for ticker {ticker!r}")

  # remove the ticker level only when the columns really are a MultiIndex;
  # a flat column index would make droplevel raise
  if isinstance(data.columns, pd.MultiIndex):
    data = data.droplevel(level=1, axis=1)

  # keep just the Close price, indexed by Date
  close = data['Close'].to_frame(name='Close').rename_axis('Date')

  # manually set the frequency to business days, and forward-fill the dates
  # that asfreq introduces (fillna(method='ffill') is deprecated in pandas)
  close = close.asfreq('B')
  close['Close'] = close['Close'].ffill()

  return close

In [7]:
# smoke-test get_data: fetch the 'DAL' close prices and display the resulting frame
# (`close` is reused by the cells below)
close = get_data('DAL')
close

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed
  close['Close'] = close['Close'].fillna(method='ffill')


Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2022-04-04,38.695847
2022-04-05,38.332462
2022-04-06,36.918194
2022-04-07,36.466419
2022-04-08,36.083385
...,...
2025-03-27,46.150002
2025-03-28,43.840000
2025-03-31,43.599998
2025-04-01,42.419998


In [8]:
# inspect the frame's info to confirm the index is a DatetimeIndex with
# business-day ('B') frequency
close.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 783 entries, 2022-04-04 to 2025-04-02
Freq: B
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   783 non-null    float64
dtypes: float64(1)
memory usage: 12.2 KB


### Function: transform_log()

In [9]:
# define a function to stabilize the variance of the data with a log transform

def transform_log(close):
  """Return a copy of `close` whose 'Close' column holds natural-log prices.

  The input frame is left untouched; every other column is carried over
  unchanged. Taking the log is used here to damp non-constant variance.
  """
  logged = close.copy()
  logged['Close'] = np.log(logged['Close'])
  return logged

In [10]:
# check the transform_log function (`close_log` is reused by the cells below)
close_log = transform_log(close)

# display the log-transformed dataframe
close_log

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2022-04-04,3.655732
2022-04-05,3.646297
2022-04-06,3.608704
2022-04-07,3.596392
2022-04-08,3.585833
...,...
2025-03-27,3.831897
2025-03-28,3.780547
2025-03-31,3.775057
2025-04-01,3.747620


### Function: prophet_search()

In [13]:
# define a function to search for the best prophet hyperparameters


In [31]:
# create a function to try different hyperparameters in the prophet model and then loop over it

def prophet_search(close_log, changepoint_prior_scale, seasonality_prior_scale, test_size=30):
  """Fit one Prophet model and score it on a chronological hold-out.

  Parameters
  ----------
  close_log : pandas.DataFrame
      Frame with a date index and a 'Close' column of log prices.
  changepoint_prior_scale : float
      Forwarded to `Prophet(changepoint_prior_scale=...)`.
  seasonality_prior_scale : float
      Forwarded to `Prophet(seasonality_prior_scale=...)`.
  test_size : int, default 30
      Number of trailing observations held out for testing.

  Returns
  -------
  tuple of (float, float)
      `(train_RMSE, test_RMSE)`, both computed on the original price scale
      (the log transform is undone with `np.exp` before fitting and scoring).

  Raises
  ------
  ValueError
      If the forecast does not contain a prediction for every hold-out date.
  """
  # chronological split: shuffle=False keeps the last `test_size` rows as the test set
  train, test = train_test_split(close_log['Close'].dropna(), test_size=test_size, shuffle=False)

  # Prophet is fit on the price scale and expects columns named ds / y
  prophet_train = np.exp(train).reset_index()
  prophet_train.columns = ['ds', 'y']

  # create and fit the model
  m = Prophet(changepoint_prior_scale=changepoint_prior_scale, seasonality_prior_scale=seasonality_prior_scale)
  m.fit(prophet_train)

  # forecast over the training dates plus len(test) further business days
  future = m.make_future_dataframe(periods=len(test), freq='B')
  forecast = m.predict(future)

  # in-sample predictions, aligned with the actual prices by 'ds'
  train_forecast = m.predict(prophet_train[['ds']])
  m_train_preds = (
    prophet_train.rename(columns={'y': 'train'})
    .merge(train_forecast[['ds', 'yhat']], on='ds', how='left')
    .rename(columns={'yhat': 'm_train_pred'})
  )

  # out-of-sample predictions: one index-aligned lookup instead of scanning
  # the whole forecast once per test date
  m_test_preds = forecast.set_index('ds')['yhat'].reindex(test.index)
  if m_test_preds.isna().any():
    missing = list(m_test_preds.index[m_test_preds.isna()])
    raise ValueError(f"Forecast is missing predictions for test dates: {missing}")

  # save the actuals and predictions for the test period in a dataframe
  m_test_results = pd.DataFrame({
    'test': np.exp(test),
    'm_test_pred': m_test_preds.to_numpy(),
  })

  train_RMSE = root_mean_squared_error(m_train_preds['train'], m_train_preds['m_train_pred'])
  test_RMSE = root_mean_squared_error(m_test_results['test'], m_test_results['m_test_pred'])

  return train_RMSE, test_RMSE

In [32]:
# use itertools.product to do a search over the hyperparameters

# define the hyperparameters ranges

def prophet_loop(close_log=None):
  """Grid-search Prophet prior scales and return the best combination.

  Every pair of `changepoint_prior_scale` x `seasonality_prior_scale` is
  scored with `prophet_search`; combinations that raise are reported with
  `print` and skipped.

  Parameters
  ----------
  close_log : pandas.DataFrame
      Log-price frame forwarded to `prophet_search`. It defaults to None
      only so the signature stays call-compatible; a value is required.

  Returns
  -------
  pandas.Series
      The row with the lowest test RMSE, with keys
      'changepoint_prior_scales', 'seasonality_prior_scales',
      'train_rmse' and 'test_rmse'.

  Raises
  ------
  ValueError
      If `close_log` is not supplied.
  RuntimeError
      If every hyperparameter combination failed.
  """
  from itertools import product

  if close_log is None:
    raise ValueError("prophet_loop needs the log-price frame: call prophet_loop(close_log)")

  # define the hyperparameter ranges
  changepoint_prior_scales = [0.05, 0.1, 0.2, 0.3]
  seasonality_prior_scales = [5.0, 10.0, 20.0]

  results = []

  for cps, sps in product(changepoint_prior_scales, seasonality_prior_scales):
    try:
      # close_log must be passed through; without it every call raised a
      # TypeError that the except below silently absorbed
      train_rmse, test_rmse = prophet_search(
        close_log,
        changepoint_prior_scale=cps,
        seasonality_prior_scale=sps,
      )
    except Exception as e:
      # best-effort search: report the failing combination and keep going
      print(f"Failed for cps={cps}, sps={sps} — Error: {e}")
      continue

    results.append({
      'changepoint_prior_scales': cps,
      'seasonality_prior_scales': sps,
      'train_rmse': train_rmse,
      'test_rmse': test_rmse
    })

  if not results:
    raise RuntimeError("Prophet search failed for every hyperparameter combination")

  results_df = pd.DataFrame(results)

  # Sort by test RMSE
  best = results_df.sort_values(by='test_rmse').reset_index(drop=True)

  # .iloc[0] selects the first ROW; best[0] would look for a column named 0
  return best.iloc[0]

### Function: fit_all()

In [14]:
# define a function to re-fit on ALL the data and get the best model

def fit_all(pmarima, close_log, forecast_window):
  """Re-fit `pmarima` on the full series and forecast `forecast_window` steps.

  The estimator is fit on the non-null values of `close_log['Close']`; the
  fitted model is then read from `pmarima.model_`.

  Returns
  -------
  tuple
      `(predictions, summary, conf_ints)`: the forecast, the fitted model's
      `summary()`, and the confidence intervals returned by
      `predict(..., return_conf_int=True)`.
  """
  full_series = close_log['Close'].dropna()
  pmarima.fit(full_series)

  fitted = pmarima.model_
  summary = fitted.summary()
  forecast, intervals = fitted.predict(n_periods=forecast_window, return_conf_int=True)

  return forecast, summary, intervals


### Function: get_predictions()

In [15]:
# function for a user to predict a stock of their choice and choose how many days to forecast

def get_predictions(ticker, forecast_window):
  """Forecast `forecast_window` future prices for `ticker`.

  Pipeline: download closes (`get_data`), log-transform (`transform_log`),
  obtain an estimator from `search_autoARIMA`, re-fit it on all the data
  (`fit_all`), then map the log-scale forecast back to prices with `np.exp`.

  Returns the forecast on the original price scale.
  """
  close_log = transform_log(get_data(ticker))

  # NOTE(review): search_autoARIMA is not defined in the cells visible here —
  # confirm it is defined before this function is called. Only its first
  # return value (the estimator) is used; the other six are discarded.
  pmarima, _, _, _, _, _, _ = search_autoARIMA(close_log)

  log_forecast, _, _ = fit_all(pmarima, close_log, forecast_window)

  return np.exp(log_forecast)

In [16]:
# smoke-test get_predictions: request a 30-step forecast for 'DAL'
# NOTE(review): requires search_autoARIMA to be defined in the kernel first

get_predictions('DAL', 30)

[*********************100%***********************]  1 of 1 completed
  close['Close'] = close['Close'].fillna(method='ffill')


NameError: name 'search_autoARIMA' is not defined

In [None]:
# smoke-test get_predictions on a second ticker: 30-step forecast for 'PLTR'

get_predictions('PLTR', 30)

[*********************100%***********************]  1 of 1 completed
  close['Close'] = close['Close'].fillna(method='ffill')


Performing stepwise search to minimize aic
 ARIMA(0,2,0)(0,0,0)[0] intercept   : AIC=-2175.009, Time=0.03 sec
 ARIMA(1,2,0)(0,0,0)[0] intercept   : AIC=-2365.828, Time=0.02 sec
 ARIMA(0,2,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.17 sec
 ARIMA(0,2,0)(0,0,0)[0]             : AIC=-2177.008, Time=0.03 sec
 ARIMA(2,2,0)(0,0,0)[0] intercept   : AIC=-2413.892, Time=0.04 sec
 ARIMA(3,2,0)(0,0,0)[0] intercept   : AIC=-2435.488, Time=0.08 sec
 ARIMA(4,2,0)(0,0,0)[0] intercept   : AIC=-2473.774, Time=0.10 sec
 ARIMA(5,2,0)(0,0,0)[0] intercept   : AIC=-2503.614, Time=0.12 sec
 ARIMA(5,2,1)(0,0,0)[0] intercept   : AIC=-2600.528, Time=0.55 sec
 ARIMA(4,2,1)(0,0,0)[0] intercept   : AIC=-2621.201, Time=0.40 sec
 ARIMA(3,2,1)(0,0,0)[0] intercept   : AIC=-2619.731, Time=0.32 sec
 ARIMA(4,2,2)(0,0,0)[0] intercept   : AIC=-2612.432, Time=0.43 sec
 ARIMA(3,2,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.33 sec
 ARIMA(5,2,2)(0,0,0)[0] intercept   : AIC=-2624.863, Time=0.56 sec
 ARIMA(5,2,3)(0,0,0)[0] interce



 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=-2719.641, Time=0.19 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=-2718.360, Time=0.08 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=-2715.706, Time=0.10 sec
 ARIMA(1,1,0)(0,0,0)[0]             : AIC=-2719.724, Time=0.03 sec
 ARIMA(2,1,0)(0,0,0)[0]             : AIC=-2719.910, Time=0.02 sec
 ARIMA(3,1,0)(0,0,0)[0]             : AIC=-2719.167, Time=0.04 sec
 ARIMA(2,1,1)(0,0,0)[0]             : AIC=-2718.145, Time=0.11 sec
 ARIMA(1,1,1)(0,0,0)[0]             : AIC=-2718.547, Time=0.03 sec
 ARIMA(3,1,1)(0,0,0)[0]             : AIC=-2716.889, Time=0.02 sec

Best model:  ARIMA(2,1,0)(0,0,0)[0]          
Total fit time: 0.775 seconds


2025-04-03    87.672881
2025-04-04    87.838631
2025-04-07    87.862670
2025-04-08    87.873213
2025-04-09    87.875260
2025-04-10    87.875968
2025-04-11    87.876129
2025-04-14    87.876178
2025-04-15    87.876190
2025-04-16    87.876194
2025-04-17    87.876195
2025-04-18    87.876195
2025-04-21    87.876195
2025-04-22    87.876195
2025-04-23    87.876195
2025-04-24    87.876195
2025-04-25    87.876195
2025-04-28    87.876195
2025-04-29    87.876195
2025-04-30    87.876195
2025-05-01    87.876195
2025-05-02    87.876195
2025-05-05    87.876195
2025-05-06    87.876195
2025-05-07    87.876195
2025-05-08    87.876195
2025-05-09    87.876195
2025-05-12    87.876195
2025-05-13    87.876195
2025-05-14    87.876195
Freq: B, dtype: float64