# Create a stock predictor to be uploaded to streamlit

In [2]:
# import

import yfinance as yf

import pandas as pd
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import train_test_split
import pmdarima as pm

import sqlite3

# Define Functions

### Function: get_data()

In [3]:
# define a function to get the data

def get_data(ticker, years=3):
  """Download daily closing prices for `ticker` and return them on a business-day index.

  Parameters
  ----------
  ticker : str
    Symbol passed to `yf.download`.
  years : int, default 3
    How many years back from today the download window starts.

  Returns
  -------
  pandas.DataFrame
    A single 'Close' column indexed by 'Date' with frequency 'B'. Business days
    that are absent from the download are forward-filled.

  Raises
  ------
  ValueError
    If the download returns no rows.
  """
  # download window: the most recent `years` years, ending today
  end_date = datetime.date.today()
  start_date = end_date - relativedelta(years=years)

  # auto_adjust is passed explicitly because its default changed between yfinance releases
  data = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True)

  # fail with a clear message instead of an obscure error further down the pipeline
  if data is None or data.empty:
    raise ValueError(f"No price data returned for ticker {ticker!r}")

  # remove the ticker level only when the columns really are a multi-index
  if isinstance(data.columns, pd.MultiIndex):
    data = data.droplevel(level=1, axis=1)

  # keep just the Close price, still indexed by Date
  close = data['Close'].to_frame(name='Close')

  # manually set the frequency to business days, and fill missing dates with forward fill
  close = close.asfreq('B')
  close['Close'] = close['Close'].ffill()

  return close

In [4]:
# sanity-check get_data(): download DAL and display the resulting Close-price frame
close = get_data('DAL')
close

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed
  close['Close'] = close['Close'].fillna(method='ffill')


Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2022-04-04,38.695847
2022-04-05,38.332458
2022-04-06,36.918194
2022-04-07,36.466419
2022-04-08,36.083389
...,...
2025-03-27,46.150002
2025-03-28,43.840000
2025-03-31,43.599998
2025-04-01,42.419998


In [5]:
# confirm the index is a DatetimeIndex with business-day ('B') frequency and that Close has no nulls
close.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 783 entries, 2022-04-04 to 2025-04-02
Freq: B
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   783 non-null    float64
dtypes: float64(1)
memory usage: 12.2 KB


### Function: transform_log()

In [6]:
# define a function to log-transform the prices (stabilizes the variance; the differencing order is chosen later by the auto-ARIMA search)

def transform_log(close):
  """Return a copy of `close` whose 'Close' column holds the natural log of the prices.

  The input frame is left unmodified.
  """
  # assign() works on a copy, so the caller's frame keeps its original prices
  return close.assign(Close=np.log(close['Close']))

In [7]:
# sanity-check transform_log() on the DAL frame downloaded above
close_log = transform_log(close)

# display the log-scale Close values
close_log

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2022-04-04,3.655732
2022-04-05,3.646297
2022-04-06,3.608704
2022-04-07,3.596392
2022-04-08,3.585833
...,...
2025-03-27,3.831897
2025-03-28,3.780547
2025-03-31,3.775057
2025-04-01,3.747620


### Function: search_autoARIMA()

In [8]:
# create a function to use autoARIMA to search for the best hyperparameters

def search_autoARIMA(close_log, test_size=30):
  """Hold out the last `test_size` observations and run an auto-ARIMA search on the rest.

  Parameters
  ----------
  close_log : pandas.DataFrame
    Frame with a 'Close' column; rows where 'Close' is NaN are dropped before splitting.
  test_size : int, default 30
    Number of trailing observations kept aside as the test set.

  Returns
  -------
  tuple
    (pmarima, train_model, train_summary, test_pred, train_confs, train, test) where
    `pmarima` is the AutoARIMA search object, `train_model` the model it selected on
    the training data, `train_summary` that model's summary, `test_pred` the forecast
    over the test window, and `train_confs` the confidence intervals of that forecast
    (despite the name, they belong to the test-window forecast).
  """
  # train, test split; shuffle=False keeps chronological order, so the test set is the tail
  train, test = train_test_split(close_log['Close'].dropna(), test_size=test_size, shuffle=False)

  # set the search parameters
  # NOTE(review): `start_d` does not appear among pmdarima's documented AutoARIMA
  # parameters (differencing is controlled by `d` / `max_d`) - kept as-is to preserve
  # behaviour; confirm whether it has any effect.
  pmarima = pm.AutoARIMA(start_p=0, max_p=5, start_d=0, max_d=2, start_q=0, max_q=5, seasonal=False, random_state=42,
                        stepwise=True, suppress_warnings=True, error_action='ignore', trace=True)

  # fit the model
  pmarima.fit(train)

  # save the best model
  train_model = pmarima.model_
  train_summary = train_model.summary()

  # get the test predictions, one per held-out observation
  test_pred, train_confs = train_model.predict(n_periods=len(test), return_conf_int=True)

  return pmarima, train_model, train_summary, test_pred, train_confs, train, test


### Function: fit_all()

In [9]:
# define a function to re-fit on ALL the data and get the best model

def fit_all(pmarima, close_log, forecast_window):
  """Re-fit `pmarima` on every non-null 'Close' value and forecast `forecast_window` periods.

  Returns (predictions, model summary, confidence intervals) from the re-fitted model.
  """
  # use the whole series this time - no hold-out
  full_series = close_log['Close'].dropna()
  pmarima.fit(full_series)

  # the model selected by the re-fit
  refit_model = pmarima.model_
  summary_table = refit_model.summary()

  # forecast together with its confidence intervals
  forecast = refit_model.predict(n_periods=forecast_window, return_conf_int=True)
  point_forecast, intervals = forecast

  return point_forecast, summary_table, intervals


### Function: get_predictions()

In [30]:
# function for a user to forecast a stock of their choice and choose how many business days ahead to forecast

def get_predictions(ticker, forecast_window):
  """Run the whole pipeline for `ticker` and forecast `forecast_window` periods ahead.

  Downloads the prices, log-transforms them, runs the hold-out auto-ARIMA search,
  re-fits on all the data, and returns a dict with the forecast plus the
  intermediate artifacts. Only 'predictions' is exponentiated; every other
  entry is passed through exactly as the helper functions returned it.
  """
  log_prices = transform_log(get_data(ticker))

  # hold-out search first, then the re-fit on the full series
  search_result = search_autoARIMA(log_prices)
  estimator, holdout_model, holdout_summary, holdout_pred, holdout_confs, train_part, test_part = search_result
  log_forecast, refit_summary, refit_confs = fit_all(estimator, log_prices, forecast_window)

  # undo the log transform; the single column is named after the ticker
  price_forecast = pd.DataFrame(np.exp(log_forecast), columns=[ticker])

  return {
    'predictions': price_forecast,
    'train_model': holdout_model,
    'train_summary': holdout_summary,
    'test_pred': holdout_pred,
    'train_confs': holdout_confs,
    'train': train_part,
    'test': test_part,
    'final_summary': refit_summary,
    'final_confs': refit_confs
  }

In [31]:
# run the full pipeline for DAL with a 30-period forecast window

predictions_DAL = get_predictions('DAL', 30)

[*********************100%***********************]  1 of 1 completed


  close['Close'] = close['Close'].fillna(method='ffill')


Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=-3618.116, Time=0.03 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=-3616.118, Time=0.02 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=-3616.107, Time=0.03 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=-3619.367, Time=0.02 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=-3614.464, Time=0.10 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0]          
Total fit time: 0.205 seconds
Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=-3711.564, Time=0.03 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=-3709.844, Time=0.03 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=-3709.814, Time=0.08 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=-3713.531, Time=0.02 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=-3710.874, Time=0.10 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0]          
Total fit time: 0.257 seconds


In [32]:
# display the DAL forecast (the 'predictions' entry of the result dict)

predictions_DAL['predictions']

Unnamed: 0,DAL
2025-04-03,43.369999
2025-04-04,43.369999
2025-04-07,43.369999
2025-04-08,43.369999
2025-04-09,43.369999
2025-04-10,43.369999
2025-04-11,43.369999
2025-04-14,43.369999
2025-04-15,43.369999
2025-04-16,43.369999


In [33]:
# run the full pipeline for a second ticker, PLTR, with the same 30-period forecast window

predictions_PLTR = get_predictions('PLTR', 30)

[*********************100%***********************]  1 of 1 completed


  close['Close'] = close['Close'].fillna(method='ffill')


Performing stepwise search to minimize aic
 ARIMA(0,2,0)(0,0,0)[0] intercept   : AIC=-2175.009, Time=0.15 sec
 ARIMA(1,2,0)(0,0,0)[0] intercept   : AIC=-2365.828, Time=0.03 sec
 ARIMA(0,2,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.20 sec
 ARIMA(0,2,0)(0,0,0)[0]             : AIC=-2177.008, Time=0.02 sec
 ARIMA(2,2,0)(0,0,0)[0] intercept   : AIC=-2413.892, Time=0.04 sec
 ARIMA(3,2,0)(0,0,0)[0] intercept   : AIC=-2435.488, Time=0.08 sec
 ARIMA(4,2,0)(0,0,0)[0] intercept   : AIC=-2473.774, Time=0.10 sec
 ARIMA(5,2,0)(0,0,0)[0] intercept   : AIC=-2503.614, Time=0.10 sec
 ARIMA(5,2,1)(0,0,0)[0] intercept   : AIC=-2600.528, Time=0.55 sec
 ARIMA(4,2,1)(0,0,0)[0] intercept   : AIC=-2621.201, Time=0.40 sec
 ARIMA(3,2,1)(0,0,0)[0] intercept   : AIC=-2619.731, Time=0.33 sec
 ARIMA(4,2,2)(0,0,0)[0] intercept   : AIC=-2612.432, Time=0.40 sec
 ARIMA(3,2,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.34 sec
 ARIMA(5,2,2)(0,0,0)[0] intercept   : AIC=-2624.863, Time=0.60 sec
 ARIMA(5,2,3)(0,0,0)[0] interce



 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=-2719.641, Time=0.09 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=-2718.360, Time=0.06 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=-2715.706, Time=0.11 sec
 ARIMA(1,1,0)(0,0,0)[0]             : AIC=-2719.724, Time=0.03 sec
 ARIMA(2,1,0)(0,0,0)[0]             : AIC=-2719.910, Time=0.02 sec
 ARIMA(3,1,0)(0,0,0)[0]             : AIC=-2719.167, Time=0.04 sec
 ARIMA(2,1,1)(0,0,0)[0]             : AIC=-2718.145, Time=0.11 sec
 ARIMA(1,1,1)(0,0,0)[0]             : AIC=-2718.547, Time=0.03 sec
 ARIMA(3,1,1)(0,0,0)[0]             : AIC=-2716.889, Time=0.02 sec

Best model:  ARIMA(2,1,0)(0,0,0)[0]          
Total fit time: 0.630 seconds


In [34]:
# display the PLTR forecast (the 'predictions' entry of the result dict)

predictions_PLTR['predictions']

Unnamed: 0,PLTR
2025-04-03,87.672881
2025-04-04,87.838631
2025-04-07,87.86267
2025-04-08,87.873213
2025-04-09,87.87526
2025-04-10,87.875968
2025-04-11,87.876129
2025-04-14,87.876178
2025-04-15,87.87619
2025-04-16,87.876194
