## INTRODUCTION

In this notebook, we have the first version of a model that aims to predict the closing price of assets that make up the S&P 500. The idea for the next versions is to use datasets with news and accounting data so that we have as many inputs as possible that have some kind of influence on whether a stock is valued or devalued.

In [1]:
#imports

import pandas as pd
import numpy as np
import yahoo_fin.stock_info as si
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

SEED = 23

### Load the data

The dataset contains historical stock price data for the companies that make up the S&P 500 index, spanning from 2018 to 2023.]


In [2]:
tickers_sp500 = si.tickers_sp500()
tickers_sp500[0]

'A'

In [3]:
df = si.get_data('A', start_date="2018-01-01", end_date="2023-04-10", index_as_date=False, interval="1d" )

In [4]:
df.head()

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,2018-01-02,67.419998,67.889999,67.339996,67.599998,64.98925,1047800,A
1,2018-01-03,67.620003,69.489998,67.599998,69.32,66.642838,1698900,A
2,2018-01-04,69.540001,69.82,68.779999,68.800003,66.142906,2230700,A
3,2018-01-05,68.730003,70.099998,68.730003,69.900002,67.200424,1632500,A
4,2018-01-08,69.730003,70.330002,69.550003,70.050003,67.344627,1613400,A


In [5]:
tickers_sp500.remove('A')

In [6]:
for ticker in tickers_sp500:
    df_new = si.get_data(ticker, start_date="2018-01-01", end_date="2023-04-10", index_as_date=False, interval="1d" )
    df = pd.concat([df, df_new])


In [7]:
df.shape

(660684, 8)

In [8]:
df.columns

Index(['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume', 'ticker'], dtype='object')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660684 entries, 0 to 1324
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   date      660684 non-null  datetime64[ns]
 1   open      660684 non-null  float64       
 2   high      660684 non-null  float64       
 3   low       660684 non-null  float64       
 4   close     660684 non-null  float64       
 5   adjclose  660684 non-null  float64       
 6   volume    660684 non-null  int64         
 7   ticker    660684 non-null  object        
dtypes: datetime64[ns](1), float64(5), int64(1), object(1)
memory usage: 45.4+ MB


In [10]:
df.corr()

Unnamed: 0,open,high,low,close,adjclose,volume
open,1.0,0.999901,0.999891,0.999789,0.999492,-0.100347
high,0.999901,1.0,0.999859,0.999899,0.999623,-0.099799
low,0.999891,0.999859,1.0,0.999902,0.999586,-0.100919
close,0.999789,0.999899,0.999902,1.0,0.999703,-0.100358
adjclose,0.999492,0.999623,0.999586,0.999703,1.0,-0.097832
volume,-0.100347,-0.099799,-0.100919,-0.100358,-0.097832,1.0


In [11]:
df = df.drop(['volume'], axis=1)

In [12]:
df.head()

Unnamed: 0,date,open,high,low,close,adjclose,ticker
0,2018-01-02,67.419998,67.889999,67.339996,67.599998,64.98925,A
1,2018-01-03,67.620003,69.489998,67.599998,69.32,66.642838,A
2,2018-01-04,69.540001,69.82,68.779999,68.800003,66.142906,A
3,2018-01-05,68.730003,70.099998,68.730003,69.900002,67.200424,A
4,2018-01-08,69.730003,70.330002,69.550003,70.050003,67.344627,A


In [13]:
df = df.drop(['date', 'ticker'], axis=1)
df.head()

Unnamed: 0,open,high,low,close,adjclose
0,67.419998,67.889999,67.339996,67.599998,64.98925
1,67.620003,69.489998,67.599998,69.32,66.642838
2,69.540001,69.82,68.779999,68.800003,66.142906
3,68.730003,70.099998,68.730003,69.900002,67.200424
4,69.730003,70.330002,69.550003,70.050003,67.344627


In [14]:
cols = list(df.columns)
cols

['open', 'high', 'low', 'close', 'adjclose']

In [15]:
cols.remove('close')

In [16]:
cols

['open', 'high', 'low', 'adjclose']

In [17]:
X = df[cols]
X.head()

Unnamed: 0,open,high,low,adjclose
0,67.419998,67.889999,67.339996,64.98925
1,67.620003,69.489998,67.599998,66.642838
2,69.540001,69.82,68.779999,66.142906
3,68.730003,70.099998,68.730003,67.200424
4,69.730003,70.330002,69.550003,67.344627


In [18]:
y = df['close']
y.head()

0    67.599998
1    69.320000
2    68.800003
3    69.900002
4    70.050003
Name: close, dtype: float64

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED, test_size=0.30)

## Grid Search


In [20]:
regressor = LinearRegression()
parameters = {'fit_intercept':[True, False], 'normalize':[True, False], 'copy_X':[True, False]}
grid_search = GridSearchCV(regressor, parameters, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

{'copy_X': True, 'fit_intercept': True, 'normalize': False}


## Model

### LinearRegression

In [21]:
model= LinearRegression(copy_X=True, fit_intercept=True, normalize= False)
model.fit(X_train, y_train)

pred = model.predict(X_test)

r2 = r2_score(y_test, pred)

print(r2)

0.9999241018105294
