In [72]:
import datetime
import quandl
import pandas as pd
import math, csv
import numpy as np
from sklearn import preprocessing, model_selection, svm
from sklearn.linear_model import LinearRegression
import yfinance as yf
import matplotlib as plt
from matplotlib import style
style.use("ggplot")

In [6]:
yf.pdr_override()

start_date =  "2010-01-01"
end_date = "2023-07-01"
goog_ticker = yf.Ticker("GOOG")

df = goog_ticker.history(period="5y")  # Get stock data and read csv
# print(df.head())
# df.info()

# Defining new columns
df["HL_PCT"] = (df["High"] - df["Close"])/df["Close"] * 100.0
df["PCT_CHANGE"] = (df["Close"] - df["Open"])/df["Open"] * 100.0

# Renaming and Reducing the number of columns in the df
df = df.rename(columns={'Open': 'OPEN', 'High': 'HIGH', 'Low': 'LOW', 'Close':'CLOSE', 'Volume':'VOLUME'})
df = df[["CLOSE", "HL_PCT", "PCT_CHANGE", "VOLUME"]] # Getting rid of redundant features helps simple ML models run
# print(df.head())

In [7]:
# Defining a label
forecast_col = 'CLOSE' # Not necessary, just doing this so this could be changed whatever in the future
df.fillna(-9999999, inplace = True) # This replaces NA values with an outliers which will be later removed
forecast_out = int(math.ceil(0.05*len(df))) # Use 5% of data to predict tomorrow's price
# print(forecast_out)
df['label'] = df[forecast_col].shift(-forecast_out) # Shift close column by some % backwards
# df.dropna(inplace=True)
# print(df.head())

In [8]:
# Creating features and label arrays to use in model
X = np.array(df.drop(['label'], axis = 1)) # X is usually the features
X = X[:-forecast_out] # Take all elements from beginning to end of shift
X_lately = X[-forecast_out:] # All elements shifted
X = preprocessing.scale(X) # Scaling all the data, might be skipping for time optimization
# X = X[:-forecast_out+1]

# y is usually the labels
df.dropna(inplace=True)
y = np.array(df['label'])

In [44]:
# Creating X and y training and testing datasets
# Specify test dataset size, here it's 20%
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

# Creating Model
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)

# Predict the "Unknown"
# We're actually only predicting forecast_out
forecast_set = clf.predict(X_lately)
print(forecast_set, accuracy, forecast_out)

[1.15704233e+08 5.83574352e+07 5.10121797e+07 5.02982022e+07
 4.37339393e+07 4.21694253e+07 3.67900226e+07 3.35626859e+07
 3.53655432e+07 3.83617387e+07 3.70270828e+07 2.68867483e+07
 3.61413526e+07 3.11452993e+07 2.76017319e+07 3.57813312e+07
 3.34693212e+07 2.85161683e+07 3.00467054e+07 2.89152911e+07
 3.88661934e+07 3.72791378e+07 3.82200956e+07 4.53935199e+07
 6.44789909e+07 9.00813752e+07 3.08025244e+07 3.91892200e+07
 3.82594086e+07 3.71342570e+07 2.98589108e+07 3.00448052e+07
 2.94770088e+07 3.09378652e+07 2.95909366e+07 3.32563643e+07
 2.45158297e+07 2.41104006e+07 2.58696047e+07 4.10365321e+07
 2.33583798e+07 2.21514000e+07 2.69312892e+07 2.56171107e+07
 2.45618566e+07 3.43630637e+07 2.08737869e+07 1.97979107e+07
 2.66399209e+07 2.64786700e+07 2.53333743e+07 3.71605937e+07
 4.38568227e+07 4.52375607e+07 2.83466653e+07 2.47600891e+07
 2.40700814e+07 2.02525902e+07 2.34045897e+07 2.44985924e+07
 2.04297526e+07 2.93220897e+07 5.62381068e+07] 0.850635930053938 63


In [58]:
# Plot results
df['Forecast'] = np.nan # NA Columns for Forecasts
last_date = df.iloc[-1].name # Finding end date
last_unix = last_date.timestamp() # Converting type
one_day = 86400 
next_unix = last_date.timestamp() + one_day # Next day is day added to last day


In [59]:
for i in forecast_set:
    # For the length of the forecast column
    # Find the next day
    next_date = datetime.datetime.fromtimestamp(next_unix)
    # Add 1 Day
    next_unix += one_day
    # Add row after existing data
    df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)] + [i]

In [71]:
df.tz_localize(None)
df['CLOSE'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Data')
plt.ylabel('Price')
plt.show()

TypeError: index is not a valid DatetimeIndex or PeriodIndex

Unnamed: 0_level_0,CLOSE,HL_PCT,PCT_CHANGE,VOLUME,label,Forecast
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-08-10 00:00:00-04:00,61.880501,0.653277,-0.433629,22174000.0,54.119999,
2018-08-13 00:00:00-04:00,61.750500,1.154889,-0.159258,19946000.0,53.307499,
2018-08-14 00:00:00-04:00,62.105000,0.303517,0.559430,26962000.0,51.931499,
2018-08-15 00:00:00-04:00,60.719002,1.717748,-1.210483,36576000.0,51.802502,
2018-08-16 00:00:00-04:00,60.324501,1.617085,-1.489306,26864000.0,52.182999,
...,...,...,...,...,...,...
2018-12-27 00:00:00-05:00,52.194000,0.000957,2.627935,42196000.0,58.665501,
2018-12-28 00:00:00-05:00,51.854000,1.781926,-1.194716,28296000.0,59.721500,
2018-12-31 00:00:00-05:00,51.780499,1.650235,-1.460573,29866000.0,60.024502,
2019-01-02 00:00:00-05:00,52.292500,0.618639,2.880275,30652000.0,60.296001,
