In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [65]:
data = pd.read_csv('Downloads/amazon_stock_price.csv')

In [67]:
data.head()

Unnamed: 0,date,open,high,low,close,volume
0,1997-05-15,0.1219,0.125,0.0964,0.0979,1443120000
1,1997-05-16,0.0984,0.099,0.0854,0.0865,294000000
2,1997-05-19,0.088,0.0885,0.0812,0.0854,122136000
3,1997-05-20,0.0865,0.0875,0.0818,0.0818,109344000
4,1997-05-21,0.0818,0.0823,0.0688,0.0714,377064000


In [69]:
data = data.sort_values('date')

In [71]:
data['date'] = pd.to_datetime(data['date'])

In [73]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6685 entries, 0 to 6684
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    6685 non-null   datetime64[ns]
 1   open    6685 non-null   float64       
 2   high    6685 non-null   float64       
 3   low     6685 non-null   float64       
 4   close   6685 non-null   float64       
 5   volume  6685 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 313.5 KB


In [75]:
data.set_index('date', inplace=True)

In [77]:
def fill_dates(data):
    data_copy = data.copy()
    full_range = pd.date_range(start=data_copy.index.min(), end=data_copy.index.max(), freq='1D')
    data_copy = data_copy.reindex(full_range)

    return data_copy

In [85]:
data.drop(columns=['open', 'high', 'low', 'volume'], inplace=True)

In [91]:
data = fill_dates(data)

In [93]:
data.head()

Unnamed: 0,close
1997-05-15,0.0979
1997-05-16,0.0865
1997-05-17,
1997-05-18,
1997-05-19,0.0854


In [95]:
data = data.interpolate(method='linear', limit_direction='both')

In [97]:
data.head()

Unnamed: 0,close
1997-05-15,0.0979
1997-05-16,0.0865
1997-05-17,0.086133
1997-05-18,0.085767
1997-05-19,0.0854


In [99]:
for feature in data.columns:
    for lag in (1,2,3):
        data[f'{feature}_lag{lag}'] = data[feature].shift(lag)

In [101]:
data.head()

Unnamed: 0,close,close_lag1,close_lag2,close_lag3
1997-05-15,0.0979,,,
1997-05-16,0.0865,0.0979,,
1997-05-17,0.086133,0.0865,0.0979,
1997-05-18,0.085767,0.086133,0.0865,0.0979
1997-05-19,0.0854,0.085767,0.086133,0.0865


In [103]:
data.dropna(axis=0, inplace=True)

In [105]:
data.isna().sum()

close         0
close_lag1    0
close_lag2    0
close_lag3    0
dtype: int64

In [109]:
data = data.reset_index(drop=True)

In [111]:
data.head()

Unnamed: 0,close,close_lag1,close_lag2,close_lag3
0,0.085767,0.086133,0.0865,0.0979
1,0.0854,0.085767,0.086133,0.0865
2,0.0818,0.0854,0.085767,0.086133
3,0.0714,0.0818,0.0854,0.085767
4,0.0698,0.0714,0.0818,0.0854


In [115]:
from sklearn.model_selection import train_test_split

X = data[data.columns[1:]]
Y = data['close']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [125]:
from xgboost import XGBRegressor

model = XGBRegressor(
    objective ='reg:squarederror',
    colsample_bytree = 0.3, 
    learning_rate = 0.1,
    max_depth = 5, alpha = 2, 
    n_estimators = 10
)
model.fit(X_train, Y_train)

In [133]:
y_pred = model.predict(X_test)

In [135]:
from sklearn.metrics import r2_score

score = r2_score(Y_test, y_pred)

print(f'Score: {score}')

Score: 0.8757877790619224


In [141]:
from pmdarima import auto_arima

auto_model = auto_arima(data['close'], seasonal=False, trace=True)
print(auto_model.summary())

Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=27514.698, Time=9.92 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=27536.825, Time=0.13 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=27538.604, Time=0.44 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=27538.618, Time=0.55 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=27536.974, Time=0.12 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=27532.078, Time=3.99 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=27532.200, Time=0.83 sec
 ARIMA(3,1,2)(0,0,0)[0] intercept   : AIC=27527.534, Time=8.51 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept   : AIC=27529.604, Time=3.02 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=27537.864, Time=1.97 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept   : AIC=27528.279, Time=3.24 sec
 ARIMA(3,1,1)(0,0,0)[0] intercept   : AIC=27534.196, Time=0.86 sec
 ARIMA(3,1,3)(0,0,0)[0] intercept   : AIC=27512.927, Time=12.67 sec
 ARIMA(4,1,3)(0,0,0)[0] intercept   : AIC=27492.266, Time=13.15 sec
 ARIMA(4,1,2)(0,0

In [147]:
from statsmodels.tsa.arima.model import ARIMA

model = ARIMA(data['close'], order=(2, 1, 3))
model_fit = model.fit()

print(model_fit.summary())

                               SARIMAX Results                                
Dep. Variable:                  close   No. Observations:                 9699
Model:                 ARIMA(2, 1, 3)   Log Likelihood              -13758.804
Date:                Tue, 10 Dec 2024   AIC                          27529.607
Time:                        16:27:34   BIC                          27572.685
Sample:                             0   HQIC                         27544.211
                               - 9699                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.5063      0.425     -1.191      0.234      -1.339       0.327
ar.L2          0.2351      0.303      0.776      0.438      -0.359       0.829
ma.L1          0.5119      0.424      1.207      0.2

In [153]:
import matplotlib.pyplot as plt

forecast = model.predict(n_periods=100)

plt.figure(figsize=(10,6))
plt.plot(data['close'], label='Actual')
plt.plot(range(len(data), len(data)+100), forecast, color='red', label='Predicted')
plt.title('Actual vs Predicted')
plt.legend(loc='upper left')
plt.show()

TypeError: Model.predict() missing 1 required positional argument: 'params'