In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# Load the GOOGL daily price file and drop the columns this analysis does not use.
# 'Date' is dropped too, so rows are addressed by their default integer index.
raw_csv = '/kaggle/input/stock-time-series-20050101-to-20171231/GOOGL_2006-01-01_to_2018-01-01.csv'
df = pd.read_csv(raw_csv)
df = df.drop(columns=['Open', 'High', 'Low', 'Volume', 'Name', 'Date'])

# Plot the closing price on its own figure.
fig, ax = plt.subplots(figsize=(12, 8))
df['Close'].plot(ax=ax)
ax.set_title('Stock Price of Google[2006-2018]')

### The plot suggests that the series is non-stationary; to confirm this, let's use the Augmented Dickey-Fuller (ADF) test.
### The null hypothesis of the ADF test is that the series is non-stationary. So, if the p-value of the test is less than the significance level (0.05), then we reject the null hypothesis.

In [None]:
# ADF test on the raw closing prices. The first two entries of the result
# are the test statistic and the p-value.
check = adfuller(df['Close'])
adf_stat, adf_pvalue = check[0], check[1]
print('test_statics', adf_stat)
print('p_value', adf_pvalue)

### The p-value in our case is almost equal to 1, so we cannot reject the null hypothesis and conclude that the series is non-stationary. To convert it into a stationary series we will use differencing.

In [None]:
# Take the log of the closing price, then first-difference it with pandas
# (log[t] - log[t-1]); the first row of 'diff' has no predecessor and is NaN.
log_close = np.log(df['Close'])
df['log'] = log_close
df['diff'] = log_close.diff()

fig, ax = plt.subplots(figsize=(10, 8))
df['diff'].plot(ax=ax)
ax.set_title('Stationary series after lag difference.')

In [None]:
# Repeat the ADF test on the differenced series (NaN rows removed first).
check = adfuller(df['diff'].dropna())
for label, value in zip(('test_statics', 'p_value'), check[:2]):
    print(label, value)

In [None]:
# Training and test set (90 % / 10 %).
# Row 0 of 'diff' is NaN (nothing to difference against), so training starts at row 1.
split_pos = int(len(df) * .9)
train = df['diff'].iloc[1:split_pos]
# The test set starts at the split position itself. Slicing from len(train)
# (= split_pos - 1) would repeat the last training observation in the test set.
test = df['diff'].iloc[split_pos:]

## Using ARIMA:

In [None]:
# Build model.
# NOTE(review): `train` is already the first difference of the log price, and
# d=2 differences it twice more inside the model -- confirm this order is intended.
model = ARIMA(train, order=(0,2,2))
model_fit = model.fit()
print(model_fit.summary())

# Forecast one step for every row of df that follows the last training row.
# predict() takes integer positions relative to the start of `train`, so
# len(train) is the first out-of-sample step.
last_train_pos = df.index.get_loc(train.index[-1])
n_forecast = len(df) - last_train_pos - 1
diff_forecast = np.asarray(
    model_fit.predict(start=len(train), end=len(train) + n_forecast - 1)
)

# Reversing into stock price: add the cumulative forecast log-differences to
# the log price of the last training row, then undo the log with exp().
log_forecast = df['log'].iloc[last_train_pos] + np.cumsum(diff_forecast)
result_arima = pd.Series(
    np.exp(log_forecast),
    index=df.index[last_train_pos + 1:],
    name='Arima_Prediction',
)

# Assigning a Series aligns on the index, so rows outside the forecast window
# are NaN. This avoids chained assignment (df[col][i:] = ...), which is not
# guaranteed to write through to df.
df['Arima_Prediction'] = result_arima

# Plot actual vs. predicted price.
fig, ax = plt.subplots(figsize=(12, 8))
df['Close'].plot(ax=ax)
df['Arima_Prediction'].plot(ax=ax)
ax.legend()

In [None]:
# Compare the forecast with the actual closes of the same (final) rows of df.
# The window length comes from the forecast itself rather than a fixed row number.
actual_close = df['Close'].iloc[-len(result_arima):]
abs_pct_error = np.abs(result_arima - actual_close) / np.abs(actual_close)
# NOTE: the printed value is a fraction (0.05 means 5 %); it is not multiplied by 100.
print('Mean Absolute Percentage Error(MAPE):', np.mean(abs_pct_error))

### Let us now predict the stock price while taking seasonality into account.

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Multiplicative seasonal decomposition of the closing price, using a
# period of 30 observations.
decompose_kwargs = {'model': 'multiplicative', 'period': 30}
seasonality = seasonal_decompose(df['Close'], **decompose_kwargs)
seasonality.plot()

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Fit the seasonal model (seasonal period of 12 observations).
# `model_season` ends up holding the fitted results, not the unfitted model.
model_season = SARIMAX(train, order=(0,2,1), seasonal_order=(2,1,0,12))
model_season = model_season.fit(disp=False)
print(model_season.summary())

# Forecast from the fitted SARIMAX results (`model_season`), one step for
# every row of df that follows the last training row. predict() takes integer
# positions relative to the start of `train`, so len(train) is the first
# out-of-sample step.
last_train_pos = df.index.get_loc(train.index[-1])
n_forecast = len(df) - last_train_pos - 1
seasonality_result = model_season.predict(
    start=len(train), end=len(train) + n_forecast - 1
)

# Reversing to the original stock price: add the cumulative forecast
# log-differences to the log price of the last training row, then undo the
# log with exp().
log_forecast = df['log'].iloc[last_train_pos] + np.cumsum(np.asarray(seasonality_result))
final_result_season = pd.Series(
    np.exp(log_forecast),
    index=df.index[last_train_pos + 1:],
    name='final_prediction_seasonality',
)

# Assigning a Series aligns on the index, so rows outside the forecast window
# are NaN. This avoids chained assignment (df[col][i:] = ...), which is not
# guaranteed to write through to df.
df['final_prediction_seasonality'] = final_result_season

# Plot actual vs. predicted price.
fig, ax = plt.subplots(figsize=(12, 8))
df['Close'].plot(ax=ax)
df['final_prediction_seasonality'].plot(ax=ax)
ax.legend()

In [None]:
# Compare the seasonal forecast with the actual closes of the same (final)
# rows of df. The window length comes from the forecast itself rather than a
# fixed row number.
actual_close = df['Close'].iloc[-len(final_result_season):]
abs_pct_error = np.abs(final_result_season - actual_close) / np.abs(actual_close)
# NOTE: the printed value is a fraction (0.05 means 5 %); it is not multiplied by 100.
print('Mean Absolute Percentage Error(MAPE):', np.mean(abs_pct_error))