# DATA2001 Assignment 3

## Data Preparation

In [None]:
import pandas as pd
import numpy as np

# Load the Amazon price file and convert the Date strings (YYYY-MM-DD)
# into pandas timestamps in one chained expression.
df = pd.read_csv("amzn.us.csv").assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d')
)

# Preview the first five rows
df.head()

: 

In [None]:
# Confirm that the Date column now holds timestamp objects rather than strings
print(type(df.loc[2, 'Date']))

: 

**Exogenous Data: VIX Index**

"Volatility Index, a popular measure of the stock market's expectation of volatility based on S&P 500 index options. It is calculated and disseminated on a real-time basis by the CBOE, and is often referred to as the fear index or fear gauge." Source: [Wikipedia](https://en.wikipedia.org/wiki/VIX)

It is a useful exogenous variable because it gives a broad sense of the state of the world in relation to the market. It is highly correlated with news events that are expected to negatively affect markets. In this case it is presented as a 30-day expectation of volatility given a weighted portfolio of European options on the S&P 500. It is sourced from [Yahoo Finance](https://finance.yahoo.com/quote/%5EVIX?p=%5EVIX) for the relevant period.

In [None]:
# Load the VIX history and preview its first five rows
vix = pd.read_csv('VIX.csv')
vix.head()

: 

In [None]:
# Keep only the VIX close and attach it to the Amazon frame *by date*.
# Assigning the raw column aligns on row position, which silently pairs the
# wrong days whenever the two files do not contain exactly the same rows.
# NOTE(review): assumes VIX.csv has a 'Date' column, as Yahoo Finance
# exports do - confirm against the file.
# `vix` itself is left untouched so this cell can be re-run safely.
vix_close = (
    vix.assign(Date=pd.to_datetime(vix['Date']))
       .set_index('Date')['Close']
       .rename('VIXClose')
)

# Dates missing from the VIX file become NaN in the new column.
df['VIX Close'] = df['Date'].map(vix_close)

: 

# Exploratory Analysis

In [None]:
import matplotlib.pyplot as plt  # required here: this cell runs before the later cell that imports plt
import seaborn as sns
from datetime import datetime, date

# Draw one line chart per non-date column, stacked vertically.
# The number of panels follows the frame instead of being hard-coded.
plot_columns = df.drop('Date', axis=1).columns
f, ax = plt.subplots(nrows=len(plot_columns), ncols=1, figsize=(15, 25), squeeze=False)
ax = ax.ravel()  # squeeze=False always yields a 2-D array; flatten it for positional indexing

for i, column in enumerate(plot_columns):
    # Forward-fill gaps so each line is drawn without breaks
    sns.lineplot(x=df['Date'], y=df[column].ffill(), ax=ax[i], color='dodgerblue')
    ax[i].set_title('{}'.format(column), fontsize=14)
    ax[i].set_ylabel(ylabel=column, fontsize=14)
    ax[i].set_xlabel('')

    # Use the same date window on every panel
    ax[i].set_xlim([date(1997, 5, 16), date(2017, 11, 9)])

: 

Some comments on the patterns visible. 

# Decomposition 

In [None]:
# Chronological split: the first 5081 rows are training data, the rest is test data.
# .copy() gives each split its own data, so adding columns to `train` in later
# cells does not write to a slice of `df` (pandas' SettingWithCopyWarning).
split_row = 5081
train = df.iloc[:split_row, :].copy()
test = df.iloc[split_row:, :].copy()

: 

### Isolating Trend

In [None]:
# Moving averages of Volume over 60 and 120 observations. With 5 observations
# per week these windows span 12 and 24 weeks respectively.
for window in (60, 120):
    train['MA-{}'.format(window)] = train['Volume'].rolling(window).mean()

train[['Volume', 'MA-60', 'MA-120']].plot(figsize=(24, 6))

: 

The MA-120 window appears to be too long. You can see this by observing the effect that big changes have on it, and how it appears to 'lag' a little more than MA-60.


### De-trending

In [None]:
# Remove the 120-observation moving-average trend from Volume
train['vol-detrended'] = train['Volume'].sub(train['MA-120'])
train['vol-detrended'].plot(figsize=(24, 6))

: 

### Seasonality Component 

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition of Volume with a 10-observation period
result = seasonal_decompose(train['Volume'], model='additive', period=10)

# Read the seasonal component through the public `seasonal` attribute rather
# than the private `_seasonal` field, which is an implementation detail.
train['vol-seasonal'] = result.seasonal
result.seasonal[:100].plot()

: 

### Residual

In [None]:
# What is left of the detrended series once the seasonal component is removed
train['vol-residual'] = train['vol-detrended'].sub(train['vol-seasonal'])
train['vol-residual'].plot(figsize=(24, 6))

: 

### Manual Decomposition Results


In [None]:
import matplotlib.pyplot as plt

# One panel per component of the manual decomposition
fig, ax_str = plt.subplots(4)
train['Volume'].plot(label='Original', ax=ax_str[0])
train['MA-120'].plot(color='orange', label='MA-120 Trend', ax=ax_str[1])
train['vol-seasonal'].plot(color='blue', label='Seasonal', ax=ax_str[2])
train['vol-residual'].plot(color='green', label= 'Residual', ax=ax_str[3])
fig.set_size_inches((16, 9))

# plt.legend() only affects the current (last) axes, which left the first
# three panels unlabeled; add a legend to every panel instead.
for panel in ax_str:
    panel.legend()
plt.show()

: 

### Automatic Decomposition  

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Decompose the training Volume with a 30-observation period and draw the
# library's standard multi-panel figure at 16x9 inches.
str_res = seasonal_decompose(train['Volume'], period=30)
fig = str_res.plot()
fig.set_size_inches(16, 9)

: 

Some comments on the manual vs automatic. 

### Format the data for models

In [None]:
# OpenInt is a constant column. DataFrame.drop returns a new frame, so the
# result has to be assigned back or the column is never removed.
# errors='ignore' keeps the cell re-runnable once the column is gone.
df = df.drop(columns='OpenInt', errors='ignore')

# Overwrite the earlier decomposition result with one computed on the full series
str_res = seasonal_decompose(df['Volume'], period=30)

# Column order: observed series first, then seasonal, trend, res, obs
data = str_res.observed.to_frame()
data['seasonal'] = str_res.seasonal
data['trend'] = str_res.trend
data['res'] = str_res.resid
data['obs'] = str_res.observed

# Same chronological split point (row 5081) as the earlier train/test split
train = data.iloc[:5081, :]
test = data.iloc[5081:, :]

: 

# ARIMA Model

### Stationarity?

Data needs to be stationary for ARIMA to be accurate. This means that the series:
- has a constant variance
- is patternless over time
- is roughly horizontal

In [None]:
# Plot the trend component of the training split to judge stationarity by eye
m_data = train.loc[:, 'trend']
m_data.plot(figsize=(24, 6), color='blue')

: 

### Fit Model

In [None]:
# Some tools 

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_predict
import statsmodels.api as sm

: 

#### Autocorrelations? 

In [None]:
# Drop missing values, then draw the ACF (top) and PACF (bottom) for 30 lags
m_data = m_data.dropna()
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))
sm.graphics.tsa.plot_acf(m_data, lags=30, ax=ax1)
fig = sm.graphics.tsa.plot_pacf(m_data, lags=30, ax=ax2)

: 

### Fit Model

In [None]:
# Specify and fit ARIMA(1, 0, 0) on the training trend, then print the fit summary
ar1_spec = ARIMA(train['trend'], order=(1, 0, 0))
arima_1_0_0 = ar1_spec.fit()
print(arima_1_0_0.summary())

: 

In [None]:
# Diagnostic plots for the ARIMA(1, 0, 0) fit, drawn onto a 16x9 inch figure
fig = arima_1_0_0.plot_diagnostics(fig=plt.figure(figsize=(16, 9)), lags=8)

: 

In [None]:
# Specify and fit ARIMA(2, 0, 1) on the training trend, then print the fit summary
arma21_spec = ARIMA(train['trend'], order=(2, 0, 1))
arima_2_0_1 = arma21_spec.fit()
print(arima_2_0_1.summary())


: 

In [None]:
# Diagnostic plots for the ARIMA(2, 0, 1) fit, drawn onto a 16x9 inch figure
fig = arima_2_0_1.plot_diagnostics(fig=plt.figure(figsize=(16, 9)), lags=8)

: 

Some comments about goodness of fit. 

### Forecast with ARIMA

In [None]:
# Forecast 11 steps ahead with the ARIMA(2, 0, 1) fit and keep the point
# forecasts in a one-column frame whose column is named "trend".
arima_fcst = arima_2_0_1.get_forecast(steps=11)
arima_predictions = pd.DataFrame(arima_fcst.predicted_mean).rename(
    columns={"predicted_mean": "trend"}
)

: 

In [None]:
# Overlay the forecast on the tail of the training trend (row 5000 onwards)
fig, ax_arima_fcst = plt.subplots(figsize=(24, 6))
recent_trend = train['trend'][5000:]
recent_trend.plot(label='Original', ax=ax_arima_fcst)
arima_predictions['trend'].plot(label="ARIMA(2,0,1) trend fcst", ax=ax_arima_fcst)
ax_arima_fcst.legend()


: 

In [None]:
# Same tail of the training trend, this time with statsmodels' own
# prediction plot for positions 5080 through 5100.
fig, ax_arima_fcst = plt.subplots(figsize=(24, 6))
train['trend'][5000:].plot(label='Original', ax=ax_arima_fcst)
forecast_window = dict(start=5080, end=5100)
fig = plot_predict(
    arima_2_0_1,
    label='ARIMA(2,0,1) Forecast',
    ax=ax_arima_fcst,
    **forecast_window,
)

: 

# Pure Forecaster

In [None]:
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout

: 

In [None]:
# Preview the training frame rather than rendering the whole table,
# in line with the other preview cells in this notebook.
train.head()

: 

In [None]:
# Fit the scaler on the training split only, then apply those same min/max
# values to the test split. Refitting on the test data would put the two
# splits on different scales and leak test-set statistics.
# Rows with missing values are dropped first: the trend column already needed
# dropna() before the autocorrelation plots, and NaN inputs would propagate
# through the network during training.
scaler = MinMaxScaler(feature_range=(0, 1))
train_scaled = scaler.fit_transform(train.dropna())
test_scaled = scaler.transform(test.dropna())


: 

In [None]:
# Column 0 is the target (the observed series). The last column, 'obs', is a
# second copy of that same observed series, so using it as a feature hands the
# model the answer. It is excluded; the features are seasonal, trend and res.

# training data
y_train = train_scaled[:, 0]
X_train = train_scaled[:, 1:-1]

# test data
y_test = test_scaled[:, 0]
X_test = test_scaled[:, 1:-1]

: 

In [None]:
def create_model(x_train):
    """Build and compile a stacked LSTM regressor.

    The network has four LSTM layers of 50 units, each followed by 20%
    dropout, and a single-unit dense output. The first three LSTM layers
    return full sequences so the next LSTM layer receives a sequence.

    Parameters
    ----------
    x_train : array-like
        Training inputs; only ``x_train.shape[1]`` is read, to size the
        first layer's ``input_shape`` as ``(x_train.shape[1], 1)``.

    Returns
    -------
    The compiled model (Adam optimizer, mean-squared-error loss).
    """
    n_steps = x_train.shape[1]
    layers = [
        LSTM(units=50, return_sequences=True, input_shape=(n_steps, 1)),
        Dropout(0.2),
        LSTM(units=50, return_sequences=True),
        Dropout(0.2),
        LSTM(units=50, return_sequences=True),
        Dropout(0.2),
        LSTM(units=50),  # last recurrent layer emits only its final state
        Dropout(0.2),
        Dense(units=1),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


: 

In [None]:
model = create_model(X_train)

# The network is declared with input_shape=(n_features, 1), i.e. it expects
# 3-D input. Add the trailing axis explicitly instead of relying on the
# framework to reshape: (samples, n_features) -> (samples, n_features, 1).
# shuffle=False keeps the rows in their original time order.
model.fit(X_train[..., np.newaxis], y_train, epochs=20, batch_size=20, shuffle=False)

: 

In [None]:
# Predict on the test features, adding the trailing axis the network's
# declared input shape (n_features, 1) calls for.
y_pred = model.predict(X_test[..., np.newaxis])

: 

In [None]:
# Preview the first few predictions rather than printing the whole array
y_pred[:5]

: 