# Workshop - Data Science Projekt
## Use Packages

In [None]:
# Remote data access for pandas
import pandas_datareader as webreader
# Mathematical functions
import math
# Fundamental package for scientific computing with Python
import numpy as np
# Additional functions for analysing and manipulating data
import pandas as pd
# Date Functions
from datetime import date, timedelta, datetime
# Registers the pandas date converters with matplotlib so calendar dates can be plotted
from pandas.plotting import register_matplotlib_converters
# Important package for visualization - we use this to plot the market data
import matplotlib.pyplot as plt
# Formatting dates
import matplotlib.dates as mdates
# Packages for measuring model performance / errors
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Deep learning library, used for neural networks
from keras.models import Sequential
# Deep learning classes for recurrent and regular densely-connected layers
from keras.layers import LSTM, Dense, Dropout
# EarlyStopping during model training
from keras.callbacks import EarlyStopping
# This Scaler removes the median and scales the data according to the quantile range to normalize the price data
from sklearn.preprocessing import RobustScaler
# For detailed configuration of the optimizer
from keras.optimizers import Adam

## Dataset

In [None]:
# Time frame for the data extraction: from a fixed start date up to today.
# Both bounds are passed to the reader as 'YYYY-MM-DD' strings.
today = date.today()
date_today = today.strftime("%Y-%m-%d")
date_start = '2010-01-01'

# Download the quotes for the NASDAQ Composite index.
# `stockname` is only a display label; `symbol` is the ticker sent to the reader.
# NOTE(review): this performs a network request; the "yahoo" data source has been
# reported as unreliable in some pandas_datareader releases - verify it still works.
stockname = 'NASDAQ'
symbol = '^IXIC'
df = webreader.DataReader(
    symbol, start=date_start, end=date_today, data_source="yahoo"
)

# Work on a copy so the downloaded frame `df` stays untouched.
# The bare expression on the last line displays the frame as the notebook cell output.
train_dfs = df.copy()
train_dfs

## Plot Data

In [None]:
# Plot each column of train_dfs in its own subplot, arranged in a grid of `nrows` rows.
register_matplotlib_converters()
nrows = 3
# Round UP so every column gets an axis even when the column count is not a
# multiple of nrows (rounding to nearest could skip columns or overrun them).
ncols = max(1, math.ceil(train_dfs.shape[1] / nrows))
# squeeze=False keeps `ax` two-dimensional even when ncols == 1, so ax[i, j] always works.
fig, ax = plt.subplots(
    nrows=nrows, ncols=ncols, sharex=True, figsize=(16, 7), squeeze=False
)
fig.subplots_adjust(hspace=0.3, wspace=0.3)
x = train_dfs.index
columns = train_dfs.columns
f = 0  # running position in `columns`
for i in range(nrows):
    for j in range(ncols):
        if f >= len(columns):
            # The grid has more cells than there are columns: hide the spare axis.
            ax[i, j].set_visible(False)
            continue
        # One major tick per year on the date axis
        ax[i, j].xaxis.set_major_locator(mdates.YearLocator())
        assetname = columns[f]
        y = train_dfs[assetname]
        f += 1
        ax[i, j].plot(x, y, color='#039dfc', label=stockname, linewidth=1.0)
        ax[i, j].set_title(assetname)
        ax[i, j].tick_params(axis="x", rotation=90, labelsize=10, length=0)
plt.show()


## Some Data Wrangling

In [None]:
# Order the quotes chronologically (sorting on 'Date') and work on a copy
train_df = train_dfs.sort_values(by=['Date']).copy()

# Keep the date index around: it is replaced by a numeric index further down
date_index = train_df.index
date_index_df = pd.DataFrame(date_index)

# Derive zero-padded day, month and year strings from the dates, attach them as
# separate columns, then move the date index into a regular column so the rows
# end up with a plain numeric index
d = pd.to_datetime(train_df.index)
train_df = train_df.assign(
    Day=d.strftime("%d"),
    Month=d.strftime("%m"),
    Year=d.strftime("%Y"),
).reset_index(level=0)
train_df.tail(5)

## Feature Engineering

In [None]:
# Feature Engineering
def createFeatures(df: pd.DataFrame) -> pd.DataFrame:
    """Add technical-indicator columns computed from 'Close', 'High' and 'Low'.

    Columns added:
      * simple moving averages of Close: MA200, MA100, MA50, MA20
      * differences between those averages and Close: DIFF-*
      * rolling extremes and dispersion: MA200_low, MA14_low, MA200_high,
        MA14_high, MA20dSTD
      * exponential moving averages of Close: EMA12, EMA26, EMA100, EMA200
      * shifted closes: close_shift-1, close_shift-2
      * Bollinger bands: Bollinger_Upper, Bollinger_Lower

    Afterwards every NaN left anywhere in the frame (not only in the new
    columns) is replaced by the Close value of the row with the largest index
    label. An empty frame is returned with the new columns and no fill.

    Args:
        df: Data with at least the columns 'Close', 'High' and 'Low'; anything
            accepted by ``pd.DataFrame(...)`` works.

    Returns:
        The frame with the feature columns added.

    Raises:
        KeyError: If one of the required columns is missing.

    NOTE(review): ``pd.DataFrame(df)`` does not deep-copy an existing frame, so
    depending on the pandas version / copy-on-write mode the new columns may
    also show up on the caller's object - confirm before relying on either.
    """
    df = pd.DataFrame(df)
    close = df['Close']

    # Simple moving averages - different periods
    df['MA200'] = close.rolling(window=200).mean()
    df['MA100'] = close.rolling(window=100).mean()
    df['MA50'] = close.rolling(window=50).mean()
    df['MA20'] = close.rolling(window=20).mean()
    # Optional (disabled):
    # df['MA26'] = close.rolling(window=26).mean()
    # df['MA12'] = close.rolling(window=12).mean()

    # SMA differences - different periods
    df['DIFF-MA200-MA50'] = df['MA200'] - df['MA50']
    df['DIFF-MA200-MA100'] = df['MA200'] - df['MA100']
    df['DIFF-MA200-CLOSE'] = df['MA200'] - close
    df['DIFF-MA100-CLOSE'] = df['MA100'] - close
    df['DIFF-MA50-CLOSE'] = df['MA50'] - close

    # Rolling minima of Low, rolling maxima of High and rolling std of Close
    # (despite the "MA" prefix these are min/max/std, not averages)
    df['MA200_low'] = df['Low'].rolling(window=200).min()
    df['MA14_low'] = df['Low'].rolling(window=14).min()
    df['MA200_high'] = df['High'].rolling(window=200).max()
    df['MA14_high'] = df['High'].rolling(window=14).max()
    df['MA20dSTD'] = close.rolling(window=20).std()

    # Exponential moving averages (EMAs) - different periods
    df['EMA12'] = close.ewm(span=12, adjust=False).mean()
    df['EMA26'] = close.ewm(span=26, adjust=False).mean()
    df['EMA100'] = close.ewm(span=100, adjust=False).mean()
    df['EMA200'] = close.ewm(span=200, adjust=False).mean()
    # Optional (disabled):
    # df['EMA20'] = close.ewm(span=20, adjust=False).mean()

    # Shifted closes: shift(-1) / shift(-2) give each row the Close of the
    # row one / two positions AFTER it.
    # NOTE(review): for chronologically ordered rows these are future values
    # (look-ahead) if used as model inputs - confirm this is intended.
    df['close_shift-1'] = close.shift(-1)
    df['close_shift-2'] = close.shift(-2)

    # Bollinger bands: 20-period SMA +/- two rolling standard deviations
    df['Bollinger_Upper'] = df['MA20'] + (df['MA20dSTD'] * 2)
    df['Bollinger_Lower'] = df['MA20'] - (df['MA20dSTD'] * 2)

    # Optional (disabled): stochastic oscillator / StochRSI
    # df['K-ratio'] = 100 * ((close - df['MA14_low']) / (df['MA14_high'] - df['MA14_low']))
    # df['StochRSI'] = df['K-ratio'].rolling(window=3).mean()

    # Optional (disabled): Moving Average Convergence/Divergence (MACD)
    # df['MACD'] = df['EMA12'] - df['EMA26']

    # Replace NaNs (warm-up rows of the rolling windows, tail rows of the
    # shifts) with the Close at the largest index label. An empty frame has
    # no such label, so the fill is skipped instead of raising.
    if len(df.index) > 0:
        nareplace = df.at[df.index.max(), 'Close']
        df.fillna(nareplace, inplace=True)

    return df

## Create Dataset

In [None]:
# Build the modelling dataset: createFeatures returns the wrangled quotes in
# train_df with the technical-indicator feature columns added
data = createFeatures(train_df)