<a href="https://colab.research.google.com/github/aabdelmak/Dissertation/blob/master/Dissertation_saved.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from pandas.plotting import autocorrelation_plot
from pandas.plotting import scatter_matrix

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
from matplotlib import style
import seaborn as sns
import matplotlib
%matplotlib inline

plt.style.use("default")

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png')

import tensorflow as tf
import tensorflow.keras
import sklearn as sk
from sklearn import svm, neighbors, preprocessing
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, TimeSeriesSplit

from datetime import datetime as dt
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from pylab import rcParams

from subprocess import check_output
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
import time #helper libraries
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from numpy import newaxis

from __future__ import absolute_import, division, print_function, unicode_literals

import os
import datetime as dt
import bs4 as bs
import pickle
import requests
import time
import numpy as np
from collections import Counter



import pandas_datareader.data as pdr
!pip install yfinance
import yfinance as yf
yf.pdr_override

import prettytable as pt



In [None]:
def get_data_from_yahoo(tickers=None, out_dir="index_dfs"):
    """Download daily history for each ticker and cache it as a CSV file.

    Parameters
    ----------
    tickers : list of str, optional
        Symbols to download. Defaults to the ten indices used in this
        notebook.
    out_dir : str, optional
        Folder receiving one ``<ticker>.csv`` per symbol. Created if missing.

    Tickers whose CSV already exists in ``out_dir`` are skipped, so the
    function can be re-run without downloading everything again.
    The history starts on 1991-12-15 and ends at the time of the call.
    """
    if tickers is None:
        tickers = ["^GSPC", "^DJI", "^NYA", "^GDAXI", "^IXIC",
                   "^FCHI", "^HSI", "^N225", "^RUT", "^VIX"]

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(out_dir, exist_ok=True)

    # `dt` is the datetime *module* here (the later `import datetime as dt`
    # in the imports cell wins over the earlier `from datetime import ...`).
    start = dt.datetime(1991, 12, 15)
    end = dt.datetime.now()

    for ticker in tickers:
        print(ticker)
        csv_path = os.path.join(out_dir, "{}.csv".format(ticker))

        if os.path.exists(csv_path):
            print("Already have {}".format(ticker))
            continue

        # Pause before and after each request, as the original code did.
        time.sleep(3)
        df = pdr.get_data_yahoo(ticker, start, end)
        df = df.reset_index().set_index("Date")
        df.to_csv(csv_path)
        time.sleep(3)

get_data_from_yahoo()

def compile_data(tickers=None, in_dir="index_dfs", out_path="Indices.csv"):
    """Merge the per-ticker CSV files into one wide CSV.

    Parameters
    ----------
    tickers : list of str, optional
        Symbols to merge. Defaults to the ten indices used in this notebook.
    in_dir : str, optional
        Folder holding one ``<ticker>.csv`` per symbol.
    out_path : str, optional
        Destination of the combined CSV.

    Each input file is indexed by its "Date" column. The "Adj Close", "Open",
    "High", "Low" and "Volume" columns are prefixed with the ticker, the
    "Close" column is dropped, and the frames are outer-joined on the date.
    The result is written once, after all tickers have been merged; the
    previous version rewrote the file and printed the frame on every
    iteration.
    """
    if tickers is None:
        tickers = ["^GSPC", "^DJI", "^NYA", "^GDAXI", "^IXIC",
                   "^FCHI", "^HSI", "^N225", "^RUT", "^VIX"]

    prefixed = ("Adj Close", "Open", "High", "Low", "Volume")

    main_df = pd.DataFrame()
    for count, ticker in enumerate(tickers):
        df = pd.read_csv(os.path.join(in_dir, "{}.csv".format(ticker)),
                         index_col="Date")
        df = df.rename(
            columns={name: "{} {}".format(ticker, name) for name in prefixed}
        ).drop(columns="Close")

        # The first frame seeds the result; later ones are outer-joined so
        # that dates missing for one index are kept as NaN.
        main_df = df if main_df.empty else main_df.join(df, how="outer")

        if count % 10 == 0:
            print(count)

    print(main_df.head())
    main_df.to_csv(out_path)
# Build Indices.csv from the per-ticker CSV files in index_dfs/.
compile_data()



After compiling the CSV file with the desired tickers, we select the columns that we wish to load into a DataFrame.


In [None]:
# Tickers and the calendar date range shared by the cells below.
tickers = [
    "^GSPC", "^DJI", "^NYA", "^GDAXI", "^IXIC",
    "^FCHI", "^HSI", "^N225", "^RUT", "^VIX",
]
start_date, end_date = "1991-12-15", "2020-09-15"
dates = pd.date_range(start=start_date, end=end_date)
#select columns from data
def select_columns_from_data(data, ticker_list, column_names, date_index=None):
    """Pick "<ticker> <column>" columns out of ``data`` onto a date index.

    Parameters
    ----------
    data : pandas.DataFrame
        The compiled index data, either with a "Date" column (as returned by
        ``pd.read_csv("Indices.csv")``) or already indexed by date.
    ticker_list : list of str
        Ticker symbols to select.
    column_names : list of str
        Field names (e.g. "Adj Close") selected for every ticker.
    date_index : pandas.DatetimeIndex, optional
        Index of the result. Defaults to the module-level ``dates`` range.

    Returns
    -------
    pandas.DataFrame
        One column per ticker/field pair, ordered ticker-major. When "^GSPC"
        is requested, rows where any selected ^GSPC column is missing are
        dropped.

    The previous version ignored ``data`` and re-read the hard-coded
    "/content/Indices.csv" once per selected column.
    """
    if date_index is None:
        date_index = dates

    # Normalise `data` to a date-indexed frame without touching the caller's
    # object (set_index / copy both return new frames).
    source = data.set_index("Date") if "Date" in data.columns else data.copy()
    source.index = pd.to_datetime(source.index)

    wanted = ["{} {}".format(ticker, column_name)
              for ticker in ticker_list
              for column_name in column_names]
    df = pd.DataFrame(index=date_index).join(source[wanted])

    # Drop dates on which ^GSPC has no value for any selected field.
    gspc_columns = ["^GSPC {}".format(column_name)
                    for column_name in column_names]
    if "^GSPC" in ticker_list and gspc_columns:
        df = df.dropna(subset=gspc_columns)

    return df
    
# Read the compiled CSV once and reuse it for both selections.
indices_raw = pd.read_csv("Indices.csv")
price_data = select_columns_from_data(indices_raw, tickers,
                         ["Adj Close", "High", "Low", "Open"])
volume_data = select_columns_from_data(indices_raw, tickers,
                                       ["Volume"])

# Fill NA values: forward-fill first, then back-fill whatever is still
# missing at the start of each series. DataFrame.ffill/bfill replace the
# deprecated fillna(method=...) spelling.
# NOTE(review): back-filling copies later values into earlier rows; confirm
# this is acceptable if these frames feed a forecasting model.
price_data = price_data.ffill().bfill()
volume_data = volume_data.ffill().bfill()

#price_data[:252] #252
#price_data[253:500] #247
price_data

# Exploratory Data Analysis

In [None]:

# Font settings passed as `fontdict` to titles and axis labels below.
font = dict(family='Serif', color='black', weight='normal', size=40)

def plot_data(df, savefig, label):
    """Draw every column of ``df`` as a line, save the figure and show it.

    df      -- frame to plot; the x-limits are derived from
               ``df.index.values``, so the index is expected to hold dates.
    savefig -- path handed to ``plt.savefig``.
    label   -- text used as the figure title.

    Note: ``plot_data`` is defined again further down the notebook; once
    those cells run, they replace this version.
    """
    def _with_thousands_separator(value, _position):
        return format(int(value), ',')

    axes = df.plot(grid=True, figsize=(20, 10), cmap="Spectral")

    # x-axis spans from the year of the 23rd row to the year after the last.
    index_values = df.index.values
    first_year = np.datetime64(index_values[22], 'Y')
    last_year = np.datetime64(index_values[-1], 'Y') + np.timedelta64(1, 'Y')
    axes.set_xlim(first_year, last_year)

    axes.set_xlabel("Date", fontsize=30, fontdict=font)
    axes.set_ylabel("Price", fontsize=30, fontdict=font)
    plt.title("{}".format(label), fontdict=font)

    # One major tick every 52 months, formatted as day-month-year.
    axes.xaxis.set_major_locator(mdates.MonthLocator(interval=52))
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
    axes.get_yaxis().set_major_formatter(
        matplotlib.ticker.FuncFormatter(_with_thousands_separator))

    plt.gcf().autofmt_xdate()
    plt.xticks(rotation=0)

    # plt.rc updates matplotlib's rc settings, not just this figure.
    plt.rc("legend", fontsize=20)
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=20)
    axes.legend(fontsize=20)

    plt.savefig(savefig)
    plt.show()

#define closing_prices for plot
def get_closing_data_for_plot(tickers, from_data, column):
  """Move the "<ticker> <column>" columns out of ``from_data`` into a new frame.

  The columns are *popped*, i.e. removed from ``from_data`` itself, so a
  second call with the same frame raises KeyError. The result shares
  ``from_data``'s index and keeps the order of ``tickers``.
  """
  extracted = pd.DataFrame(index=from_data.index)
  for symbol in tickers:
    column_name = "{} {}".format(symbol, column)
    extracted = extracted.join(from_data.pop(column_name))
  return extracted

# get_closing_data_for_plot pops the "Adj Close" columns out of price_data,
# so this cell can only run once per load of price_data.
closing_data = get_closing_data_for_plot(tickers, price_data, "Adj Close")

# Split the VIX series off into its own single-column frame.
vix_closing_data = closing_data.pop("^VIX Adj Close").to_frame()

In [None]:
# Re-attach the VIX column (vix_closing_data holds only that column).
# Running this cell twice raises, because the column would already exist.
closing_data = closing_data.join(vix_closing_data)

In [None]:
#scaler = MinMaxScaler(feature_range = (0,1))
#scaled_price_data = scaler.fit_transform(price_data) 
#scaled_price_data= pd.DataFrame(scaled_price_data, index = price_data.index, columns = price_data.columns)

#names = ["High", "Low","Open"]
#counter = 0

#scaled_price_data


In [None]:
plot_data(closing_data, "Adj Close.pdf","Adj Closing Prices for Selected Stock Indices")

# Scale every series by its own maximum so all indices share a 0-1 range.
# DataFrame.max() is vectorised and skips NaN, unlike the built-in max()
# applied to a Series.
scaled_data = closing_data / closing_data.max()
scaled_data = scaled_data.rename(
    columns={"{} Adj Close".format(ticker): "{} Scaled Close".format(ticker)
             for ticker in tickers})
scaled_data



def plot_data(df, savefig, label):
    """Draw every column of ``df`` as a line, save the figure and show it.

    Second definition of ``plot_data`` in this notebook: it replaces the
    earlier one, uses a 54-month tick spacing and leaves the y-axis tick
    format at matplotlib's default.

    df      -- frame to plot; x-limits come from ``df.index.values``.
    savefig -- path handed to ``plt.savefig``.
    label   -- text used as the figure title.
    """
    axes = df.plot(grid=True, figsize=(20, 10), cmap="Spectral")

    # x-axis spans from the year of the 23rd row to the year after the last.
    index_values = df.index.values
    first_year = np.datetime64(index_values[22], 'Y')
    last_year = np.datetime64(index_values[-1], 'Y') + np.timedelta64(1, 'Y')
    axes.set_xlim(first_year, last_year)

    axes.set_xlabel("Date", fontsize=30, fontdict=font)
    axes.set_ylabel("Price", fontsize=30, fontdict=font)
    plt.title("{}".format(label), fontdict=font)

    axes.xaxis.set_major_locator(mdates.MonthLocator(interval=54))
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))

    plt.gcf().autofmt_xdate()
    plt.xticks(rotation=0)

    # plt.rc updates matplotlib's rc settings, not just this figure.
    plt.rc("legend", fontsize=20)
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=20)
    axes.legend(fontsize=20)

    plt.savefig(savefig)
    plt.show()


plot_data(scaled_data, "Scaled Close.pdf",
          "Scaled Closing Prices for Selected Stock Indices")


def plot_autocorrelation(df, savefig, tickerlist, datacolumn):
  """Overlay the autocorrelation plots of several tickers and save the figure.

  df         -- frame holding one "<ticker> <datacolumn>" column per ticker.
  savefig    -- path handed to ``plt.savefig``.
  tickerlist -- tickers to plot; must not be empty, since the axes returned
                for the last ticker are used for the labels.
  datacolumn -- field name completing the column label, e.g. "Adj Close".
  """
  figure = plt.figure()
  figure.set_size_inches(20, 10)

  for symbol in tickerlist:
    series = df["{} {}".format(symbol, datacolumn)]
    axes = autocorrelation_plot(series, label="{}".format(symbol))

  axes.set_xlabel("Lags", fontsize=30, fontdict=font)
  axes.set_ylabel("Autocorrelation", fontsize=30, fontdict=font)

  plt.title("Autocorrelation Plot of Selected Indices", fontdict=font)
  plt.legend(loc="upper right")
  plt.savefig(savefig)

#plot_autocorrelation(df = closing_data,
 #                    savefig = "Autocorrelation Plot of Closing Data.pdf",
  #                   tickerlist = tickers, datacolumn = "Adj Close")

def plot_scatter_matrix(df, savefig, tickerlist, datacolumn):
    """Draw a scatter matrix of one field across tickers, save it and show it.

    df         -- frame holding one "<ticker> <datacolumn>" column per ticker.
    savefig    -- path handed to ``plt.savefig``.
    tickerlist -- tickers to include; any length (the previous version
                  indexed exactly ten entries).
    datacolumn -- field name completing the column label, e.g. "Daily Return".

    The diagonal shows a kernel density estimate for each series. The earlier
    ``plt.savefig("savefig1.pdf")`` call, which ran before anything had been
    drawn, has been removed.
    """
    columns = ["{} {}".format(ticker, datacolumn) for ticker in tickerlist]
    plot_frame = df[columns]
    scatter_matrix(plot_frame, figsize=(20, 20), diagonal="kde")
    plt.savefig(savefig)
    plt.show()
plot_autocorrelation(closing_data, "Autocorrelation Closing data.pdf",tickers, "Adj Close")
#plot_frame.describe()

In [None]:
#plot_scatter_matrix(df = closing_data,
 #                   savefig = "Scatter Plot of Closing Data.pdf",
  #                  tickerlist = tickers, datacolumn = "Adj Close")

In [None]:
def plot_data(df, savefig, label):
    """Draw every column of ``df`` as a line, save the figure and show it.

    Third definition of ``plot_data`` in this notebook: it replaces the
    earlier ones, uses the default colour cycle, lower-case axis labels and
    a 25-month tick spacing.

    df      -- frame to plot; x-limits come from ``df.index.values``.
    savefig -- path handed to ``plt.savefig``.
    label   -- text used as the figure title.
    """
    def _with_thousands_separator(value, _position):
        return format(int(value), ',')

    axes = df.plot(grid=True, figsize=(20, 10))

    # x-axis spans from the year of the 23rd row to the year after the last.
    index_values = df.index.values
    first_year = np.datetime64(index_values[22], 'Y')
    last_year = np.datetime64(index_values[-1], 'Y') + np.timedelta64(1, 'Y')
    axes.set_xlim(first_year, last_year)

    axes.set_xlabel("date", fontsize=30, fontdict=font)
    axes.set_ylabel("price", fontsize=30, fontdict=font)
    plt.title("{}".format(label), fontdict=font)

    axes.xaxis.set_major_locator(mdates.MonthLocator(interval=25))
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
    axes.get_yaxis().set_major_formatter(
        matplotlib.ticker.FuncFormatter(_with_thousands_separator))

    plt.gcf().autofmt_xdate()

    # plt.rc updates matplotlib's rc settings, not just this figure.
    plt.rc("legend", fontsize=20)
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=20)
    axes.legend(fontsize=20)

    plt.savefig(savefig)
    plt.show()



In [None]:
#plot_data(vix_closing_data,"VIX Adj Close.pdf", "Adj Closing Prices for Volatility Index (VIX)")

#price_data, volume_data


In [None]:
def compute_daily_returns(df, tickerlist):
    """Append one-period simple returns to a frame of adjusted closes.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns named "<ticker> Adj Close".
    tickerlist : list of str
        Tickers whose return columns are renamed to "<ticker> Daily Return".
        (The previous version read the global ``tickers`` here and ignored
        this argument.)

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the return columns. Rows for which a return cannot be
        computed for every column (at least the first row) hold NaN in the
        return columns.
    """
    # return_t = price_t / price_(t-1) - 1, computed for all columns at once.
    daily = (df / df.shift() - 1).dropna()
    daily = daily.rename(
        columns={"{} Adj Close".format(ticker): "{} Daily Return".format(ticker)
                 for ticker in tickerlist})
    # join() returns a new frame, so the caller's df is left untouched.
    return df.join(daily)

def compute_cumulative_returns(df, tickers, final_dataset):
  """Join cumulative returns (relative to the first row of ``df``) onto a frame.

  df            -- frame with "<ticker> Adj Close" columns.
  tickers       -- tickers whose columns are renamed to
                   "<ticker> Cumulative Returns".
  final_dataset -- frame the new columns are joined onto; a new frame is
                   returned.
  """
  first_row = df.iloc[0].values
  cumulative = (df / first_row - 1).dropna()
  renamed = cumulative.rename(
      columns={"{} Adj Close".format(symbol): "{} Cumulative Returns".format(symbol)
               for symbol in tickers})
  return final_dataset.join(renamed)


tickers = ["^VIX","^GSPC", "^DJI", "^NYA", "^GDAXI", "^IXIC",
           "^FCHI", "^HSI", "^N225", "^RUT"]

def Kurtosis(data, tickerlist):
    """Plot a daily-return histogram per ticker, titled with its kurtosis.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "<ticker> Daily Return" column for every ticker.
    tickerlist : list of str
        Tickers to plot, two panels per row.

    The figure is saved to "Return Histograms.pdf" and shown.

    Each panel's title is built from the ticker it actually shows. The
    previous hard-coded titles did not follow the ticker order (the first
    panel was titled "GSPC" whatever ``tickerlist[0]`` was, and "RUT"
    appeared twice). All histograms are now drawn as densities; before,
    only some panels were normalised.
    """
    if len(tickerlist) == 0:
        raise ValueError("tickerlist must contain at least one ticker")

    n_bins = 50
    n_cols = 2
    n_rows = -(-len(tickerlist) // n_cols)  # ceiling division
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 16),
                             squeeze=False)
    panels = list(axes.flat)

    for ax, ticker in zip(panels, tickerlist):
        # Drop missing returns (e.g. the first row) before binning.
        returns = data["{} Daily Return".format(ticker)].dropna()
        ax.hist(returns, n_bins, density=True, histtype='bar',
                label=str(ticker))
        ax.legend(prop={'size': 20})
        ax.set_title('{} Daily Return Histogram, Kurtosis: {:.2f}'.format(
            str(ticker).lstrip("^"), returns.kurtosis()), fontsize=20)

    # Hide the spare panel when the number of tickers is odd.
    for spare in panels[len(tickerlist):]:
        spare.set_visible(False)

    fig.tight_layout()
    plt.savefig("Return Histograms.pdf")
    plt.show()

    #Compute kurtosis
      

#########################################################
tickers = [
    "^VIX", "^GSPC", "^DJI", "^NYA", "^GDAXI", "^IXIC",
    "^FCHI", "^HSI", "^N225", "^RUT",
]

# Daily returns are appended to the closing prices first; cumulative returns
# are then joined onto that same frame.
returns_data = compute_cumulative_returns(
    closing_data, tickers, compute_daily_returns(closing_data, tickers))

#######################################################
def plot_return_data(df, savefig, label, title):
    """Draw every column of ``df`` as a line, save the figure and show it.

    df      -- frame of return series; x-limits come from ``df.index.values``.
    savefig -- path handed to ``plt.savefig``.
    label   -- accepted for the callers' sake but not used in the body.
    title   -- text used as the figure title.
    """
    axes = df.plot(grid=True, figsize=(20, 10), cmap="Spectral")

    # x-axis spans from the year of the 23rd row to the year after the last.
    index_values = df.index.values
    first_year = np.datetime64(index_values[22], 'Y')
    last_year = np.datetime64(index_values[-1], 'Y') + np.timedelta64(1, 'Y')
    axes.set_xlim(first_year, last_year)

    axes.set_xlabel("Date", fontsize=30, fontdict=font)
    axes.set_ylabel("Return Value", fontsize=30, fontdict=font)
    plt.title("{}".format(title), fontdict=font)

    axes.xaxis.set_major_locator(mdates.MonthLocator(interval=54))
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))

    plt.gcf().autofmt_xdate()
    plt.xticks(rotation=0)

    # plt.rc updates matplotlib's rc settings, not just this figure.
    plt.rc("legend", fontsize=20)
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=20)
    axes.legend(loc="upper left", fontsize=20)

    plt.savefig(savefig)
    plt.show()
# NOTE(review): the column subsets below are positional. Assuming
# returns_data holds 10 price, 10 daily-return and 10 cumulative-return
# columns, the first slice covers 9 of the daily-return columns and the
# second covers 8 of the cumulative-return columns -- confirm that the
# omitted series are left out on purpose.
plot_return_data(
    returns_data.iloc[:, 10:-11],
    "Stock Daily Returns.pdf",
    label=tickers,
    title="Plot of Daily Returns for Selected Stock Indices")
plot_return_data(
    returns_data.iloc[:, -10:-2],
    "Stock Cumulative Returns.pdf",
    label=tickers,
    title="Plot of Cumulative Returns for Selected Stock Indices")

plot_autocorrelation(returns_data, "Autocorrelation of Daily Returns.pdf",
                     tickers, "Daily Return")

Kurtosis(returns_data, tickers)
returns_data

In [None]:
plot_scatter_matrix(df=returns_data,
                    savefig="Scatter Matrix of Daily Returns.pdf",
                    tickerlist=tickers,
                    datacolumn="Daily Return")


In [None]:
# Font settings passed as `fontdict` to the titles and axis labels below
# (same as the earlier definition except for the lower-case family name).
font = dict(family='serif', color='black', weight='normal', size=40)

################################################################################

#DEFINE ROLLING STATISTICS FUNCTIONS:

def get_MA(values, windows):
  """Store the rolling mean of ``values`` in the global ``returns_data``.

  One column "^GSPC_<window>d_MA" is written per entry of ``windows``.
  """
  for span in windows:
    column = "^GSPC_{}d_MA".format(str(span))
    returns_data[column] = values.rolling(span).mean()

def get_rolling_std(values, windows):
  """Store the rolling standard deviation of ``values`` in ``returns_data``.

  One column "^GSPC_<window>d_RSTD" is written per entry of ``windows``.
  """
  for span in windows:
    column = "^GSPC_{}d_RSTD".format(str(span))
    returns_data[column] = values.rolling(span).std()

def get_bollinger_bands(windows, num_std=2):
    """Store upper and lower Bollinger bands in the global ``returns_data``.

    Parameters
    ----------
    windows : iterable of int
        Window lengths. For each one, the columns "^GSPC_<window>d_MA" and
        "^GSPC_<window>d_RSTD" must already exist in ``returns_data``.
    num_std : float, optional
        Number of rolling standard deviations between the moving average and
        each band. Defaults to the conventional 2. The previous code
        hard-coded 20, which pushes the bands far away from the price; pass
        ``num_std=20`` to reproduce the old figures.

    Writes "upper_band_<window>d" and "lower_band_<window>d" columns.
    """
    for window in windows:
        centre = returns_data["^GSPC_{}d_MA".format(window)]
        offset = num_std * returns_data["^GSPC_{}d_RSTD".format(window)]
        returns_data["upper_band_{}d".format(window)] = centre + offset
        returns_data["lower_band_{}d".format(window)] = centre - offset

def get_EMA(values, windows):
  """Store exponential moving averages of ``values`` in ``returns_data``.

  values  -- series to smooth. The previous version ignored this argument
             and always read returns_data["^GSPC Adj Close"].
  windows -- spans; one column "^GSPC_<window>d_EMA" is written per span,
             computed with ``adjust=False``.
  """
  for window in windows:
    returns_data["^GSPC_{}d_EMA".format(str(window))] = values.ewm(
        span = window, adjust = False).mean()

def get_max(values, windows):
  """Store the rolling maximum of ``values`` in the global ``returns_data``.

  One column "^GSPC_<window>d_max" is written per entry of ``windows``.
  """
  for span in windows:
    column = "^GSPC_{}d_max".format(str(span))
    returns_data[column] = values.rolling(span).max()  # rolling max

def get_min(values, windows):
  """Store the rolling minimum of ``values`` in the global ``returns_data``.

  One column "^GSPC_<window>d_min" is written per entry of ``windows``.
  """
  for span in windows:
    column = "^GSPC_{}d_min".format(str(span))
    returns_data[column] = values.rolling(span).min()  # rolling min

def get_OSCP(values):
  """Store the price oscillator of ``values`` in returns_data["^GSPC_OSCP"].

  Computed as (5-period mean - 10-period mean) / 5-period mean.
  """
  short_mean = values.rolling(5).mean()
  long_mean = values.rolling(10).mean()
  returns_data["^GSPC_OSCP"] = (short_mean - long_mean) / short_mean
def get_MACD(values):
  """Store the MACD of ``values`` in returns_data["^GSPC_MACD"].

  MACD here is the 12-span EMA minus the 26-span EMA, both computed with
  ``adjust=False``.
  """
  fast_ema = values.ewm(span=12, adjust=False).mean()
  slow_ema = values.ewm(span=26, adjust=False).mean()
  returns_data["^GSPC_MACD"] = fast_ema - slow_ema

def get_signal_line(values):
    """Store the 9-span EMA of ``values`` in returns_data["^GSPC signal line"]."""
    smoothed = values.ewm(span=9, adjust=False).mean()
    returns_data["^GSPC signal line"] = smoothed

################################################################################

#COMPUTE ROLLING STATISTICS#

# Window lengths (in rows) shared by all rolling statistics below.
ROLLING_WINDOWS = [3, 5, 10, 21]

get_MA(returns_data["^GSPC Adj Close"], ROLLING_WINDOWS)

#MA5, MA10, MA60, MA120
#1 week, 2 weeks, monthly, quarterly, half-year

get_rolling_std(returns_data["^GSPC Adj Close"], ROLLING_WINDOWS)

# Needs the MA and RSTD columns written by the two calls above.
get_bollinger_bands(ROLLING_WINDOWS)

get_EMA(returns_data["^GSPC Adj Close"], ROLLING_WINDOWS)

get_max(returns_data["^GSPC Adj Close"], ROLLING_WINDOWS)
#
get_min(returns_data["^GSPC Adj Close"], ROLLING_WINDOWS)

get_MACD(returns_data["^GSPC Adj Close"])

# The signal line is an EMA of the MACD column written just above.
get_signal_line(returns_data["^GSPC_MACD"])

################################################################################

#PLOT ROLLING STATISTICS#

#BOLLINGER BANDS
def plot_bbands(bbands, label):
    """Plot the ^GSPC adjusted close with an upper and a lower band.

    bbands -- sequence whose first item is the upper band and whose second
              item is the lower band (both plottable series).
    label  -- legend label for the price line.

    Saves the figure to "Monthly Bollinger Bands.pdf" and shows it.
    """
    def _with_thousands_separator(value, _position):
        return format(int(value), ',')

    axes = returns_data["^GSPC Adj Close"].plot(
        title="GSPC Monthly Bollinger Band Plot", label=label,
        figsize=(20, 10), color="black")
    plt.title("GSPC Monthly Bollinger Band Plot", fontdict=font)

    upper_band = bbands[0]
    upper_band.plot(label="upper band", ax=axes, color="orchid")
    lower_band = bbands[1]
    lower_band.plot(label="lower band", ax=axes, color="cornflowerblue")

    axes.get_yaxis().set_major_formatter(
        matplotlib.ticker.FuncFormatter(_with_thousands_separator))
    axes.set_xlabel("Date", fontsize=40, fontdict=font)
    axes.set_ylabel("Price", fontsize=40, fontdict=font)

    # plt.rc updates matplotlib's rc settings, not just this figure.
    plt.rc("legend", fontsize=30)
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=20)
    axes.legend(loc="upper left", fontsize=20)

    axes.xaxis.set_major_locator(mdates.MonthLocator(interval=54))
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
    plt.gcf().autofmt_xdate()

    # x-axis spans from the year of the first row to the year after the last.
    index_values = returns_data.index.values
    first_year = np.datetime64(index_values[0], 'Y')
    last_year = np.datetime64(index_values[-1], 'Y') + np.timedelta64(1, 'Y')
    axes.set_xlim(first_year, last_year)
    plt.xticks(rotation=0)

    plt.savefig("Monthly Bollinger Bands.pdf")
    plt.show()

plot_bbands(bbands=[returns_data["upper_band_10d"],
                    returns_data["lower_band_10d"]],
            label="^GSPC")

#MOVING AVERAGE  

def plot_MA(rm, label):
    """Plot the ^GSPC adjusted close together with a moving-average series.

    rm    -- moving-average series drawn on top of the price.
    label -- legend label for the price line.

    Saves the figure to "Monthly Moving Average.pdf" and shows it.
    """
    def _with_thousands_separator(value, _position):
        return format(int(value), ',')

    axes = returns_data["^GSPC Adj Close"].plot(
        title="GSPC Monthly Moving Average", label=label,
        figsize=(20, 10), color="black")
    plt.title("GSPC Monthly Moving Average", fontdict=font)
    rm.plot(label="Moving Average", ax=axes,
            color="cornflowerblue", alpha=1)

    axes.set_xlabel("Date", fontsize=30, fontdict=font)
    axes.set_ylabel("Price", fontsize=30, fontdict=font)

    # plt.rc updates matplotlib's rc settings, not just this figure.
    plt.rc("legend", fontsize=30)
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=20)
    axes.legend(loc="upper left", fontsize=20)

    axes.get_yaxis().set_major_formatter(
        matplotlib.ticker.FuncFormatter(_with_thousands_separator))
    axes.xaxis.set_major_locator(mdates.MonthLocator(interval=54))
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
    plt.gcf().autofmt_xdate()

    # x-axis spans from the year of the first row to the year after the last.
    index_values = returns_data.index.values
    first_year = np.datetime64(index_values[0], 'Y')
    last_year = np.datetime64(index_values[-1], 'Y') + np.timedelta64(1, 'Y')
    axes.set_xlim(first_year, last_year)
    plt.xticks(rotation=0)

    plt.savefig("Monthly Moving Average.pdf")
    plt.show()


plot_MA(rm=returns_data["^GSPC_21d_MA"], label="^GSPC")

#EXPONENTIAL MOVING AVERAGE PLOT
def plot_EMA(rm, label):
    """Plot the ^GSPC adjusted close with an exponential moving average.

    rm    -- exponential-moving-average series drawn on top of the price.
    label -- legend label for the price line.

    Saves the figure to "Monthly Exponential Moving Average.pdf" and shows it.
    """
    def _with_thousands_separator(value, _position):
        return format(int(value), ',')

    axes = returns_data["^GSPC Adj Close"].plot(
        title="GSPC Exponentially Moving Average", label=label,
        figsize=(20, 10), color="black")
    plt.title("GSPC Monthly Exponentially Moving Average", fontdict=font)
    rm.plot(label="Exponential Moving Average", ax=axes,
            color="cornflowerblue", alpha=1)

    axes.set_xlabel("Date", fontsize=30, fontdict=font)
    axes.set_ylabel("Price", fontsize=30, fontdict=font)

    # plt.rc updates matplotlib's rc settings, not just this figure.
    plt.rc("legend", fontsize=30)
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=20)
    axes.legend(loc="upper left", fontsize=20)

    axes.xaxis.set_major_locator(mdates.MonthLocator(interval=54))
    axes.get_yaxis().set_major_formatter(
        matplotlib.ticker.FuncFormatter(_with_thousands_separator))
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
    plt.gcf().autofmt_xdate()

    # x-axis spans from the year of the first row to the year after the last.
    index_values = returns_data.index.values
    first_year = np.datetime64(index_values[0], 'Y')
    last_year = np.datetime64(index_values[-1], 'Y') + np.timedelta64(1, 'Y')
    axes.set_xlim(first_year, last_year)
    plt.xticks(rotation=0)

    plt.savefig("Monthly Exponential Moving Average.pdf")
    plt.show()

plot_EMA(rm=returns_data["^GSPC_21d_EMA"], label="^GSPC")

#ROLLING MAX CLOSE

def plot_max(rm, label):
    """Plot the ^GSPC adjusted close together with a rolling-maximum series.

    rm    -- rolling-maximum series drawn on top of the price.
    label -- legend label for the price line.

    Saves the figure to "Monthly Rolling Max.pdf" and shows it.
    """
    def _with_thousands_separator(value, _position):
        return format(int(value), ',')

    axes = returns_data["^GSPC Adj Close"].plot(
        title="GSPC Monthly Rolling Max ", label=label,
        figsize=(20, 10), color="black")
    plt.title("GSPC Monthly Rolling Max", fontdict=font)
    rm.plot(label="Monthly Rolling Max", ax=axes,
            color="#9a0200", alpha=1)

    axes.set_xlabel("Date", fontsize=30, fontdict=font)
    axes.set_ylabel("Price", fontsize=30, fontdict=font)

    # plt.rc updates matplotlib's rc settings, not just this figure.
    plt.rc("legend", fontsize=30)
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=20)
    axes.legend(loc="upper left", fontsize=20)

    axes.xaxis.set_major_locator(mdates.MonthLocator(interval=54))
    axes.get_yaxis().set_major_formatter(
        matplotlib.ticker.FuncFormatter(_with_thousands_separator))
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
    plt.gcf().autofmt_xdate()

    # x-axis spans from the year of the first row to the year after the last.
    index_values = returns_data.index.values
    first_year = np.datetime64(index_values[0], 'Y')
    last_year = np.datetime64(index_values[-1], 'Y') + np.timedelta64(1, 'Y')
    axes.set_xlim(first_year, last_year)
    plt.xticks(rotation=0)

    plt.savefig("Monthly Rolling Max.pdf")
    plt.show()

plot_max(rm=returns_data["^GSPC_21d_max"], label="^GSPC")

#OSCP (MOMENTUM INDICATOR) 


#OSCP (MOMENTUM INDICATOR) 

def plot_MACD(rm, label):
    """Plot returns_data["^GSPC_MACD"] together with its signal line.

    rm    -- signal-line series drawn on top of the MACD.
    label -- legend label for the MACD line.

    Saves the figure to "GSPC MACD PLOT.pdf" and shows it.
    """
    axes = returns_data["^GSPC_MACD"].plot(
        title="GSPC Moving Average Convergence/Divergence ", label=label,
        figsize=(20, 10), color="black")
    plt.title("GSPC Moving Average Convergence/Divergence", fontdict=font)
    rm.plot(label="Signal Line", ax=axes,
            color="#9a0200", alpha=1)

    axes.set_xlabel("Date", fontsize=30, fontdict=font)
    axes.set_ylabel("MACD Values", fontsize=30, fontdict=font)

    # plt.rc updates matplotlib's rc settings, not just this figure.
    plt.rc("legend", fontsize=30)
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=20)
    axes.legend(loc="upper left", fontsize=20)

    axes.xaxis.set_major_locator(mdates.MonthLocator(interval=54))
    plt.gcf().autofmt_xdate()

    # x-axis spans from the year of the first row to the year after the last.
    index_values = returns_data.index.values
    first_year = np.datetime64(index_values[0], 'Y')
    last_year = np.datetime64(index_values[-1], 'Y') + np.timedelta64(1, 'Y')
    axes.set_xlim(first_year, last_year)
    plt.xticks(rotation=0)

    plt.savefig("GSPC MACD PLOT.pdf")
    plt.show()

plot_MACD(returns_data["^GSPC signal line"], "^GSPC MACD")




In [None]:
def computeRSI(data, time_window):
    """Return the Relative Strength Index of ``data``.

    data        -- price series.
    time_window -- look-back length; used as ``com = time_window - 1`` (so
                   the decay is alpha = 1 / time_window) and as the minimum
                   number of observations before a value is produced.

    The result is indexed like ``data`` without its first row, because the
    one-step difference of the first row is undefined and dropped.
    """
    delta = data.diff(1).dropna()

    # Gains keep the positive moves, losses keep the negative ones; every
    # other entry is zero.
    gains = delta.where(delta > 0, 0.0)
    losses = delta.where(delta < 0, 0.0)

    smoothing = dict(com=time_window - 1, min_periods=time_window)
    average_gain = gains.ewm(**smoothing).mean()
    average_loss = losses.ewm(**smoothing).mean()

    relative_strength = (average_gain / average_loss).abs()
    return 100 - 100 / (1 + relative_strength)


def computeSO(close_data, low_data, high_data, n):
  """Return the stochastic oscillator (%K, %D) over an ``n``-row window.

  %K = (close - lowest low) / (highest high - lowest low) * 100, where the
  extremes are taken over the last ``n`` rows; %D is the 3-row rolling mean
  of %K.
  """
  lowest_low = low_data.rolling(n).min()
  highest_high = high_data.rolling(n).max()

  KPct = ((close_data - lowest_low) / (highest_high - lowest_low)) * 100
  DPct = KPct.rolling(3).mean()
  return KPct, DPct

def computeWRPct(close_data, low_data, high_data, n):
    """Return Williams %R over an ``n``-row window.

    %R = (highest high - close) / (highest high - lowest low) * -100, with
    the extremes taken over the last ``n`` rows.
    """
    highest_high = high_data.rolling(n).max()
    lowest_low = low_data.rolling(n).min()
    RPct = ((highest_high - close_data) / (highest_high - lowest_low)) * -100
    return RPct



In [None]:
# 14-row momentum indicators for ^GSPC, stored as new columns of returns_data.
returns_data["^GSPC_14d_RSI"] = computeRSI(
    returns_data["^GSPC Adj Close"], time_window=14)

returns_data["^GSPC_14d_SO%K"], returns_data["^GSPC_14d_SO%D"] = computeSO(
    close_data=returns_data["^GSPC Adj Close"],
    low_data=price_data["^GSPC Low"],
    high_data=price_data["^GSPC High"],
    n=14)

returns_data["^GSPC_14d_WR%"] = computeWRPct(
    close_data=returns_data["^GSPC Adj Close"],
    low_data=price_data["^GSPC Low"],
    high_data=price_data["^GSPC High"],
    n=14)

returns_data

In [None]:
def plot_RSI(RSI, label):
    """Plot an RSI series with dashed guide lines at 0, 20, 30, 70, 80, 100.

    RSI   -- series to plot.
    label -- legend label for the series.

    Saves the figure to "GSPC Relative Strength.pdf" and shows it.
    """
    axes = RSI.plot(title="GSPC", label=label,
                    figsize=(20, 10), color="#5079bc")
    plt.title("GSPC 14-Day Relative Strength Index", fontdict=font)
    axes.set_xlabel("Date", fontsize=30, fontdict=font)
    axes.set_ylabel("Relative Strength Value", fontsize=30, fontdict=font)
    axes.legend(loc="upper left", fontsize=20)

    # plt.rc updates matplotlib's rc settings, not just this figure.
    plt.rc("legend", fontsize=30)
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=20)

    axes.xaxis.set_major_locator(mdates.MonthLocator(interval=56))
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
    plt.gcf().autofmt_xdate()

    # x-axis spans from the year of the first row to the year after the last.
    index_values = returns_data.index.values
    first_year = np.datetime64(index_values[0], 'Y')
    last_year = np.datetime64(index_values[-1], 'Y') + np.timedelta64(1, 'Y')
    axes.set_xlim(first_year, last_year)
    plt.xticks(rotation=0)

    # Guide lines: the 30/70 pair is drawn fully opaque, the others fade
    # towards the edges of the 0-100 range.
    guide_lines = (
        (0, {"alpha": 0.1}),
        (20, {"alpha": 0.5}),
        (30, {}),
        (70, {}),
        (80, {"alpha": 0.5}),
        (100, {"alpha": 0.1}),
    )
    for level, extra in guide_lines:
        plt.axhline(level, linestyle='--', color="black", **extra)

    plt.savefig("GSPC Relative Strength.pdf")
    plt.show()

plot_RSI(returns_data["^GSPC_14d_RSI"],
         label="Relative Strength Index for ^GSPC")

# plot corresponding RSI values and significant levels

plt.show()


################

def plot_SO(SO, label):
    """Plot the 14-day stochastic oscillator (%K and %D) and save it as a PDF.

    Parameters
    ----------
    SO : pandas.DataFrame
        Frame containing the "^GSPC_14d_SO%K" and "^GSPC_14d_SO%D" columns;
        its index is used as the (date) x-axis.
    label : list of str
        Legend labels, one per plotted column.

    Side effects
    ------------
    Writes "GSPC Stochastic Oscillator.pdf", shows the figure and updates the
    global legend / tick-label rc settings.
    """
    ax = SO.plot(y = ["^GSPC_14d_SO%K", "^GSPC_14d_SO%D"], figsize = (20, 10),
                 color = ["cornflowerblue", "orchid"], label = label)
    plt.title("GSPC Stochastic Oscillator (Momentum Measure)", fontdict = font)
    ax.set_xlabel("Date",  fontsize = 30, fontdict = font)
    ax.set_ylabel("Price", fontsize = 30, fontdict = font)
    ax.legend(fontsize = 20,)

    # rc settings are global, so they also carry over to figures drawn later.
    plt.rc("legend", fontsize = 30)
    plt.rc('xtick', labelsize=20)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=20)

    # One dated tick every 52 months.
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval = 52))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
    plt.gcf().autofmt_xdate()
    plt.xticks(rotation = 0)

    # Save BEFORE show(): once the figure has been shown it may already be
    # closed, and a later savefig would then write an empty canvas.
    plt.savefig("GSPC Stochastic Oscillator.pdf")
    plt.show()

# Plot the stochastic oscillator. The legend labels say 14-Day because the
# plotted columns (^GSPC_14d_SO%K / %D) are computed over a 14-day window.
plot_SO(returns_data, label = ["^GSPC 14-Day %K",  "^GSPC 14-Day %D"])

# plot_SO already shows the figure; this extra call is redundant but harmless.

plt.show()

###########################
def plot_WPct(SO, label):
    """Plot the 14-day Williams %R series and save it as a PDF.

    Parameters
    ----------
    SO : pandas.DataFrame
        Frame containing the "^GSPC_14d_WR%" column; its index is used as the
        (date) x-axis.
    label : list of str
        Legend label for the single plotted column.

    Side effects
    ------------
    Writes "GSPC William's R%.pdf", shows the figure and updates the global
    legend / tick-label rc settings.
    """
    ax = SO.plot(y = ["^GSPC_14d_WR%"], figsize = (20, 10),color = ["#5079bc"], label = label)
    plt.title("GSPC William's R% (Momentum Measure)", fontdict = font)
    ax.set_xlabel("Date",  fontsize = 30, fontdict = font)
    ax.set_ylabel("Price", fontsize = 30, fontdict = font)
    ax.legend(fontsize = 20,)

    # rc settings are global, so they also carry over to figures drawn later.
    plt.rc("legend", fontsize = 30)
    plt.rc('xtick', labelsize=20)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=20)

    # One dated tick every 52 months.
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval = 52))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
    plt.gcf().autofmt_xdate()
    plt.xticks(rotation = 0)

    # Save BEFORE show(): once the figure has been shown it may already be
    # closed, and a later savefig would then write an empty canvas.
    plt.savefig("GSPC William's R%.pdf")
    plt.show()

# Plot the 14-day Williams %R column of returns_data.
plot_WPct(returns_data, label = ["^GSPC 14-Day William's R%"])

# plot_WPct already calls plt.show(), so the call below is redundant.

plt.show()


In [None]:
# Display the full frame to inspect the indicator columns added so far.
returns_data

In [None]:

#Ease of Movement 
def compute_EVM(ndays):
    """Add the ndays rolling-mean Ease of Movement of ^GSPC to returns_data.

    Reads the module-level price_data (high / low) and volume_data frames and
    writes the result into returns_data["^GSPC_<ndays>d_EVM"]. Returns nothing.
    """
    high = price_data['^GSPC High']
    low = price_data['^GSPC Low']

    # Change of the high/low midpoint from the previous row to the current one.
    distance_moved = ((high + low)/2) - ((high.shift(1) + low.shift(1))/2)
    # Volume (scaled down by 1e8) per unit of the day's high-low range.
    box_ratio = (volume_data['^GSPC Volume'] / 100000000) / ((high - low))

    raw_evm = distance_moved / box_ratio
    smoothed = pd.Series(raw_evm.rolling(ndays).mean(), name = 'EVM')
    returns_data["^GSPC_{}d_EVM".format(ndays)] = smoothed
 
# No download happens here: compute_EVM reads the price_data / volume_data frames directly.

# Compute the 14-day Ease of Movement for ^GSPC (stored as returns_data["^GSPC_14d_EVM"])
compute_EVM(14)

# Plotting the Price Series chart and the Ease Of Movement below

def plot_EVM(data):
  """Plot the ^GSPC adjusted close (top) and an Ease of Movement series (bottom).

  Parameters
  ----------
  data : pandas.Series
      Ease of Movement values for the lower panel (assumed to be date-indexed
      like returns_data, whose '^GSPC Adj Close' column fills the upper panel).

  Side effects
  ------------
  Writes "GSPC Ease of Movement.pdf", shows the figure and updates the global
  legend / tick-label rc settings.
  """
  fig = plt.figure(figsize=(20,10))

  # Upper panel: price series.
  ax = fig.add_subplot(2, 1, 1)
  ax.plot(returns_data['^GSPC Adj Close'], lw=1, label = "GSPC")
  ax.set_title('GSPC 14-Day Ease of Movement Plot', fontdict = font)
  ax.grid(True)
  ax.set_xlabel("Date",  fontsize = 30, fontdict = font)
  ax.set_ylabel("Price", fontsize = 30, fontdict = font)
  ax.legend(fontsize = 20,)
  ax.xaxis.set_major_locator(mdates.MonthLocator(interval = 25))
  ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
  # Thousands separators on the price axis.
  ax.get_yaxis().set_major_formatter(
      matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

  # Lower panel: Ease of Movement. The line carries a label so that the
  # legend below actually has an entry to show.
  bx = fig.add_subplot(2, 1, 2)
  bx.plot(data, '#5079bc', lw=0.75, linestyle='-', label = 'GSPC 14-Day EVM')
  bx.grid(True)
  bx.set_xlabel("Date",  fontsize = 30, fontdict = font)
  bx.set_ylabel("EMV Values", fontsize = 30, fontdict = font)
  bx.legend(fontsize = 20,)
  bx.xaxis.set_major_locator(mdates.MonthLocator(interval = 56))
  bx.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))

  # rc settings are global, so they also carry over to figures drawn later.
  plt.rc("legend", fontsize = 30)
  plt.rc('xtick', labelsize=20)    # fontsize of the tick labels
  plt.rc('ytick', labelsize=20)

  plt.gcf().autofmt_xdate()
  # bx is the current axes (it was added last), so this un-rotates its date labels.
  plt.xticks(rotation = 0)

  plt.savefig("GSPC Ease of Movement.pdf")

  plt.show()

# Plot the adjusted close above the 14-day Ease of Movement computed earlier.
plot_EVM(returns_data["^GSPC_14d_EVM"])

In [None]:
# Rate of Change code
# Rate of Change (ROC)
def ROC(data,n):
    """Return a copy of `data` with an n-period 'Rate of Change' column joined on.

    The rate of change is (close[t] - close[t-n]) / close[t-n] of the
    '^GSPC Adj Close' column; the first n rows are therefore NaN. The input
    frame itself is not modified.
    """
    close = data['^GSPC Adj Close']
    rate_of_change = close.diff(n).div(close.shift(n))
    return data.join(rate_of_change.rename('Rate of Change'))
 
def plot_ROC(data):
  """Plot price, rate of change and 14-day Ease of Movement in three stacked panels.

  Parameters
  ----------
  data : pandas.Series
      Rate-of-change values for the middle panel (assumed to be date-indexed
      like returns_data). The top and bottom panels read
      returns_data['^GSPC Adj Close'] and returns_data["^GSPC_14d_EVM"].

  Side effects
  ------------
  Writes "GSPC Rate of Change & Close.pdf", shows the figure and updates the
  global legend / tick-label rc settings.
  """
  fig = plt.figure(figsize=(20,10))

  # Top panel: price series.
  ax = fig.add_subplot(3, 1, 1)
  ax.plot(returns_data['^GSPC Adj Close'], lw=1, label = "GSPC")
  # Title typo fixed: the index is GSPC, not "GPSC".
  ax.set_title('GSPC 5-Day Rate of Change (ROC) Plot', fontdict = font)
  ax.grid(True)
  ax.set_xlabel("Date",  fontsize = 30, fontdict = font)
  ax.set_ylabel("Price", fontsize = 30, fontdict = font)
  ax.legend(fontsize = 20,)
  ax.xaxis.set_major_locator(mdates.MonthLocator(interval = 25))
  ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
  # Thousands separators on the price axis.
  ax.get_yaxis().set_major_formatter(
      matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

  # Middle panel: rate of change.
  bx = fig.add_subplot(3, 1, 2)
  bx.plot(data, '#5079bc', lw=0.75, linestyle='-', label='GSPC 5-Day Rate of Change')
  bx.grid(True)
  bx.set_xlabel("Date",  fontsize = 30, fontdict = font)
  bx.set_ylabel("ROC Vals", fontsize = 30, fontdict = font)
  bx.legend(fontsize = 20,)
  bx.xaxis.set_major_locator(mdates.MonthLocator(interval = 54))
  bx.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))

  # Bottom panel: 14-day Ease of Movement.
  cx = fig.add_subplot(3, 1, 3)
  cx.plot(returns_data["^GSPC_14d_EVM"], '#5079bc', lw=0.75, linestyle='-', label='GSPC 14-Day EVM')
  cx.grid(True)
  cx.set_xlabel("Date",  fontsize = 30, fontdict = font)
  cx.set_ylabel("EVM Vals", fontsize = 30, fontdict = font)
  cx.legend(fontsize = 20,)
  cx.xaxis.set_major_locator(mdates.MonthLocator(interval = 54))
  cx.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))

  # rc settings are global, so they also carry over to figures drawn later.
  plt.rc("legend", fontsize = 30)
  plt.rc('xtick', labelsize=20)    # fontsize of the tick labels
  plt.rc('ytick', labelsize=20)

  plt.gcf().autofmt_xdate()
  # cx is the current axes (it was added last), so this un-rotates its date labels.
  plt.xticks(rotation = 0)

  plt.savefig("GSPC Rate of Change & Close.pdf")

  plt.show()
# Compute the 5-period Rate of Change for ^GSPC and store it in returns_data

GSPC_5d_ROC = ROC(returns_data,5)
returns_data["^GSPC_5d_ROC"] = GSPC_5d_ROC['Rate of Change']

# Plot the price series, the 5-day ROC and the 14-day Ease of Movement
plot_ROC(returns_data["^GSPC_5d_ROC"])



In [None]:
def compute_ForceIndex(n):
    """Add the n-day Force Index of ^GSPC to returns_data.

    The index is the n-row change in the adjusted close multiplied by the
    volume of the current row. Reads the module-level returns_data and
    volume_data frames and writes returns_data["^GSPC_<n>d_FI"]; returns nothing.
    """
    price_change = returns_data['^GSPC Adj Close'].diff(n)
    force = price_change * volume_data['^GSPC Volume']
    returns_data["^GSPC_{}d_FI".format(n)] = pd.Series(force, name = 'ForceIndex')

# Compute the 5-day Force Index (stored as returns_data["^GSPC_5d_FI"])
compute_ForceIndex(5)


In [None]:
# Checklist of engineered features.
# NOTE(review): the RSI / Stochastic Oscillator / Williams %R windows listed here
# (21 and 252 days) differ from the 14-day windows used in the cells above — confirm
# which is intended.
#Moving Average (5, 10, 21, 63, 126, 252)
#Exponential Moving Average (5, 10, 21, 63, 126, 252)
#Rolling Std and Bollinger Bands (5, 10, 21, 63, 126, 252)
#Relative Strength Index (21)
#Stochastic Oscillator(252 days)
#William's R% (252 days)
#Rate of Change (5 days)
#Force Index (5 days)
#Ease of Movement (EVM)
#OSCP
  #Moving Average Convergence Divergence (12-26 day)
#returns_data = returns_data.iloc[252:, :]

#returns_dat

# Display the frame with every feature column added so far.
returns_data


In [None]:
tickers = ["^GSPC", "^DJI", "^NYA", "^VIX","^IXIC","^GDAXI",
           "^FCHI", "^HSI", "^N225", "^RUT"]

# Lagged daily-return features, built with Series.shift instead of a
# row-by-row loop:
#   * shift(k) leaves the first k rows as NaN. The previous iloc[i-k] lookup
#     wrapped around to the END of the series for those rows, so the earliest
#     observations were filled with the most recent (future) returns.
#   * whole-column assignment avoids chained `frame[col].iloc[i] = ...`
#     writes, which are slow and may silently write to a temporary copy.

# First five tickers: returns lagged by 1, 2 and 3 rows (no same-day value).
for ticker in tickers[0:5]:
  daily_return = returns_data["{} Daily Return".format(ticker)]
  for lag in (1, 2, 3):
    returns_data["{}_DailyReturn_t{}".format(ticker, lag)] = daily_return.shift(lag)

# Remaining tickers: the same-day return plus lags 1 and 2.
for ticker in tickers[5:]:
  daily_return = returns_data["{} Daily Return".format(ticker)]
  returns_data["{}_DailyReturn_t".format(ticker)] = daily_return
  for lag in (1, 2):
    returns_data["{}_DailyReturn_t{}".format(ticker, lag)] = daily_return.shift(lag)

returns_data


In [None]:
def get_RMedian(values, windows):
  """Add one rolling-mean column of `values` to returns_data per window length.

  values:  series to smooth.
  windows: iterable of window lengths; each produces the column
           "^GSPC_Returns_<window>d_RMedian" in the module-level returns_data.

  NOTE(review): despite the "RMedian" name, the statistic computed is the
  rolling mean — confirm which one is intended.
  """
  for window in windows:
    column = "^GSPC_Returns_{}d_RMedian".format(str(window))
    rolling_mean = values.rolling(window).mean() #rolling mean
    returns_data[column] = rolling_mean
# 21-row rolling mean of the ^GSPC daily return (stored under the "..._21d_RMedian" column).
get_RMedian(returns_data["^GSPC Daily Return"], [21])

# Binary target: 1 when the daily return is >= 0, otherwise 0.
# Rows whose return is NaN fail the comparison and therefore keep the default 0.
returns_data["^GSPC Return Direction"] = 0
returns_data.loc[
                 returns_data["^GSPC Daily Return"] >= 0, 
                 "^GSPC Return Direction"] = 1
# Show the return next to its label as a sanity check.
returns_data[["^GSPC Daily Return", "^GSPC Return Direction"]]



In [None]:
# Per-column NaN counts. The value is not displayed because it is not the
# last expression of the cell.
returns_data.isna().sum()
#raw_data = returns_data.iloc[:, :20]
#returns_data = returns_data.iloc[:,20:]
# Positional split: the first 30 columns go to unprocessed_data, the rest to preprocessed_data.
# NOTE(review): the hard-coded 30 presumably equals the number of raw price/volume
# columns — confirm, since adding or reordering columns upstream shifts this boundary.
unprocessed_data = returns_data.iloc[:, :30]
preprocessed_data = returns_data.iloc[:, 30:]
# Drop every row containing a NaN; the two frames are filtered independently,
# so they can end up with different row sets.
unprocessed_data = unprocessed_data.dropna()
preprocessed_data =  preprocessed_data.dropna()

preprocessed_data

In [None]:
# Independent copy of the feature frame; the models below read final_data.
final_data = preprocessed_data.copy()
#final_data = final_data.reset_index(drop = True)

In [None]:
# Display the modelling frame (features plus the target in the last column).
final_data 

In [None]:
# Positionally indexed (0..n-1) copy of the close series, so the x-axis of the
# plot below is "days since the first observation".
tmp = pd.DataFrame()
tmp["Date"] = returns_data.index 
tmp["^GSPC Adj Close"] = returns_data["^GSPC Adj Close"].values
data_FT = tmp[['Date', '^GSPC Adj Close']]

# Discrete Fourier transform of the close series, with magnitude and phase.
close_fft = np.fft.fft(np.asarray(data_FT['^GSPC Adj Close']))
fft_df = pd.DataFrame({'fft':close_fft})
fft_df['absolute'] = np.abs(fft_df['fft'])
fft_df['angle'] = np.angle(fft_df['fft'])

plt.figure(figsize=(20, 10), dpi=100)
fft_list = np.asarray(fft_df['fft'])
for num_ in [3, 6, 9, 100]:
    # Keep only the first and last num_ coefficients (the lowest frequencies)
    # and zero everything in between before transforming back.
    fft_list_m10 = np.copy(fft_list)
    fft_list_m10[num_:-num_] = 0
    # The inverse transform is complex; plot its real part explicitly instead
    # of letting matplotlib discard the imaginary part with a warning.
    plt.plot(np.real(np.fft.ifft(fft_list_m10)),
             label='Fourier transform with {} components'.format(num_))
plt.plot(data_FT['^GSPC Adj Close'],  label="GSPC")
plt.xlabel('Days', fontsize = 30)
plt.ylabel('Price', fontsize = 30)
plt.title('GSPC Closing Prices & Fourier Transforms', fontdict = font)
plt.legend()
plt.show()




In [None]:
from collections import deque
# Magnitudes of the Fourier coefficients, rotated by half the series length
# so that the entries from both ends of the spectrum meet in the middle of the plot.
items = deque(np.asarray(fft_df['absolute'].tolist()))
items.rotate(int(np.floor(len(fft_df)/2)))
plt.figure(figsize=(10, 7), dpi=80)
# One stem per coefficient.
plt.stem(items)
plt.title('Figure 4: Components of Fourier transforms')
plt.show()

In [None]:
# NOTE(review): `statsmodels.tsa.arima_model.ARIMA` and `pandas.datetime` only
# exist in older releases of those libraries — confirm the pinned versions.
# `DataFrame` and `datetime` are not used in this cell.
from statsmodels.tsa.arima_model import ARIMA
from pandas import DataFrame
from pandas import datetime

# Fit an ARIMA(p=10, d=1, q=2) model on the whole close series and print its summary.
series = data_FT['^GSPC Adj Close']
model = ARIMA(series, order=(10, 1, 2))
model_fit = model.fit(disp=0)
print(model_fit.summary())

In [None]:
# Walk-forward ARIMA evaluation: train on the first 66% of the series, then
# forecast one step at a time, appending each true observation to the history.
Xseries = series.values
size = int(len(Xseries) * 0.66)
# Slice to the end of Xseries itself. The previous upper bound, len(X), used a
# name that is only defined by a later cell (and refers to a different frame).
train, test = Xseries[:size], Xseries[size:]
history = list(train)
predictions = list()
# The model is re-fitted for every test observation, so this loop is slow.
for t in range(len(test)):
    model = ARIMA(history, order=(10,1,2))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    # First element of the forecast result is the point forecast.
    yhat = output[0]
    predictions.append(yhat)
    obs = test[t]
    history.append(obs)

Models


# Model Implementations and Metrics!

In [None]:
# Colab magic that selects TensorFlow 2.x.
%tensorflow_version 2.x
import tensorflow as tf
# Abort early unless TensorFlow reports a GPU at /device:GPU:0.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))



In [None]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# Features are every column except the last; the target is the last column.
X,y = final_data.iloc[:,:-1],final_data.iloc[:,-1:] 
tscv = TimeSeriesSplit()
print(tscv)
# (A pasted copy of the printed `TimeSeriesSplit(...)` repr used to sit here
# and was executed as a no-op statement; it has been removed.)

# Every iteration overwrites the previous split, so after the loop the
# train / test variables hold the LAST (largest-training-set) fold only.
for train_index, test_index in tscv.split(X):
  print("TRAIN:", train_index, "TEST:", test_index)
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Scale to [0, 1] using statistics from the training fold only.
my_scaler = MinMaxScaler(feature_range = (0,1))
X_train_preprocessed = pd.DataFrame(my_scaler.fit_transform(X_train))
X_test_preprocessed = pd.DataFrame(my_scaler.transform(X_test))
 
X_train_preprocessed.columns = X_train.columns
X_test_preprocessed.columns = X_test.columns

# Temporarily attach the target so its correlation with each feature can be read off.
X_train_preprocessed["^GSPC Return Direction"] = y_train.iloc[:,0].values

# pd.set_option('display.max_rows', X_train_preprocessed.shape[0]+1)

# Correlation of the target with (at most) the first 68 columns.
# NOTE(review): 68 presumably equals the number of feature columns — confirm;
# with fewer columns the target itself would be included in this slice.
correlations = pd.DataFrame(X_train_preprocessed.corr()["^GSPC Return Direction"][:68])

del X_train_preprocessed["^GSPC Return Direction"]

# Keep features whose absolute correlation with the target exceeds 0.05.
relevant_features = list(correlations[abs(correlations["^GSPC Return Direction"])>0.05].index)

X_train_preprocessed_engineering = X_train_preprocessed[relevant_features] 

X_test_preprocessed_engineering = X_test_preprocessed[relevant_features]


In [None]:
# RBF-kernel support vector classifier on the selected, scaled features.
rbfclf = sk.svm.SVC(kernel = "rbf", max_iter = 5000)

# scikit-learn expects 1-D label vectors; the split cell produces single-column
# 2-D targets, so flatten them once here (works for DataFrames and arrays).
y_train_1d = np.ravel(y_train)
y_test_1d = np.ravel(y_test)

rbfclf.fit(X_train_preprocessed_engineering, y_train_1d)

rbfprediction = rbfclf.predict(X_test_preprocessed_engineering)

# Test-set metrics.
rbfaccuracy = rbfclf.score(X_test_preprocessed_engineering, y_test_1d)
rbf_f1_score = sk.metrics.f1_score(y_test_1d, rbfprediction)
rbf_baccuracy = sk.metrics.balanced_accuracy_score(y_test_1d, rbfprediction)
rbf_precision = sk.metrics.precision_score(y_test_1d, rbfprediction)
rbf_recall = sk.metrics.recall_score(y_test_1d, rbfprediction)

In [None]:
# Display order: F1, precision, accuracy, recall of the RBF SVM on the test fold.
rbf_f1_score, rbf_precision, rbfaccuracy, rbf_recall

In [None]:

def input_fn(features, labels, batch_size=252):
    """Build a batched tf.data.Dataset of (feature-dict, label) pairs.

    features:   mapping-like object (e.g. a DataFrame) converted with dict(),
                so each key becomes one named feature.
    labels:     targets aligned row-by-row with `features`.
    batch_size: number of rows per batch (default 252).

    The dataset is neither shuffled nor repeated: rows are served once, in order.
    """
    feature_dict = dict(features)
    examples = tf.data.Dataset.from_tensor_slices((feature_dict, labels))
    return examples.batch(batch_size)

# One numeric feature column per selected feature, keyed by column name.
my_feature_columns = []
for key in X_train_preprocessed_engineering.keys():
    my_feature_columns.append(tf.feature_column.numeric_column(key=key))
print(my_feature_columns)

    
# Gradient-boosted trees: 80 trees of depth 8, binary target.
BRFClassifier = tf.estimator.BoostedTreesClassifier(my_feature_columns, n_batches_per_layer = 1,  
  n_classes = 2, learning_rate = 0.115, n_trees = 80, max_depth = 8)
# input_fn does not repeat the dataset, so training can stop before 5000 steps
# once the training rows have been consumed.
BRFClassifier.train(input_fn = lambda: input_fn(X_train_preprocessed_engineering, y_train), steps = 5000)

In [None]:
# Evaluate on the held-out fold; the result is a dict of metrics.
BRFeval_result = BRFClassifier.evaluate(input_fn = lambda: input_fn(X_test_preprocessed_engineering,
                                                y_test))
# Print the accuracy entry itself rather than the whole metrics dict.
print("\nTest Set Accuracy: {}.\n".format(BRFeval_result["accuracy"]))

# Estimator.predict needs an input_fn (not a DataFrame) and yields one dict per
# example lazily; collect the predicted class id of every test row.
y_pred = np.array([
    int(prediction["class_ids"][0])
    for prediction in BRFClassifier.predict(
        input_fn = lambda: input_fn(X_test_preprocessed_engineering, y_test))
])


In [None]:
# Rename every column to its position ("0", "1", ...).
# NOTE(review): presumably done so the column names are valid feature keys for
# the TensorFlow estimator below — confirm.
tmp = final_data.rename(columns={x: str(y) for x,y in zip(final_data.columns,range(0,len(final_data.columns)))})
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# Features are every column except the last; the target is the last column.
X,y = tmp.iloc[:,:-1],tmp.iloc[:,-1:] 
tscv = TimeSeriesSplit()
print(tscv)
# (A pasted copy of the printed `TimeSeriesSplit(...)` repr used to sit here
# and was executed as a no-op statement; it has been removed.)

# Every iteration overwrites the previous split, so after the loop the
# train / test variables hold the LAST (largest-training-set) fold only.
for train_index, test_index in tscv.split(X):
  print("TRAIN:", train_index, "TEST:", test_index)
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Scale to [0, 1] using statistics from the training fold only.
my_scaler = MinMaxScaler(feature_range = (0,1))
X_train_preprocessed = pd.DataFrame(my_scaler.fit_transform(X_train))
X_test_preprocessed = pd.DataFrame(my_scaler.transform(X_test))
 
X_train_preprocessed.columns = X_train.columns
X_test_preprocessed.columns = X_test.columns

# Temporarily attach the target so its correlation with each feature can be read off.
X_train_preprocessed["^GSPC Return Direction"] = y_train.iloc[:,0].values

# pd.set_option('display.max_rows', X_train_preprocessed.shape[0]+1)

# Correlation of the target with (at most) the first 68 columns.
# NOTE(review): 68 presumably equals the number of feature columns — confirm.
correlations = pd.DataFrame(X_train_preprocessed.corr()["^GSPC Return Direction"][:68])

del X_train_preprocessed["^GSPC Return Direction"]

# Keep features whose absolute correlation with the target exceeds 0.05.
relevant_features = list(correlations[abs(correlations["^GSPC Return Direction"])>0.05].index)

X_train_preprocessed_engineering = X_train_preprocessed[relevant_features] 

X_test_preprocessed_engineering = X_test_preprocessed[relevant_features]

X_train_preprocessed_engineering.columns

In [None]:
def input_fn(features, labels, batch_size=252):
    """Build a batched tf.data.Dataset of (feature-dict, label) pairs.

    features:   mapping-like object (e.g. a DataFrame) converted with dict(),
                so each key becomes one named feature.
    labels:     targets aligned row-by-row with `features`.
    batch_size: number of rows per batch (default 252).

    The dataset is neither shuffled nor repeated: rows are served once, in order.
    """
    feature_dict = dict(features)
    examples = tf.data.Dataset.from_tensor_slices((feature_dict, labels))
    return examples.batch(batch_size)

# One numeric feature column per selected feature, keyed by column name.
my_feature_columns = []
for key in X_train_preprocessed_engineering.keys():
    my_feature_columns.append(tf.feature_column.numeric_column(key=key))
print(my_feature_columns)

# Accuracy of each trained configuration is appended to this list.
result = []
# Two hidden layers (5 and 10 units), binary target, Adam optimiser.
DNNClassifier = tf.estimator.DNNClassifier(
    hidden_units = [5,10], n_classes = 2, feature_columns = my_feature_columns,
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.03))

# input_fn does not repeat the dataset, so training can stop before 10000 steps
# once the training rows have been consumed.
DNNClassifier.train(input_fn = lambda: input_fn(X_train_preprocessed_engineering, y_train), steps = 10000)
DNNeval_result = DNNClassifier.evaluate(input_fn = lambda: input_fn(X_test_preprocessed_engineering, y_test))
result.append(DNNeval_result["accuracy"])

In [None]:
# Disabled sketch: average the accuracies in `result` over blocks of 60 runs.
# h =[]
# for i in range(0,len(result),60):
#   avgacc = sum(result[i:i+60])/60
#   h.append(avgacc)
# print(sorted(h))
# print(h)

# Hidden-layer configurations considered:
# [[5,5], [5,10], [6,4], [10,10]]

# Bare expression in the middle of a cell: evaluated but not displayed.
result

# Report the first stored accuracy as a rounded percentage.
print("DNNClassifier([5,10]) Accuracy: {}%".format(round(result[0]*100)))


In [None]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# Rename every column to its position ("0", "1", ...).
tmp = final_data.rename(columns={x: str(y) for x,y in zip(final_data.columns,range(0,len(final_data.columns)))})

# Features are every column except the last; the target is the last column.
X,y = tmp.iloc[:,:-1],tmp.iloc[:,-1:] 
tscv = TimeSeriesSplit()
print(tscv)

# Every iteration overwrites the previous split, so after the loop the
# train / test variables hold the LAST (largest-training-set) fold only.
for train_index, test_index in tscv.split(X):
  print("TRAIN:", train_index, "TEST:", test_index)
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Scale to [0, 1] using statistics from the training fold only.
my_scaler = MinMaxScaler(feature_range = (0,1))
X_train_preprocessed = pd.DataFrame(my_scaler.fit_transform(X_train))
X_test_preprocessed = pd.DataFrame(my_scaler.transform(X_test))

X_train_preprocessed.columns = X_train.columns
X_test_preprocessed.columns = X_test.columns

# Temporarily attach the target so its correlation with each feature can be read off.
X_train_preprocessed["^GSPC Return Direction"] = y_train.iloc[:,0].values

# Recompute the correlations inside this cell. They were previously commented
# out here, which made the feature selection depend on whatever `correlations`
# an earlier cell happened to leave behind.
# NOTE(review): 68 presumably equals the number of feature columns — confirm.
correlations = pd.DataFrame(X_train_preprocessed.corr()["^GSPC Return Direction"][:68])

del X_train_preprocessed["^GSPC Return Direction"]

# Keep features whose absolute correlation with the target exceeds 0.05.
relevant_features = list(correlations[abs(correlations["^GSPC Return Direction"])>0.05].index)

X_train_preprocessed_engineering = X_train_preprocessed[relevant_features] 

X_test_preprocessed_engineering = X_test_preprocessed[relevant_features]

# From here on the Keras cells work with plain NumPy arrays.
X_test_preprocessed_engineering, X_train_preprocessed_engineering, y_test, y_train = X_test_preprocessed_engineering.values, X_train_preprocessed_engineering.values, y_test.values, y_train.values

# 3-D (samples, 1 timestep, features) views for the recurrent models.
# Note: X_train / X_test are re-bound to these reshaped arrays.
X_train = X_train_preprocessed_engineering.reshape((X_train_preprocessed_engineering.shape[0], 1, X_train_preprocessed_engineering.shape[1]))
X_test = X_test_preprocessed_engineering.reshape((X_test_preprocessed_engineering.shape[0], 1, X_test_preprocessed_engineering.shape[1]))
print(X_train_preprocessed_engineering.shape, y_train.shape, X_test_preprocessed_engineering.shape, y_test.shape)



In [None]:
# Take the input width from the data instead of hard-coding 13, so the model
# still builds when the correlation filter selects a different number of features.
n_features = X_train_preprocessed_engineering.shape[1]

# Small fully connected classifier with dropout after every hidden layer.
model=tf.keras.Sequential([
                           tf.keras.layers.Dense(units = 10, activation = "relu", input_shape = [n_features]),
                           tf.keras.layers.Dropout(rate=0.2),
                           tf.keras.layers.Dense(10, activation='relu'),
                           tf.keras.layers.Dropout(rate=0.2),
                           tf.keras.layers.Dense(5, activation='relu'),
                           tf.keras.layers.Dropout(rate=0.2), 
                          #  tf.keras.layers.BatchNormalization(),
                           tf.keras.layers.Dense(2, activation = "softmax")
                           ])

opt = tf.keras.optimizers.Adagrad(learning_rate = .01)
# Integer 0/1 labels with a 2-way softmax call for sparse categorical
# cross-entropy. The previous 'mae' loss compared BOTH softmax outputs with the
# same label: |p0 - y| + |p1 - y| is always 1 when p0 + p1 = 1, so the loss was
# constant and provided no gradient.
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics = ["acc"])
# shuffle = False keeps the chronological order of the rows within each epoch.
history = model.fit(X_train_preprocessed_engineering, y_train, 
          batch_size = 10, epochs = 50, shuffle = False, validation_data = (X_test_preprocessed_engineering, y_test), verbose = 0)

# Learning curves: loss and accuracy for the training and validation sets.
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', "val_loss"]].plot()
history_df.loc[:, ["acc", "val_acc"]].plot()




In [None]:
# Metrics of the final epoch. Taking the last row (instead of the hard-coded
# row 49) keeps this correct if the number of epochs changes.
history_df.iloc[-1]


In [None]:


# Dense classifier applied to the 3-D (samples, 1, features) tensors.
model=tf.keras.Sequential()
# Input shape is taken from X_train rather than hard-coded to [1, 13].
model.add(tf.keras.layers.Dense(units = 10, input_shape = X_train.shape[1:],
                                activation = "relu"))
model.add(tf.keras.layers.Dense(units = 5, activation = "relu"))
model.add(tf.keras.layers.Dense(units = 2, activation="softmax"))
opt = tf.keras.optimizers.Adam(learning_rate = 0.1)
# Integer 0/1 labels with a 2-way softmax call for sparse categorical
# cross-entropy. With 'mse', both softmax outputs were compared with the same
# label, which is minimised by the uniform output (0.5, 0.5) whatever the label.
model.compile(optimizer = opt, 
              loss = "sparse_categorical_crossentropy")
#out_batch = NBatchLogger(display=1000)
model.fit(X_train, y_train, 
          batch_size = 21, epochs = 100, shuffle = True)


In [None]:
# (rows, selected features) of the 2-D training matrix.
pd.DataFrame(X_train_preprocessed_engineering).shape

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

# Two stacked LSTM layers followed by a 2-way softmax classifier.
model = Sequential()

# LSTM layers require 3-D (samples, timesteps, features) input, so the model is
# built for, trained on and evaluated with the reshaped X_train / X_test
# tensors instead of the 2-D feature matrices.
model.add(LSTM(units = 10, return_sequences = True, recurrent_activation="tanh",
               input_shape = X_train.shape[1:]))
model.add(Dropout(0.2))

# The last recurrent layer returns only its final state, so the classifier
# emits one prediction per sample rather than one per timestep.
model.add(LSTM(units = 10, return_sequences = False, recurrent_activation= "tanh"))
model.add(Dropout(0.2))

model.add(Dense(units = 2, activation = "softmax"))
opt = tf.keras.optimizers.Adagrad(learning_rate = 0.01)
# Integer 0/1 labels with a 2-way softmax call for sparse categorical
# cross-entropy; 'mse' against the raw label is minimised by the uniform output.
model.compile(optimizer = opt, loss = 'sparse_categorical_crossentropy', metrics = ["accuracy"])



model.fit(X_train, y_train, epochs = 50, batch_size = 20, shuffle = True)

# Class probabilities for the test fold, shape (samples, 2).
predicted_stock_price = model.predict(X_test)


m = tf.keras.metrics.Accuracy()

In [None]:
# Accuracy compares class labels, so turn the softmax probabilities into class
# ids first; both sides are flattened so their shapes always agree.
predicted_classes = np.argmax(predicted_stock_price, axis = -1)
m.update_state(np.ravel(y_test), np.ravel(predicted_classes))
predicted_stock_price

In [None]:
import pandas as pd
import numpy as np
import os
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Activation
from keras import backend as K
from keras.utils.generic_utils import get_custom_objects
from keras.callbacks import ModelCheckpoint
from keras.regularizers import l1_l2
opt = tf.keras.optimizers.Adam(learning_rate = 0.01)

class Double_Tanh(Activation):
    """Activation layer wrapper registered under the name 'double_tanh'."""
    def __init__(self, activation, **kwargs):
        super(Double_Tanh, self).__init__(activation, **kwargs)
        self.__name__ = 'double_tanh'

def double_tanh(x):
    """Return 2 * tanh(x), i.e. a tanh activation with range (-2, 2)."""
    return (K.tanh(x) * 2)

# Make the activation available by name to Keras.
get_custom_objects().update({'double_tanh':Double_Tanh(double_tanh)})


# design network
model = Sequential()
model.add(LSTM(25, input_shape=(X_train.shape[1], X_train.shape[2])))

model.add(Activation(double_tanh))
# Single-unit binary head: sigmoid + binary cross-entropy. A softmax over ONE
# unit always outputs 1.0, and categorical cross-entropy on it is identically
# zero, so the previous head could not learn anything.
model.add(Dense(units = 1, activation = "sigmoid"))

model.compile(loss="binary_crossentropy", optimizer= opt,
metrics=["mse", "mae", "accuracy"])
# fit network (shuffle=False keeps the chronological order of the rows)
history = model.fit(X_train, y_train, epochs=100, batch_size=128, validation_data=(X_test, y_test),
                    verbose=2, shuffle=False)
# plot history
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='test')
plt.legend()
plt.show()

In [None]:
# Four stacked LSTM layers with dropout and a single-unit output.
model = Sequential()
model.add(LSTM(units=3, return_sequences = True,
               input_shape=(X_train.shape[1],X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=2, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(units=3, return_sequences = True))
model.add(Dropout(0.2))
# The last recurrent layer returns only its final state (one vector per sample).
model.add(LSTM(units=2, return_sequences = False))
model.add(Dropout(0.2))
# Sigmoid keeps the output in [0, 1], matching the 0/1 target and the
# binary cross-entropy / rounding-based metrics this model is compiled with
# in the following cell.
model.add(Dense(units =1, activation = "sigmoid"))
# Only built-in metrics here: f1_m, precision_m and recall_m are defined in the
# NEXT cell, so referencing them at this point raises NameError on a fresh
# top-to-bottom run. That cell recompiles the model with them attached.
model.compile(optimizer="adam",loss="mean_squared_error", metrics = ["accuracy"])
model.fit(X_train, y_train, epochs = 50, batch_size = 32, verbose=2)


predicted_stock_price = model.predict(X_test)



In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Batch recall: rounded true positives / rounded actual positives.

    Both tensors are clipped to [0, 1] and rounded before counting;
    K.epsilon() in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Batch precision: rounded true positives / rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before counting;
    K.epsilon() in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """Batch F1 score: harmonic mean of precision_m and recall_m.

    K.epsilon() in the denominator avoids division by zero when both are 0.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    product = p * r
    total = p + r + K.epsilon()
    return 2*(product / total)

# compile the model
# Re-compiles whichever `model` the previous cell left behind, now with binary
# cross-entropy and the custom F1 / precision / recall metrics defined above.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

# fit the model
# 30% of the training rows are held out for validation during fitting.
history = model.fit(X_train, y_train, validation_split=0.3, epochs=10, verbose=0)

# evaluate the model
# The unpacking order follows the loss plus the metrics list passed to compile().
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=2)

In [None]:
#write up start with exploratory data analysis and methods 
#shift data for your own stocks (consult 6)
#split data
#train random forest, neural networks, svm, and arima
#table results
