In [1]:
import pandas as pd
import numpy as np

import os
from os import listdir

import annoy
from ast import literal_eval
from tqdm import tqdm

import time
from datetime import datetime, timedelta, date
from textwrap import dedent

import matplotlib.pyplot as plt
# %matplotlib inline

from statsmodels.tsa.api import VAR
# from statsmodels.tsa.stattools import adfuller
# from statsmodels.tools.eval_measures import rmse, aic

import random

  import pandas.util.testing as tm


In [2]:
def annoy_build(n=1000):
    """Build a nearest-neighbour index over the columns of the global ``df_comb``.

    Every column of ``df_comb`` except the first is turned into one vector
    holding that column's last ``n`` rows. The vectors are indexed with Annoy
    under the Euclidean metric and labelled with their column names.

    Arguments:
    n: number of trailing rows of ``df_comb`` used for each vector; positive int.

    Returns:
    An index object whose ``query(stock, k)`` method returns a list of
    ``(label, distance)`` pairs for the ``k`` nearest columns to ``stock``.

    Raises:
    ValueError: if ``n`` is not positive.
    """
    if n <= 0:
        # A slice of [-0:] would silently select every row instead of none.
        raise ValueError(f"n must be a positive number of rows, got {n!r}")

    data = df_comb.iloc[-n:]
    # Column 0 is skipped (presumably the date column -- confirm against the CSV).
    stock_labels = data.columns[1:].to_numpy()
    # One row per column of ``data``: shape (number of columns - 1, number of rows).
    stock_vectors = data.iloc[:, 1:].to_numpy().T

    class AnnoyIndex():
        """Thin wrapper pairing an ``annoy.AnnoyIndex`` with its labels."""

        def __init__(self, vectors, labels):
            self.dimension = vectors.shape[1]
            self.vectors = vectors.astype('float32')
            self.labels = labels

        def build(self, number_of_trees=10):
            """Add every vector to a fresh Euclidean index and build the trees."""
            self.index = annoy.AnnoyIndex(self.dimension, 'euclidean')
            for i, vec in enumerate(self.vectors):
                self.index.add_item(i, vec.tolist())
            self.index.build(number_of_trees)

        def query(self, stock, k=10):
            """Return ``(label, distance)`` for the ``k`` nearest neighbours of ``stock``.

            Raises KeyError if ``stock`` does not match exactly one label.
            """
            matches = np.flatnonzero(self.labels == stock)
            if matches.size != 1:
                # Fail with a readable message instead of an array-to-int TypeError.
                raise KeyError(
                    f"expected exactly one label equal to {stock!r}, found {matches.size}"
                )
            vector = self.vectors[int(matches[0])]
            ids, distances = self.index.get_nns_by_vector(
                vector.tolist(),
                k,
                search_k=-1,
                include_distances=True)
            return [(self.labels[i], dist) for i, dist in zip(ids, distances)]

    index = AnnoyIndex(stock_vectors, stock_labels)
    index.build()
    return index

In [3]:
def annoy_var(searched_stock='AAPL', side='response', date_num=0, num_stocks=10):
    """
    Fit a one-lag VAR on the searched stock's nearest neighbours.

    The neighbours come from the module-level ``index``; the fitting window is
    the ``num_past`` rows of ``df_comb`` ending at row ``date_num`` (inclusive).
    ``index``, ``df_comb`` and ``num_past`` are read from module scope and must
    be defined before this function is called.

    Arguments:
    searched_stock: name of searched stock (string of uppercase letters)
    side: 'impulse' or 'response'; accepted for compatibility but currently unused.
    date_num: positional row number of the last observation in the fitting window.
              Must be at least ``num_past - 1`` so the window does not start before row 0.
    num_stocks: number of related stocks to use in VAR model; default 10

    Returns:
    (fitted_model, selected_data): the fitted VAR results object and the
    DataFrame slice it was fitted on.

    Raises:
    ValueError: if the window would start before the first row.
    """
    window_start = date_num + 1 - num_past
    if window_start < 0:
        # A negative start would be read by iloc as "count from the end",
        # which yields an empty or unrelated window rather than an error.
        raise ValueError(
            f"date_num={date_num} leaves fewer than num_past={num_past} rows of history"
        )

    annoy_result = index.query(searched_stock, num_stocks)
    stocks = [label for label, _distance in annoy_result]

    # Slice the rows first so only the window is copied, then drop 'Date'.
    window = df_comb.iloc[window_start:date_num + 1].drop('Date', axis=1)
    selected_data = window[stocks]

    fitted_model = VAR(selected_data).fit(1)
    return fitted_model, selected_data

In [4]:
def evaluate(searched_stock='AAPL', date_num=0, days=10, num_stocks=10):
    """
    Forecast ``days`` steps ahead from row ``date_num`` and compare with the data.

    Arguments:
    searched_stock: name of searched stock (string of uppercase letters).
    date_num: Which day do you want to predict from? Int from 0 to 1 less than number of rows in data.
    days: length of prediction; positive integer.
    num_stocks: number of related stocks passed on to ``annoy_var``.

    Returns:
    (avg_error, bias): mean absolute forecast error and mean signed forecast
    error, both averaged over every forecast day and every series in the model.

    Raises:
    ValueError: if ``days`` is not positive or fewer than ``days`` rows follow
    ``date_num`` in ``df_comb``.
    """
    if days < 1:
        raise ValueError(f"days must be a positive integer, got {days!r}")
    first_future = date_num + 1
    if first_future + days > len(df_comb):
        # Without this check a short tail would either broadcast silently
        # (one row left) or fail with an unrelated shape error.
        raise ValueError(
            f"only {max(len(df_comb) - first_future, 0)} rows follow date_num={date_num}; "
            f"need {days}"
        )

    results, selected_data = annoy_var(searched_stock=searched_stock, date_num=date_num, num_stocks=num_stocks)

    # Seed the forecast with as many trailing rows as the fitted lag order.
    predictions = results.forecast(selected_data.values[-results.k_ar:], days)
    actual = df_comb.iloc[first_future:first_future + days][selected_data.columns].to_numpy()

    # A disabled variant mapped predictions and actuals to 1 (positive) or
    # 0 (otherwise) before comparing them.

    errors = predictions - actual
    bias = errors.mean() # Are the predictions biased? If not, this should be close to 0.
    avg_error = np.abs(errors).mean() # Average absolute value of prediction error.

    return avg_error, bias

In [5]:
def gridsearch(Num_Past, Num_Future, Num_Stocks, n_eval_stocks=50, n_eval_days=20, seed=None):
    """Score one (history length, horizon, neighbour count) configuration.

    Sets the module globals ``num_past``, ``num_future``, ``num_stocks`` and
    rebuilds the global ``index`` with ``annoy_build(n=Num_Past)``. It then calls
    ``evaluate`` for randomly chosen stocks and start days.

    Arguments:
    Num_Past: number of past rows used to build the index and fit each model.
    Num_Future: forecast horizon in rows.
    Num_Stocks: number of neighbours requested from the index.
    n_eval_stocks: how many stocks to sample for evaluation (capped at the number available).
    n_eval_days: how many start days to sample (capped at the number available).
    seed: optional seed for the sampling; ``None`` uses the global ``random`` state.

    Returns:
    The mean of the per-stock means of the average absolute forecast error.

    Raises:
    ValueError: if there are no stocks or no admissible start days to sample.
    """
    global index
    global num_past
    global num_future
    global num_stocks
    num_past = Num_Past
    num_future = Num_Future
    num_stocks = Num_Stocks
    # NOTE(review): annoy_build is given only n=Num_Past, while the start days
    # below are drawn from the whole of df_comb. Confirm the neighbour search
    # is not using rows later than the forecast date.
    index = annoy_build(n=Num_Past)

    stocks = list(df_comb.columns[1:])
    # Start days that leave num_past rows behind and num_future rows ahead.
    candidate_days = range(num_past, len(df_comb) - num_future)
    if not stocks or not candidate_days:
        raise ValueError(
            f"nothing to evaluate: {len(stocks)} stocks, {len(candidate_days)} candidate days"
        )

    rng = random if seed is None else random.Random(seed)
    selected_stocks = rng.sample(stocks, k=min(n_eval_stocks, len(stocks)))
    selected_days = rng.sample(candidate_days, k=min(n_eval_days, len(candidate_days)))

    # evaluate returns (avg_error, bias); only avg_error is scored here.
    errors = {
        stock: [
            evaluate(stock, date_num, days=num_future, num_stocks=num_stocks)[0]
            for date_num in selected_days
        ]
        for stock in selected_stocks
    }
    return pd.DataFrame.from_dict(errors).mean().mean()


In [6]:
# Load the input table. Its first column is kept as-is; every later column is
# treated as one numeric series (presumably daily percentage changes, judging
# by the file name -- confirm).
df = pd.read_csv('stock_prices_pctch.csv')

# A disabled experiment replaced each value by its sign (1, -1 or 0) before the
# reversed copy below was built.

# Reversed sign so that ANNOY can also detect stocks that move opposite of each other.
df_rev = (-df.iloc[:, 1:]).add_suffix('_reversed')

# Original columns followed by their sign-flipped twins.
df_comb = pd.concat([df, df_rev], axis=1)

In [7]:
# Grid of configurations to score with gridsearch.
Num_Past = [700,800,900]
Num_Future = [3, 5, 10]
Num_Stocks = [5, 7, 9, 10, 15]
# Single-configuration alternative for a quick run:
# Num_Past = [900]
# Num_Future = [5]
# Num_Stocks = [10]

res = {'Num_Past':[],'Num_Future':[],'Num_Stocks':[],'errors':[]}
for i in Num_Past:
    for j in Num_Future:
        for k in Num_Stocks:
            print(i,j,k)
            errors = gridsearch(i,j,k)
            # Record the row only once the score exists, so an exception or an
            # interrupt cannot leave the columns of `res` with unequal lengths.
            res['Num_Past'].append(i)
            res['Num_Future'].append(j)
            res['Num_Stocks'].append(k)
            res['errors'].append(errors)

700 3 5
700 3 7
700 3 9
700 3 10
700 3 15
700 5 5
700 5 7
700 5 9
700 5 10
700 5 15
700 10 5
700 10 7
700 10 9
700 10 10
700 10 15
800 3 5
800 3 7
800 3 9
800 3 10
800 3 15
800 5 5
800 5 7
800 5 9
800 5 10
800 5 15
800 10 5
800 10 7
800 10 9
800 10 10
800 10 15
900 3 5
900 3 7
900 3 9
900 3 10
900 3 15
900 5 5
900 5 7
900 5 9
900 5 10
900 5 15
900 10 5
900 10 7
900 10 9
900 10 10
900 10 15


In [8]:
# Tabulate the grid-search results and show the 20 configurations with the
# smallest error, best first.
table2 = pd.DataFrame(res)
table2.sort_values('errors').head(20)

Unnamed: 0,Num_Past,Num_Future,Num_Stocks,errors
34,900,3,15,0.008406
32,900,3,9,0.009114
31,900,3,7,0.009504
39,900,5,15,0.009528
41,900,10,7,0.009707
25,800,10,5,0.009781
44,900,10,15,0.009904
43,900,10,10,0.010049
15,800,3,5,0.010087
28,800,10,10,0.01019


In [9]:
from sklearn.linear_model import LinearRegression

# Regress the grid-search error on the three hyperparameters to see which
# direction each one pushes the error.
x = table2[['Num_Past','Num_Future','Num_Stocks']].to_numpy()
y = table2[['errors']].to_numpy()
reg = LinearRegression().fit(x, y)

# Only a cell's last expression is displayed, so the fit quality is printed
# explicitly instead of being computed and discarded.
r_squared = reg.score(x, y)
print('R^2:', r_squared)

# Coefficients in the order Num_Past, Num_Future, Num_Stocks.
reg.coef_

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes

array([[-1.38828305e-05,  3.76600015e-05, -8.62359324e-05]])