In [22]:
import os
import pandas as pd
import datetime
import numpy as np
import time

## Image data creation

In [None]:
def _rasterize_series(values, height):
    """Draw a 1-D series as a binary image of shape (height, len(values)).

    The value range [min, max] of the series is cut into `height` equal bins.
    Each column gets exactly one pixel set to 1: the row of the bin its value
    falls in, with the lowest bin on the bottom row and the highest on row 0.
    A constant series (zero spread) is drawn entirely on row 0.
    """
    values = np.asarray(values, dtype=float)
    image = np.zeros((height, len(values)), dtype=np.int64)
    if height <= 0 or len(values) == 0:
        return image

    lowest = values.min()
    spread = (values.max() - lowest) / height
    if not np.isfinite(spread):
        # NaN/inf in the window: no meaningful bins, leave the image blank.
        return image

    # edges[j] is the lower bound of bin j; edges[height] is the nominal maximum.
    edges = lowest + np.arange(height + 1) * spread
    bins = np.searchsorted(edges, values, side='right') - 1
    # Clamp instead of testing `value == lowest + height*spread`: that sum can
    # differ from the true maximum by a rounding error, which would otherwise
    # leave the maximum's column without any pixel.
    bins = np.clip(bins, 0, height - 1)
    image[height - 1 - bins, np.arange(len(values))] = 1
    return image


def create_image_files(dir_path, W, forwardperiod, companies_bundle, isTrain):
    """Turn weekly price/volume windows into text-encoded binary images.

    For every ticker in `companies_bundle`, `<dir_path>/<ticker>.csv` is read
    (columns used: Date, Open, High, Low, Adj Close, Volume), resampled to
    weekly bars and passed through `quantilize`. Windows of `W` weeks are then
    taken every `forwardperiod` weeks. Each window becomes one image made of a
    close-price panel and a volume panel, each `W//2 - 1` rows high, separated
    by two blank rows; the image is `W` columns wide.

    Two files are written into `dir_path`; their names end with the tickers
    joined by '_':

    * ``inputX_<tickers>.txt`` - image rows as space-separated 0/1 flags, each
      image followed by a line ``E`` (or ``F`` for the last image of a ticker).
    * ``inputY_<tickers>.txt`` - one line per image: the sum of the next
      `forwardperiod` weekly percentage returns, rounded to 4 decimals.

    Parameters
    ----------
    dir_path : str
        Directory holding the input CSV files; also the output directory.
    W : int
        Window length in weeks (image width).
    forwardperiod : int
        Number of forward weeks summed into the target, and the window stride.
    companies_bundle : iterable of str
        Ticker names (without the ``.csv`` extension).
    isTrain : bool
        When true, each ticker's weekly returns are demeaned before summing.
    """
    # Materialise once: the bundle is walked for processing and for the file name.
    companies_bundle = list(companies_bundle)
    panel_height = W // 2 - 1
    separator = np.zeros((2, W), dtype=np.int64)

    # Collect pieces in lists and join once; repeated `+=` on a growing string
    # is quadratic in the total output size.
    x_chunks = []
    y_chunks = []

    for ticker in companies_bundle:
        started = time.time()
        ticker_name = ticker.replace('.csv', '')
        df = pd.read_csv(os.path.join(dir_path, ticker + '.csv'))
        df.index = pd.to_datetime(df['Date'])

        # Weekly bars. `resample(..., loffset=...)` no longer exists in
        # pandas >= 2.0, so the labels are moved 6 days back explicitly.
        df = df.resample('W').agg({'Date': 'first', 'Open': 'first', 'High': 'max',
                                   'Low': 'min', 'Adj Close': 'last', 'Volume': 'sum'})
        df.index = df.index - pd.Timedelta(days=6)

        print('')
        print('Length Dataframe {} is {}'.format(ticker_name, len(df)))
        # Percentage return realised over the following week.
        df['NextRets'] = 100*df['Adj Close'].pct_change().shift(-1)

        #Step1: Preprocess data
        df = df.dropna()
        # `quantilize` is not defined in this cell; it must already exist in the
        # notebook namespace.
        df = quantilize(df)

        if isTrain:
            #Step2: Neutralize returns to avoid data imbalance
            df['NextRets'] = df['NextRets'] - np.mean(df['NextRets'])

        close_values = df['Adj Close'].to_numpy()
        volume_values = df['Volume'].to_numpy()
        next_returns = df['NextRets'].to_numpy()

        # `end` is the last week inside a window; the target covers the
        # `forwardperiod` return entries starting at `end`.
        window_ends = range(W - 1, len(df) - forwardperiod, forwardperiod)
        last_position = len(window_ends) - 1
        for position, end in enumerate(window_ends):
            window = slice(end - W + 1, end + 1)
            image = np.concatenate(
                (_rasterize_series(close_values[window], panel_height),
                 separator,
                 _rasterize_series(volume_values[window], panel_height)),
                axis=0)

            for row in image.astype(str):
                x_chunks.append(' '.join(row) + ' \n')
            # 'F' closes the last image of a ticker, 'E' every other image.
            x_chunks.append('F\n' if position == last_position else 'E\n')

            forward_return = np.sum(next_returns[end:end + forwardperiod])
            y_chunks.append(str(np.round(float(forward_return), 4)) + '\n')

        print('Ticker {} completed in {} s'.format(ticker_name, time.time()-started))

    suffix = ''.join('_' + str(company) for company in companies_bundle)

    # The `with` blocks close the files; no explicit close() is needed.
    with open(os.path.join(dir_path, 'inputX' + suffix + '.txt'), 'w') as inputXfile:
        inputXfile.write(''.join(x_chunks))

    with open(os.path.join(dir_path, 'inputY' + suffix + '.txt'), 'w') as inputYfile:
        inputYfile.write(''.join(y_chunks))

In [None]:
# Training set: build the image/label files for every bundle of tickers.
# NOTE: `true_companies_bundles` and `create_image_files` are not defined in
# this cell; they must already exist in the notebook namespace.
W = 16
forwardperiod = 4
country_ticker = 'US'
base_path = r'YourSP500DataPath'
path = os.path.join(base_path, country_ticker)
concatdates = '1999_2016'  # Could be either 1999_2016 or 2000_2016. Need to check
# The folder does not depend on the bundle, so it is computed once here.
dir_path = os.path.join(path, 'Train', 'W_' + concatdates)

for companies_bundle in true_companies_bundles:
    # isTrain=True: returns are demeaned per ticker inside create_image_files.
    create_image_files(dir_path, W, forwardperiod, companies_bundle, isTrain=True)

In [None]:
# Test set: build the image/label files for every bundle of tickers.
# NOTE: `true_companies_bundles` and `create_image_files` are not defined in
# this cell; they must already exist in the notebook namespace.
W = 16
forwardperiod = 4
country_ticker = 'US'
base_path = r'YourSP500DataPath'
path = os.path.join(base_path, country_ticker)
concatdates = '2016_2020'
# The folder does not depend on the bundle, so it is computed once here.
dir_path = os.path.join(path, 'Test', 'W_' + concatdates)

for companies_bundle in true_companies_bundles:
    # isTrain=False: returns are left as computed (no demeaning).
    create_image_files(dir_path, W, forwardperiod, companies_bundle, isTrain=False)