In [1]:
def calc_spread(df):
    '''
    Calculates the lag in seconds between the current time and the last entry of the dataframe's index.

    Provides a measure of lag time.

    df = DataFrame whose index holds datetime-like values; only the final index value is read.

    Returns the elapsed time in seconds as a float.
    '''
    last_entry = dt.datetime.timestamp(df.index[-1])
    now = dt.datetime.timestamp(dt.datetime.now())
    # datetime.timestamp() yields POSIX seconds, so the difference is already
    # expressed in seconds and must not be rescaled.
    return now - last_entry

def add_log_ret(df):
    '''
    Add a 'Returns' column of log returns to the dataframe, in place.

    Each value is log(close / previous close). Afterwards every row that
    contains a NaN in any column is dropped, which removes at least the
    first row (it has no previous close).

    Nothing is returned; df itself is modified.
    '''
    prices = df['close']
    previous = prices.shift(1)
    df['Returns'] = np.log(prices / previous)
    df.dropna(inplace=True)
    
def add_log_vol(df):
    '''
    Add a 'vol_log' column of log volume changes to the dataframe, in place.

    Each value is log(volume / previous volume). Afterwards every row that
    contains a NaN in any column is dropped.

    Nothing is returned; df itself is modified.
    '''
    volume = df['volume']
    previous = volume.shift(1)
    df['vol_log'] = np.log(volume / previous)
    df.dropna(inplace=True)
    
def add_dir(df):
    '''
    Add a 'direction' column holding the sign of the Log Returns, in place.

    The value is -1, 0 or 1 (as an integer) depending on the sign of the
    'Returns' column, which must already exist.
    '''
    signs = np.sign(df['Returns'])
    df['direction'] = signs.astype(int)
    
def create_lags(data, iv, lags):
    '''
    Adds lagged copies of an independent variable to the dataframe, in place.

    data = DataFrame to extend.
    iv = name of the column to lag.
    lags = number of lags; columns '<iv>_lag_1' ... '<iv>_lag_<lags>' are added.

    Returns the list of new column names, ordered from lag 1 upwards.
    '''
    names = [f'{iv}_lag_{step}' for step in range(1, lags + 1)]

    # Lag k is the source column shifted down by k rows.
    for step, name in enumerate(names, start=1):
        data[name] = data[iv].shift(step)

    return names

def get_digitized_bins(data, iv, cols):
    '''
    Discretizes feature columns with numpy's digitize method, in place.

    The bin edges are built from the first and second moment of data[iv]:
    mean - std, mean and mean + std. Every value is therefore replaced by
    the index (0 to 3) of the bin it falls into, allowing for more possible
    feature value combinations than a plain two-way split.

    data = DataFrame to extend.
    iv = name of the column whose mean and standard deviation define the edges.
    cols = names of the columns to digitize; each gets a '<col>_bin' companion.

    Returns the list of new '<col>_bin' column names, in the order of cols.
    '''
    center = data[iv].mean()
    width = data[iv].std()
    edges = [center - width, center, center + width]

    binned_names = []
    for source in cols:
        target = source + '_bin'
        data[target] = np.digitize(data[source], bins=edges)
        binned_names.append(target)

    return binned_names

In [2]:
def process(df, instructions):
    '''
    This is to pre-process data for storage in the Database.

    Adds log returns and their direction to df, then builds lagged feature
    columns for every independent variable listed in instructions["IVs"].
    Each entry maps a column name to a sequence whose first item is the
    number of lags and whose second item is the treatment; "digitized"
    means the lagged columns are additionally binned.

    Rows containing NaN are dropped at the end. df is modified in place.

    Returns (df, names of the feature columns that were created).
    '''
    # NOTE: log volume (add_log_vol) is currently not part of the pipeline.
    add_log_ret(df)
    add_dir(df)

    feature_cols = []
    for iv, spec in instructions["IVs"].items():
        n_lags, treatment = spec[0], spec[1]
        created = create_lags(df, iv, n_lags)
        if treatment == "digitized":
            # Only the binned versions are reported for digitized variables.
            created = get_digitized_bins(df, iv, created)
        feature_cols.extend(created)

    df.dropna(inplace=True)
    return df, feature_cols

In [3]:
class Process:
    '''
    Stages freshly pulled data in a Temporary Database so that it can be
    processed and integrated properly into the full Database.

    Database = Database class object.
    df = Generator.update(init) DataFrame.
    n = lookback period.
    '''
    def __init__(self, Database, df, n):
        '''
        Rebuild the "Temp" table and load it for pre-processing.

        The result of dbPartial("Database", n) replaces "Temp", df is then
        added through dbUpdate, and the table is read back into self.preproc.
        '''
        self.Db = Database
        self.df = df

        # presumably the last n stored rows -- confirm against the Database class
        lookback = Database.dbPartial("Database", n)
        Database.dbReplace(lookback, "Temp")
        Database.dbUpdate(df, "Temp")

        self.preproc = Database.dbRead("Temp")

    def process(self, instructions):
        '''
        Run the module-level process() on the staged data and return only
        the final row of the result (as a one-row slice).
        '''
        processed, _ = process(self.preproc, instructions)
        return processed[-1:]