In [49]:
import os
# Set & view the working directory to ensure the runtime is executing in the expected directory.
# The location can be overridden with the SYNTH_NONIN_DIR environment variable so the notebook
# is not tied to a single machine; the original path is kept as the default.
project_dir = os.environ.get("SYNTH_NONIN_DIR", "C:/Users/angusf/source/repos/synth_nonin")
os.chdir(project_dir)  # raises an OSError subclass (e.g. FileNotFoundError) if the directory is unusable
cwd = os.getcwd()
print(f"Current working directory: {cwd}")


Current working directory: C:\Users\angusf\source\repos\synth_nonin


### Prerequisites
- assumes that a training data set consisting of multiple sequences of stock data exists in the working folder
- assumes that the training data contains at least the following columns
- `Symbol`: contains the stock symbol
- `Date`: (with the specific format given by the `datetime_format` in the metadata update cell below, i.e. `%Y-%m-%d`)
- `Time`: (with the specific format given by the `datetime_format` in the metadata update cell below, i.e. `%H:%M:%S`)
- `Combineddatetime`: (with the specific format given by the `datetime_format` in the metadata update cell below, i.e. `%Y-%m-%d %H:%M:%S`)

In [50]:
import pandas

# Load the master training set and convert the combined timestamp column to datetime values.
file_path = './master_concatenated_stock_input_data.csv'
training_data_with_sequences = pandas.read_csv(file_path).assign(
    Combineddatetime=lambda frame: pandas.to_datetime(frame['Combineddatetime'])
)

# Quick sanity checks: the first rows, the row count, then the distinct symbols as the cell output.
preview = training_data_with_sequences.head(3)
print(preview)
row_count = len(training_data_with_sequences)  # number of rows of training data (same as shape[0])
print(f"Number of rows in data set: {row_count}")
training_data_with_sequences['Symbol'].unique()  # list the unique symbols in the master data set


         Date      Time     Open   High    Low  Close  Volume Symbol  \
0  2017-11-17  15:35:00  68.5300  68.72  68.38  68.67   79411      a   
1  2017-11-17  15:40:00  68.5956  68.69  68.56  68.59   10014      a   
2  2017-11-17  15:45:00  68.5700  68.67  68.51  68.62   14182      a   

     Combineddatetime  
0 2017-11-17 15:35:00  
1 2017-11-17 15:40:00  
2 2017-11-17 15:45:00  
Number of rows in data set: 193284


array(['a', 'aan', 'aav', 'abb', 'aeb', 'aed', 'aee', 'aeg', 'aegn',
       'aeh', 'aehr', 'aeis', 'aek', 'ael', 'aem', 'aemd', 'aeo', 'aep',
       'aer', 'aeri', 'aes', 'aet', 'aeti', 'aeua', 'aey', 'aezs', 'afam',
       'afb', 'afc', 'afg', 'afge', 'afgh', 'afh', 'afhbl', 'afi', 'afl',
       'afmd', 'afsd', 'afsi', 'afsi_a', 'afsi_b', 'afsi_c', 'afsi_d',
       'afsi_e', 'afsi_f', 'afss', 'afst', 'aft', 'afty', 'ag', 'agc',
       'agco', 'agd', 'agen', 'agfs', 'agfsw', 'agge', 'aggp', 'aggy',
       'agi', 'agii', 'agiil', 'agio', 'agle', 'agm-a', 'agm', 'agm_a',
       'agm_b', 'agm_c', 'agn', 'agnc', 'agncb', 'agncn', 'agn_a', 'ago',
       'ago_b', 'ago_e', 'ago_f', 'agr', 'agro', 'agrx', 'agt', 'agtc',
       'agu', 'agx', 'agys', 'ahc', 'ahgp', 'ahh', 'ahl', 'ahl_c',
       'ahl_d', 'ahp', 'ahpa', 'ahpau', 'ahpaw', 'ahpi', 'ahp_b', 'aht',
       'aht_d', 'aht_f', 'aht_g', 'aht_h', 'aht_i', 'ai', 'aic', 'aieq',
       'aif', 'aig-ws', 'aig', 'aimc', 'aimt', 'ain', 'ainc', 'ai

In [51]:
from sdv.metadata import SingleTableMetadata

# For sequence data we can start from SDV's automatic detection of the column types.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(training_data_with_sequences)
metadata  # show the auto-detected metadata before any manual updates

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "Date": {
            "sdtype": "categorical"
        },
        "Time": {
            "sdtype": "categorical"
        },
        "Open": {
            "sdtype": "numerical"
        },
        "High": {
            "sdtype": "numerical"
        },
        "Low": {
            "sdtype": "numerical"
        },
        "Close": {
            "sdtype": "numerical"
        },
        "Volume": {
            "sdtype": "numerical"
        },
        "Symbol": {
            "sdtype": "categorical"
        },
        "Combineddatetime": {
            "sdtype": "datetime"
        }
    }
}

In [52]:
# Add constraints to the market-value columns so they are understood to be positive values.
# Note that at present the PAR synthesizer is not able to handle constraints, so this is listed
# here for future proofing or for use with non-sequence data.
#
# A Positive constraint describes exactly one column, so one constraint dict is built per price
# column. (A single dict literal that repeats the 'column_name' key keeps only the LAST value,
# which meant only 'Low' was actually constrained.)
PRICE_COLUMNS = ['Open', 'Close', 'High', 'Low']

intraday_stock_value_constraint_list = [
    {
        'constraint_class': 'Positive',
        'constraint_parameters': {
            'column_name': price_column,
            # Enforce that the value is strictly positive (> 0).
            # NOTE(review): SDV 1.x names this flag 'strict_boundary' ('strict' was the pre-1.0
            # keyword) - confirm against the installed SDV version.
            'strict_boundary': True,
        },
    }
    for price_column in PRICE_COLUMNS
]

# Backward-compatible single-constraint name: still one dict, bound to the 'Low' constraint,
# which is the column the earlier duplicate-key literal resolved to. To constrain every price
# column, pass `intraday_stock_value_constraint_list` to `add_constraints` instead.
intraday_stock_value_constraints = intraday_stock_value_constraint_list[-1]

In [53]:
# Refine the auto-detected metadata: mark Symbol as an ID column, give every date/time column its
# explicit format, and declare the price/volume columns as float-valued numericals.
# Note that we are comparing with the data types in the documented stock symbol example https://colab.research.google.com/drive/1cT4-jFK2Bxe93QudC_CwHq_yVCcNcxal?usp=sharing#scrollTo=yfOLibFfR9JF
# NOTE(review): the regex below describes exactly four letters, while the symbols printed by the
# load cell include values such as 'a', 'afsi_a' and 'agm-a' - confirm this is intended.
metadata.update_column(column_name='Symbol', sdtype='id', regex_format='[a-zA-Z]{4}')

# Date/time columns mapped to the strftime pattern their values are stored in.
datetime_formats = {
    'Date': '%Y-%m-%d',
    'Time': '%H:%M:%S',
    'Combineddatetime': '%Y-%m-%d %H:%M:%S',
}
for datetime_column, pattern in datetime_formats.items():
    metadata.update_column(column_name=datetime_column, sdtype='datetime', datetime_format=pattern)

# All market-value columns share the same numerical settings.
for numeric_column in ('Open', 'Close', 'Low', 'High', 'Volume'):
    metadata.update_column(column_name=numeric_column, sdtype='numerical', computer_representation='Float')

# Symbol is the sequence key: each distinct symbol identifies one sequence.
metadata.set_sequence_key(column_name='Symbol')
# The SDV formal documentation shows the syntax `metadata.set_sequence_key(sequence_index='Time')`, which causes an error.
# Setting the sequence index to the "Combineddatetime" column does not produce time stamps that
# precisely follow the original data set's increments, so post-processing is likely needed to fix
# the time series date and time values.
metadata.set_sequence_index('Combineddatetime')

metadata  # inspect the updated metadata


{
    "sequence_key": "Symbol",
    "sequence_index": "Combineddatetime",
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "Date": {
            "sdtype": "datetime",
            "datetime_format": "%Y-%m-%d"
        },
        "Time": {
            "sdtype": "datetime",
            "datetime_format": "%H:%M:%S"
        },
        "Open": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "High": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "Low": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "Close": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "Volume": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "Symbol": {
            "sdtype": "id",
            "regex_format": "[a-zA-Z]{4}"
    

In [54]:
from sdv.sequential import PARSynthesizer

# Training options for the PAR model, gathered in one place so they are easy to tune.
par_options = {
    # Columns that do not change within a sequence; none are declared yet
    # (the date should probably become a context column).
    'context_columns': [],
    # Do not clip synthetic values to the observed min/max, as this would restrict the results
    # (unless we want similar ranges).
    'enforce_min_max_values': False,
    # Round the values to the same specificity as the source data.
    'enforce_rounding': True,
    # Number of epochs to run the model for.
    'epochs': 80,
    # Print the loss value per epoch; lower loss values indicate higher quality.
    'verbose': True,
    # GPU (CUDA) use is switched off here; set to True to use the GPU if available.
    'cuda': False,
}
synthesizer = PARSynthesizer(metadata, **par_options)

# Add the positive stock value constraints built earlier to the synthesizer
# (noting these are currently not supported in the PAR model).
synthesizer.add_constraints(constraints=[intraday_stock_value_constraints])

# Train on the full multi-sequence data set.
synthesizer.fit(training_data_with_sequences)

Epoch 80 | Loss 1.3906798362731934: 100%|██████████| 80/80 [37:09<00:00, 27.87s/it]


In [55]:
# Persist the trained synthesizer to disk so it can be reused later without retraining.
synthesizer_path = 'IntraDayStockSynthesiser.pkl'
synthesizer.save(synthesizer_path)
