In [4]:
import heapq
import operator
import random
import sys
import warnings
from builtins import object
from builtins import zip
from past.builtins import long
from apache_beam.transforms import core
from apache_beam.transforms import cy_combiners
from apache_beam.transforms import ptransform
from apache_beam.transforms import window
from apache_beam.transforms.display import DisplayDataItem
from apache_beam.typehints import KV
from apache_beam.typehints import Any
from apache_beam.typehints import Dict
from apache_beam.typehints import Iterable
from apache_beam.typehints import List
from apache_beam.typehints import Tuple
from apache_beam.typehints import TypeVariable
from apache_beam.typehints import Union
from apache_beam.typehints import with_input_types
from apache_beam.typehints import with_output_types

In [None]:
class AggregateTrades(beam.PTransform):
  """Computes the longest session ending in each month."""
  def expand(self, pcoll):
    
    # get quantities
    quantities = (pcoll
        | '' >> beam.ParDo(FormatPubsubDoFn())
    )
    
    # get prices
    prices = (pcoll
        | 'FormatPubsub' >> beam.ParDo(FormatPubsubDoFn())
    )
    

In [6]:
# Type variables
K = TypeVariable('K')
V = TypeVariable('V')

Q = TypeVariable('Q')
P = TypeVariable('P')
S = TypeVariable('S')
T = TypeVariable('T')

@with_output_types(TupleDict[K, V])
class TradesCombineFn(core.CombineFn):
    
    def create_accumulator(self):
        return dict()

    def __init__(self, n, compare=None, key=None, reverse=False):
        pass

    def add_input(self, accumulator, element):
        quantity, price, side, time = element
        accumulator.append(element)
        return accumulator

    def process(self, element): 
        trades = element[1]
        
        count = len(trades)
        
        prcs = [t['price'] for t in trades]
        highPrice = max(prcs)
        lowPrice = min(prcs)
        meanPrice = sum(prcs)/count
        
        qtys = [t['quantity']*t['price'] for t in trades]
        volume = sum(qtys)
        minQty = min(qtys)
        maxQty = max(qtys)
        meanQty = sum(qtys)/count
        
        last = sorted(trades, key=itemgetter('event_time_ms'))[0]
        
        closePrice = last['price']
        closeQty = last['quantity']*closePrice
        
        vwap = ((closePrice+lowPrice+highPrice)/3)*volume
        
        npPrcs = np.array(prcs)
        npQtys = np.array(qtys)
        
        stdPrice = np.std(npPrcs)
        stdQty = np.std(npQtys)
        
        return [{
            'closePrice': closePrice,
            'highPrice': highPrice,
            'lowPrice': lowPrice,
            'meanPrice': meanPrice,
            'stdPrice': stdPrice,
            'closeQty': closeQty,
            'maxQty': maxQty,
            'minQty': minQty,
            'meanQty': meanQty,
            'stdQty': stdQty,
            'count': count,
            'vwap': vwap,
            'volume': volume
        }]

    def merge_accumulators(self, accumulators):
        result = dict()
        for a in accumulators:
          result.update(a)
        return result

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 70)

In [None]:
def run(argv=None):
    """Build and run the pipeline."""

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = False
    p = beam.Pipeline(options=pipeline_options)

    # ==================================================================>
    # Trades From Avro
    # ==================================================================>

    tradeRecords = \
        p | 'readTrades' >> ReadFromAvro(
            "../resources/trades/*.avro", 
            use_fastavro=True
        )
    
    # maintains orderly depth state and emits full levels for each 
    # depth update event recieved.
    enrichedDepths = (
        depthUpdateRecords
        | 'FormatPubsub' >> beam.ParDo(FormatPubsubDoFn())
        | 'DiscreteWindows' >> beam.WindowInto(window.FixedWindows((2*60)*100, 0))
        | 'AddSessionInfo' >> beam.ParDo(AddSessionInfoDoFn())
        | 'MapToWindowedPartitionIdKV' >> beam.ParDo(MapToWindowedPartitionIdKVDoFn())
        | 'FilterByPartition' >> beam.Filter(lambda kv: kv[0] == "okex_spot_depthUpdate_ETH_BTC")
        | 'BatchDepthUpdates' >> beam.ParDo(BatchDepthUpdatesDoFn())
#         | 'EnrichDepths' >> beam.ParDo(EnrichDepthsDoFn())
        | 'PrintOutputs' >> beam.ParDo(PrintDoFn())
    )
    
#     mergedEvents = ((enrichedDepths,tradeRecords) | 'MergePCollections' >> beam.Flatten())
    
#     windowed = (
#        enrichedDepths
#        | 'AddEventTimestamp' >> beam.Map(lambda e: beam.window.TimestampedValue(e, e["event_time_ms"]))
#        | 'DiscreteWindows' >> beam.WindowInto(window.FixedWindows((2*60)*1000, 0))  2min
#        | 'AddSessionInfo' >> beam.ParDo(AddSessionInfoDoFn())
#        | 'MapToWindowedPartitionIdKV' >> beam.ParDo(MapToWindowedPartitionIdKVDoFn())
#        | 'FilterByPartition' >> beam.Filter(lambda kv: kv[0] == "okex_spot_depthUpdate_ETH_BTC")
#      | 'ExtractDoFn' >> beam.ParDo(ExtractDoFn())   
#      | 'FilterDuplicates' >> FilterDuplicates()
#      )

#     tradeAggregation = (windowed 
# #          | 'FilterDepths' >> beam.ParDo(FilterTrades())

#          | 'MapToWindowedPartitionIdKV' >> beam.ParDo(MapToWindowedPartitionIdKVDoFn())
#          | 'GroupByWindowKey' >> GroupByKey()
#          | 'AggregateTrades' >> beam.ParDo(AggregateTradesDoFn())
#          | 'PrintOutputs' >> beam.ParDo(PrintDoFn())
#     )    
        
#     depthUpdateAggregation = (windowed 
# #        | 'FilterDepths' >> beam.ParDo(FilterDepths())
# #        | 'MapToWindowedPartitionIdKV' >> beam.ParDo(MapToWindowedPartitionIdKVDoFn())
# #        | 'GroupByWindowKey' >> GroupByKey()
#        | 'AggregateDepths' >> beam.ParDo(AggregateDepthsDoFn())
#        | 'PrintDepthOutputs' >> beam.ParDo(PrintDoFn())
#     )
    
#        | 'Merge trade and depth aggregations'
#        | 'JoinByWindow' >> beam.ParDo(FilterDuplicatesDoFn())
#        | 'AggregateFeatures' >> beam.ParDo(AggregateFeaturesDoFn())

    # ==================================================================>
    # Windowed Aggregations and Ingress
    # ==================================================================>
    
    #     filtered_words | 'WriteMyFile' >> beam.io.WriteToText(
    #       './outputData.txt')

    # For each window 
    # --------------------------------->
    # combine 
    # window depth updates and trades
    # remove duplicates
    # update depth cache
    # aggregate trade events
    # aggregate depth events
    # join depth and trade aggregations by window 
    # emit full row
    # store row in avro with partitioned file layout

    result = p.run()
    result.wait_until_finish()