In [1]:
import apache_beam as beam

In [2]:
from __future__ import absolute_import

import argparse
import logging
import re

from past.builtins import unicode

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


# In a notebook, sys.argv holds the Jupyter kernel's own launch flags (for
# example `-f <connection-file>`). With argv=None argparse would read those
# and pass them through to Beam as unknown pipeline arguments, so parse an
# explicit empty list instead; every option then takes its default below.
argv = []

# Options owned by this notebook. parse_known_args() returns anything it does
# not recognise in a separate list, which is forwarded to Beam.
parser = argparse.ArgumentParser()
parser.add_argument('--input',
                  dest='input',
                  default='gs://dataflow-samples/shakespeare/kinglear.txt',
                  help='Input file to process.')
parser.add_argument('--output',
                  dest='output',
                  # CHANGE 1/5: The Google Cloud Storage path is required
                  # for outputting the results.
                  default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
                  help='Output file to write results to.')
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_args.extend([
  # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
  # run your pipeline on the Google Cloud Dataflow Service.
  '--runner=DirectRunner',
  # CHANGE 3/5: Your project ID is required in order to run your pipeline on
  # the Google Cloud Dataflow Service.
  '--project=SET_YOUR_PROJECT_ID_HERE',
  # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
  # files.
  '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
  # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
  # files.
  '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
  '--job_name=your-wordcount-job',
])

# We use the save_main_session option because one or more DoFn's in this
# workflow rely on global context (e.g., a module imported at module level).
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = True

In [3]:
# Create the pipeline object, configured with the options assembled above;
# the transforms in the following cells are applied to it.
pipeline = beam.Pipeline(options=pipeline_options)

In [4]:
# Text file used as the pipeline input for this walkthrough.
# NOTE(review): this is a machine-specific absolute path, and the parsed
# --input option (known_args.input) is not used here.
path = '/Users/aaron/Documents/github/tfx/RELEASE.md'

# Apply the text source to the pipeline; `lines` is the resulting PCollection.
lines = pipeline | ReadFromText(path)

In [5]:
# Display the PCollection handle returned by the read step.
lines

<PCollection[ReadFromText/Read.None] at 0x114c9f6d0>

In [6]:
# Inspect the instance attributes stored on the PCollection object.
vars(lines)

{'element_type': Any,
 'pipeline': <apache_beam.pipeline.Pipeline at 0x114c84650>,
 'producer': AppliedPTransform(ReadFromText/Read, Read),
 'tag': None}

In [7]:
# Word count in three labelled steps:
#   Split        - break each line into tokens made of ASCII letters and
#                  apostrophes,
#   PairWithOne  - turn each token into a (token, 1) pair,
#   GroupAndSum  - add up the ones for each distinct token.
counts = (
    lines
    | 'Split' >> beam.FlatMap(
        lambda line: re.findall(r'[A-Za-z\']+', line)
    ).with_output_types(unicode)
    | 'PairWithOne' >> beam.Map(lambda word: (word, 1))
    | 'GroupAndSum' >> beam.CombinePerKey(sum)
)

In [8]:
# Display the PCollection handle produced by the counting steps.
counts

<PCollection[GroupAndSum/Combine/ParDo(CombineValuesDoFn).None] at 0x114ceb2d0>

In [9]:
# Inspect the instance attributes stored on the counts PCollection.
vars(counts)

{'element_type': Any,
 'pipeline': <apache_beam.pipeline.Pipeline at 0x114c84650>,
 'producer': AppliedPTransform(GroupAndSum/Combine/ParDo(CombineValuesDoFn), ParDo),
 'tag': None}

In [10]:
def format_result(word_count):
  """Render one (word, count) pair as the text line 'word: count'.

  Args:
    word_count: A two-item sequence holding the word and its count.

  Returns:
    The string '<word>: <count>'.
  """
  # Unpack rather than index so a pair of the wrong length still fails here.
  word, count = word_count
  template = '%s: %s'
  return template % (word, count)

# Map every (word, count) pair to its printable 'word: count' line.
output = counts | 'Format' >> beam.Map(format_result)

In [11]:
# Display the PCollection handle holding the formatted lines.
output

<PCollection[Format.None] at 0x114c96590>

In [12]:
# Show the parsed --output value (still the placeholder default).
# The write step below uses a local path instead of this value.
known_args.output

'gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX'

In [13]:
# Attach a text sink for the formatted lines.
# NOTE(review): machine-specific absolute path. The directory listing further
# down shows the result as a shard file named `<prefix>-00000-of-00001`.
output | WriteToText('/Users/aaron/Desktop/beam-tuesday-3.txt')

<PCollection[WriteToText/Write/WriteImpl/FinalizeWrite.None] at 0x114d0aa90>

In [14]:
# Run the pipeline and block until it has finished, so the cells below only
# inspect complete output. run() alone returns a result handle and, on an
# asynchronous runner such as DataflowRunner, may return while the job is
# still executing.
result = pipeline.run()
result.wait_until_finish()

# Keep the result object as the cell's displayed value.
result

<apache_beam.runners.portability.fn_api_runner.RunnerResult at 0x114f71590>

In [15]:
!ls /Users/aaron/Desktop/

Downloaded Apps
Europe Trip Oct 2016
Loan closing docs
Old Firefox Data
Screen Shot 2019-03-17 at 4.33.53 PM.png
Screen Shot 2019-03-17 at 4.33.57 PM.png
Screen Shot 2019-03-24 at 7.34.10 PM (2).png
Screen Shot 2019-03-24 at 7.34.10 PM.png
Screen Shots
ServiceChannel
beam-tuesday-2.txt-00000-of-00001
beam-tuesday-3.txt-00000-of-00001
beam-tuesday.txt-00000-of-00001
counts-00000-of-00001
jpeg
license_plate_detection
license_plate_detection_100
maskrcnn-benchmark-photos
png
rtl8812AU_8821AU_linux


In [16]:
!head /Users/aaron/Desktop/beam-tuesday-2.txt-00000-of-00001

serving: 1
supported: 2
code: 1
looks: 1
still: 1
yet: 1
CSVExampleGen: 1
based: 1
shuffling: 1
notebooks: 1
