Merge branch 'development' of github.com:Yelp/mrjob into development

2 parents e487941 + da354a9 · commit c247231085fdc462939762ffa9a839587495a717
David Marin committed Oct 21, 2011
Showing with 5 additions and 88 deletions.
  1. +0 −84 mrjob.conf.example
  2. +1 −2 mrjob/examples/mr_word_freq_count.py
  3. +4 −2 mrjob/local.py
mrjob.conf.example
@@ -1,84 +0,0 @@
-# This is basically the config file we use in production at Yelp, with some
-# strategic edits. ;)
-#
-# If you don't have the yaml module installed, you'll have to use JSON instead,
-# which would look something like this:
-#
-# {"runners": {
-#     "emr": {
-#         "aws_access_key_id": "HADOOPHADOOPBOBADOOP",
-#         "aws_region": "us-west-1",
-#         "aws_secret_access_key": "MEMIMOMADOOPBANANAFANAFOFADOOPHADOOP",
-#         "base_tmp_dir": "/scratch/$USER",
-#         "bootstrap_python_packages": [
-#             "$BT/aws/python-packages/*.tar.gz"
-#         ],
-#         ...
-#
-runners:
-  emr:
-    aws_access_key_id: HADOOPHADOOPBOBADOOP
-    # We run in the west region because we're located on the west coast,
-    # and there are no eventual consistency issues with newly created S3 keys.
-    aws_region: us-west-1
-    aws_secret_access_key: MEMIMOMADOOPBANANAFANAFOFADOOPHADOOP
-    # alternate tmp dir
-    base_tmp_dir: /scratch/$USER
-    # $BT is the path to our source tree. This lets us add modules to
-    # install on EMR by simply dumping them in this dir.
-    bootstrap_python_packages:
-    - $BT/aws/python-packages/*.tar.gz
-    # specifying an ssh key pair allows us to ssh tunnel to the job tracker
-    # and fetch logs via ssh
-    ec2_key_pair: EMR
-    ec2_key_pair_file: $BT/config/EMR.pem
-    # use beefier instances in production
-    ec2_instance_type: c1.xlarge
-    # but only use one unless overridden
-    num_ec2_instances: 1
-    # use our local time zone (this is important for deciding when
-    # days start and end, for instance)
-    cmdenv:
-      TZ: America/Los_Angeles
-    # we create the src-tree.tar.gz tarball with a Makefile. It only contains
-    # a subset of our code
-    python_archives: &python_archives
-    - $BT/aws/src-tree.tar.gz
-    # our bucket also lives in the us-west region
-    s3_log_uri: s3://walrus/tmp/logs/
-    s3_scratch_uri: s3://walrus/tmp/
-    setup_cmds: &setup_cmds
-    # these files are different between dev and production, so they're
-    # uploaded separately. Copying them into place isn't safe because
-    # src-tree.tar.gz is actually shared between several mappers/reducers.
-    # Another safe approach would be to add a rule to Makefile.emr that
-    # copies these files if they haven't already been copied (setup_cmds
-    # from two mappers/reducers won't run simultaneously on the same machine)
-    - ln -sf $(readlink -f config.py) src-tree.tar.gz/config/config.py
-    - ln -sf $(readlink -f secret.py) src-tree.tar.gz/config/secret.py
-    # run Makefile.emr to compile C code (EMR has a different architecture,
-    # so we can't just upload the .so files)
-    - cd src-tree.tar.gz; make -f Makefile.emr
-    # generally, we run jobs on a Linux server separate from our desktop
-    # machine, so the SSH tunnel needs to be open so a browser on our
-    # desktop machine can connect to it.
-    ssh_tunnel_is_open: true
-    ssh_tunnel_to_job_tracker: true
-    # upload these particular files on the fly because they're different
-    # between development and production
-    upload_files: &upload_files
-    - $BT/config/config.py
-    - $BT/config/secret.py
-  hadoop:
-    # Note the use of YAML references to re-use parts of the EMR config.
-    # We don't currently run our own hadoop cluster, so this section is
-    # pretty boring.
-    base_tmp_dir: /scratch/$USER
-    python_archives: *python_archives
-    setup_cmds: *setup_cmds
-    upload_files: *upload_files
-  local:
-    # We don't have gcc installed in production, so if we have to run an
-    # MRJob in local mode in production, don't run the Makefile
-    # and whatnot; just fall back on the original copy of the code.
-    base_tmp_dir: /scratch/$USER
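
The deleted example leaned on YAML anchors (&name) and aliases (*name) to share the EMR lists with the hadoop runner. A minimal sketch of how that sharing works, assuming the PyYAML package (the same yaml module the file's own comments mention) and using a trimmed stand-in for the config:

    import yaml  # PyYAML; without it, mrjob configs fall back to JSON

    # &setup_cmds defines an anchor; *setup_cmds aliases it, so both
    # runners parse to the identical list with nothing copied by hand.
    conf = yaml.safe_load("""
    runners:
      emr:
        setup_cmds: &setup_cmds
        - cd src-tree.tar.gz; make -f Makefile.emr
      hadoop:
        setup_cmds: *setup_cmds
    """)

    assert (conf['runners']['emr']['setup_cmds'] ==
            conf['runners']['hadoop']['setup_cmds'])
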
mrjob/examples/mr_word_freq_count.py
@@ -25,8 +25,7 @@ def mapper(self, _, line):
             yield (word.lower(), 1)
 
     def combiner(self, word, counts):
-        counts_list = list(counts)
-        yield (word, sum(counts_list))
+        yield (word, sum(counts))
 
     def reducer(self, word, counts):
         yield (word, sum(counts))
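
The combiner simplification works because Python's built-in sum() accepts any iterable, including the generator of counts that mrjob hands to combiners and reducers, so materializing a list first bought nothing. A quick plain-Python illustration:

    def counts():  # stand-in for the generator a combiner receives
        yield 1
        yield 1
        yield 1

    assert sum(counts()) == 3        # sum() drains the generator directly
    assert sum(list(counts())) == 3  # same result, but via a throwaway list
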
mrjob/local.py
@@ -40,6 +40,8 @@
 log = logging.getLogger('mrjob.local')
 
+DEFAULT_MAP_TASKS = 2
+DEFAULT_REDUCE_TASKS = 2
 
 
 class LocalMRJobRunner(MRJobRunner):
     """Runs an :py:class:`~mrjob.job.MRJob` locally, for testing
@@ -68,8 +70,8 @@ def __init__(self, **kwargs):
         self._prev_outfiles = []
         self._counters = []
 
-        self._map_tasks = 1
-        self._reduce_tasks = 1
+        self._map_tasks = DEFAULT_MAP_TASKS
+        self._reduce_tasks = DEFAULT_REDUCE_TASKS
 
         self._running_env = defaultdict(str)
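
With the defaults raised from 1 to 2, plain local runs now split input across two map tasks and two reduce tasks, which can surface partitioning and key-grouping bugs that a single-task run would hide. A minimal sketch of driving a job through the local runner, assuming mrjob is installed and an input.txt exists; stream_output() and parse_output_line() are the runner/job methods of this vintage of mrjob:

    from mrjob.examples.mr_word_freq_count import MRWordFreqCount

    # '-r local' selects LocalMRJobRunner, which now runs two map tasks
    # and two reduce tasks by default
    job = MRWordFreqCount(args=['-r', 'local', 'input.txt'])
    with job.make_runner() as runner:
        runner.run()
        for line in runner.stream_output():
            word, count = job.parse_output_line(line)
            print('%s\t%d' % (word, count))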
