From 136146c17a6a10cbfd2f73262bdf47c2b9c8190d Mon Sep 17 00:00:00 2001 From: Iph Date: Thu, 5 Jul 2012 23:20:36 -0700 Subject: [PATCH] Changed main folder to lib. Made an executable and added a setup.py for easy install. --- README.md | 60 +++++++++---- boto_example.config | 3 +- emrio | 5 ++ {emrio => emrio_lib}/EMRio.py | 12 +-- {emrio => emrio_lib}/__init__.py | 0 {emrio => emrio_lib}/config.py | 0 {emrio => emrio_lib}/ec2/__init__.py | 0 {emrio => emrio_lib}/ec2/east_coast_prices.py | 0 {emrio => emrio_lib}/ec2/west_coast_prices.py | 0 {emrio => emrio_lib}/ec2_cost.py | 88 +++++++++---------- {emrio => emrio_lib}/graph_jobs.py | 3 +- {emrio => emrio_lib}/job_handler.py | 0 {emrio => emrio_lib}/optimizer.py | 6 +- {emrio => emrio_lib}/simulate_jobs.py | 0 setup.py | 37 ++++++++ tests/test_ec2_cost.py | 2 +- tests/test_emrio.py | 4 +- tests/test_instance_predictor.py | 4 +- tests/test_job_handler.py | 6 +- tests/test_optimize.py | 4 +- 20 files changed, 154 insertions(+), 80 deletions(-) create mode 100644 emrio rename {emrio => emrio_lib}/EMRio.py (97%) rename {emrio => emrio_lib}/__init__.py (100%) rename {emrio => emrio_lib}/config.py (100%) rename {emrio => emrio_lib}/ec2/__init__.py (100%) rename {emrio => emrio_lib}/ec2/east_coast_prices.py (100%) rename {emrio => emrio_lib}/ec2/west_coast_prices.py (100%) rename {emrio => emrio_lib}/ec2_cost.py (81%) rename {emrio => emrio_lib}/graph_jobs.py (97%) rename {emrio => emrio_lib}/job_handler.py (100%) rename {emrio => emrio_lib}/optimizer.py (96%) rename {emrio => emrio_lib}/simulate_jobs.py (100%) create mode 100644 setup.py diff --git a/README.md b/README.md index bd3ca10..8184908 100644 --- a/README.md +++ b/README.md @@ -3,47 +3,77 @@ EMRio Elastic MapReduce instance optimizer +EMRio helps you save money on Elastic MapReduce by using your last two +months of usage to estimate how many EC2 reserved instances you should buy +for the next year. 
+ Introduction ------------ -Elastic MapReduce is a service provided by Amazon that makes it easy to use MapReduce. EMR run on machines called EC2 instances. They come in many different flavors from heavy memory usage to heavy CPU usage. When businesses start using EMR, they use these services as a pay-as-you-go service. After some time, the amount of instances you use can become stable. If you utilize enough instances over time, it might make sense to switch from the pay-as-you-go service, or On-Demand service, to a pay-upfront service, or Reserved Instances service. +Elastic MapReduce is a service provided by Amazon that makes it easy to use +MapReduce. EMR runs on machines called EC2 instances. They come in many +different flavors from heavy memory usage to heavy CPU usage. When businesses +start using EMR, they use these services as a pay-as-you-go service. After +some time, the number of instances you use can become stable. If you utilize +enough instances over time, it might make sense to switch from the +pay-as-you-go service, or On-Demand service, to a pay-upfront service, or +Reserved Instances service. + +How Reserved Instances work is explained +[here](http://aws.amazon.com/ec2/reserved-instances/). If you think that +switching to reserved instances is a good plan, but don't know how many to +buy, that's what EMRio is for! -How Reserved Instances work can be read [here](http://aws.amazon.com/ec2/reserved-instances/). If you think that switching to reserved instances is a good plan, but don't know how many to buy, that's what EMRio is for! How It Works ------------ -EMRio first looks at your EMR history. That data has a two month limit. It then acts as if the job flow was reoccurring for a year. It has to estimate a year's worth of data for Reserved Instances to be worth the cost. It then simulates different configurations using the job flow history and will produce the best pool of instances to buy. +EMRio first looks at your EMR history. 
That data has a two-month limit. It +then acts as if the job flows kept recurring for a year. It has to estimate +a year's worth of data for Reserved Instances to be worth the cost. It then +simulates different configurations using the job flow history and will +produce the best pool of instances to buy. Dependencies ------------ - -boto - -tzinfo - -matplotlib +* boto +* tzinfo +* matplotlib How to Run EMRio ---------------- -Once you have the dependencies installed, you need to set up your boto configuration file. Look at our boto config as an example. Once you fill in the AWS key information and region information, copy it to either /etc/boto.conf or ~/.boto +Once you have the dependencies installed, you need to set up your boto +configuration file. Look at our boto config as an example. Once you fill in +the AWS key information and region information, copy it to either +/etc/boto.conf or ~/.boto -After that is setup, cd into emrio and run: +After that is set up, `cd` into `emrio_lib` and run: python EMRio.py -This should take a minute or two to grab the information off S3, do a few simulations, and output the resultant optimized instance pool. +This should take a minute or two to grab the information off S3, do a few +simulations, and output the resultant optimized instance pool. 
-If you want to see instance usage over time (how many instances are running at the same time), you run:: +If you want to see instance usage over time (how many instances are running +at the same time), run: python EMRio.py --graph cost -After it calculates the same data, you will now see graphs of each instance-type's usage over time, like this:: +After it calculates the same data, you will now see graphs of each +instance-type's usage over time, like this: IMAGE HERE -Now, re-calculating the optimal instances is kind of pointless on the same data, so in order to save and load optimal instance configurations, use this: +Now, re-calculating the optimal instances is kind of pointless on the same +data, so in order to save and load optimal instance configurations, use this: - python EMRio.py --save-optimized=output.txt + python EMRio.py --cache=output.txt -If you want to see how this is formatted, check out the tests folder where an example instance file can be found. +If you want to see how this is formatted, check out the tests folder where +an example instance file can be found. Which will save the results in output.txt, and load them like so: python EMRio.py --optimized=output.txt -If you want to see all the commands, try `--help`. 
+ + python EMRio.py --help + diff --git a/boto_example.config b/boto_example.config index 58f8218..29ffc7d 100644 --- a/boto_example.config +++ b/boto_example.config @@ -8,9 +8,8 @@ emr_region_name = us-west-1 emr_region_endpoint = us-west-1.elasticmapreduce.amazonaws.com ec2_region_endpoint = us-west-1.ec2.amazonaws.com -## Here are some examples of other regions tat you can use other than us-west-1 +## Here are some examples of other regions that you can use other than us-west-1 ## The list of possible regions are currently (June 28th 2012): -## us-east-1 (US EAST) ## us-west-1 (US WEST NORTH CALFORNIA) ## us-west-2 (US WEST OREGON) ## eu-west-1 (EU IRELAND) diff --git a/emrio b/emrio new file mode 100644 index 0000000..ffbef9c --- /dev/null +++ b/emrio @@ -0,0 +1,5 @@ +#!/usr/bin/python +import emrio_lib +import sys +if __name__ == '__main__': + emrio_lib.EMRio.main(sys.argv[1:]) diff --git a/emrio/EMRio.py b/emrio_lib/EMRio.py similarity index 97% rename from emrio/EMRio.py rename to emrio_lib/EMRio.py index 7549d38..effdec6 100644 --- a/emrio/EMRio.py +++ b/emrio_lib/EMRio.py @@ -16,6 +16,7 @@ import boto from config import EC2 +from ec2_cost import instance_types_in_pool from graph_jobs import instance_usage_graph from graph_jobs import total_hours_graph from job_handler import get_job_flows, load_job_flows_from_amazon @@ -55,9 +56,8 @@ def main(args): def make_option_parser(): - usage = '%prog [options]' description = 'Print a giant report on EMR usage.' 
- option_parser = OptionParser(usage=usage, description=description) + option_parser = OptionParser(description=description) option_parser.add_option( '-v', '--verbose', dest='verbose', default=False, action='store_true', help='print more messages to stderr') @@ -84,7 +84,7 @@ def make_option_parser(): 'starts before this day, it is discarded (e.g.: --max-days 2012/05/07)') ) option_parser.add_option( - '-f', '--file', dest='file_inputs', type='string', default=None, + '--file', dest='file_inputs', type='string', default=None, help="Input a file that has job flows JSON encoded. The format is 1 job" "per line or comma separated jobs." ) @@ -93,7 +93,7 @@ def make_option_parser(): help=("Uses a previously saved optimized pool instead of calculating it from" " the job flows")) option_parser.add_option( - '--save_optimized', dest='save', type='string', default=None, + '--cache', dest='save', type='string', default=None, help='Save the optimized results so you dont calculate them multiple times') option_parser.add_option( '-g', '--graph', dest='graph', type='string', default='None', @@ -298,8 +298,8 @@ def output_statistics(log, pool, demand_log,): owned_reserved_instances = get_owned_reserved_instances() buy_instances = calculate_instances_to_buy(owned_reserved_instances, pool) - all_instances = EC2.instance_types_in_pool(pool) - all_instances.union(EC2.instance_types_in_pool(owned_reserved_instances)) + all_instances = instance_types_in_pool(pool) + all_instances.union(instance_types_in_pool(owned_reserved_instances)) print "%20s %15s %15s %15s" % ('', 'Optimal', 'Owned', 'To Purchase') for utilization_class in EC2.RESERVE_PRIORITIES: diff --git a/emrio/__init__.py b/emrio_lib/__init__.py similarity index 100% rename from emrio/__init__.py rename to emrio_lib/__init__.py diff --git a/emrio/config.py b/emrio_lib/config.py similarity index 100% rename from emrio/config.py rename to emrio_lib/config.py diff --git a/emrio/ec2/__init__.py b/emrio_lib/ec2/__init__.py 
similarity index 100% rename from emrio/ec2/__init__.py rename to emrio_lib/ec2/__init__.py diff --git a/emrio/ec2/east_coast_prices.py b/emrio_lib/ec2/east_coast_prices.py similarity index 100% rename from emrio/ec2/east_coast_prices.py rename to emrio_lib/ec2/east_coast_prices.py diff --git a/emrio/ec2/west_coast_prices.py b/emrio_lib/ec2/west_coast_prices.py similarity index 100% rename from emrio/ec2/west_coast_prices.py rename to emrio_lib/ec2/west_coast_prices.py diff --git a/emrio/ec2_cost.py b/emrio_lib/ec2_cost.py similarity index 81% rename from emrio/ec2_cost.py rename to emrio_lib/ec2_cost.py index 5c8182b..670b524 100644 --- a/emrio/ec2_cost.py +++ b/emrio_lib/ec2_cost.py @@ -21,6 +21,7 @@ import copy from collections import defaultdict + class EC2Info(object): """This class is used to store EC2 info like costs from the config file. All the functions in it use that config to build pools or @@ -140,23 +141,6 @@ def init_reserve_costs(self, init_value): reserve_costs[utilization_class] = init_value return reserve_costs - @staticmethod - def instance_types_in_pool(pool): - """Gets the set of all instance types in - a pool or log - - Args: - pool: Instances currently owned for each utilization_classization type. - - Returns: - A set of all the instances used for all utilization_classization types. - """ - instance_types = set() - for utilization_class in pool: - for instance_type in pool[utilization_class]: - instance_types.add(instance_type) - return instance_types - def is_reserve_type(self, instance_type): """This just returns if a utilization_classization type is a reserve instance. If not, it is probably DEMAND type. @@ -196,33 +180,49 @@ def color_scheme(self): green = int(green + increment) return colors - @staticmethod - def fill_instance_types(job_flows, pool): - """Use this function to fill the instance pool - with all the instance types used in the job flows. 
- - example: if the job_flows has m1.small, and m1.large - and we had 2 utils of LIGHT_UTIL and HEAVY_UTIL, the - resultant pool from the function will be: - - pool = { - LIGHT_UTIL: { - 'm1.small': 0, 'm1.large': 0 - } - HEAVY_UTIL: { - 'm1.small': 0, 'm1.large': 0 - } + +def fill_instance_types(job_flows, pool): + """Use this function to fill the instance pool + with all the instance types used in the job flows. + + example: if the job_flows has m1.small, and m1.large + and we had 2 utils of LIGHT_UTIL and HEAVY_UTIL, the + resultant pool from the function will be: + + pool = { + LIGHT_UTIL: { + 'm1.small': 0, 'm1.large': 0 } - Args: - pool: A dict of utilization level dictionaries with nothing in them. + HEAVY_UTIL: { + 'm1.small': 0, 'm1.large': 0 + } + } + Args: + pool: A dict of utilization level dictionaries with nothing in them. - Mutates: - pool: for each utilization type, it fills in all the instance_types - that any job uses. - """ - for job in job_flows: - for instance in job.get('instancegroups'): - instance_type = instance.get('instancetype') - for utilization_class in pool.keys(): - pool[utilization_class][instance_type] = pool[utilization_class][instance_type] + Mutates: + pool: for each utilization type, it fills in all the instance_types + that any job uses. + """ + for job in job_flows: + for instance in job.get('instancegroups'): + instance_type = instance.get('instancetype') + for utilization_class in pool.keys(): + pool[utilization_class][instance_type] = pool[utilization_class][instance_type] + + +def instance_types_in_pool(pool): + """Gets the set of all instance types in + a pool or log + Args: + pool: Instances currently owned for each utilization_classization type. + + Returns: + A set of all the instances used for all utilization_classization types. 
+ """ + instance_types = set() + for utilization_class in pool: + for instance_type in pool[utilization_class]: + instance_types.add(instance_type) + return instance_types diff --git a/emrio/graph_jobs.py b/emrio_lib/graph_jobs.py similarity index 97% rename from emrio/graph_jobs.py rename to emrio_lib/graph_jobs.py index 4d3344e..1ccc162 100644 --- a/emrio/graph_jobs.py +++ b/emrio_lib/graph_jobs.py @@ -10,6 +10,7 @@ import matplotlib.pyplot as plt from config import EC2 +from ec2_cost import instance_types_in_pool from simulate_jobs import Simulator, SimulationObserver COLORS = EC2.color_scheme() @@ -73,7 +74,7 @@ def graph_over_time(info_over_time, if end_time.hour != 0: end_time = end_time.replace(hour=0, day=(end_time.day + 1)) - for instance_type in EC2.instance_types_in_pool(info_over_time): + for instance_type in instance_types_in_pool(info_over_time): # Locators / Formatters to pretty up the graph. hours = mdates.HourLocator(byhour=None, interval=1) days = mdates.DayLocator(bymonthday=None, interval=1) diff --git a/emrio/job_handler.py b/emrio_lib/job_handler.py similarity index 100% rename from emrio/job_handler.py rename to emrio_lib/job_handler.py diff --git a/emrio/optimizer.py b/emrio_lib/optimizer.py similarity index 96% rename from emrio/optimizer.py rename to emrio_lib/optimizer.py index 1b0fac4..3f197f9 100644 --- a/emrio/optimizer.py +++ b/emrio_lib/optimizer.py @@ -5,6 +5,8 @@ import logging from math import ceil +from ec2_cost import instance_types_in_pool +from ec2_cost import fill_instance_types from simulate_jobs import Simulator @@ -32,8 +34,8 @@ def run(self, pre_existing_pool=None): # Zero-ing the instances just makes it so the optimized pool # knows all the instance_types the job flows use beforehand. 
- self.EC2.fill_instance_types(self.job_flows, optimized_pool) - for instance in self.EC2.instance_types_in_pool(optimized_pool): + fill_instance_types(self.job_flows, optimized_pool) + for instance in instance_types_in_pool(optimized_pool): logging.debug("Finding optimal instances for %s", instance) self.optimize_reserve_pool(instance, optimized_pool) return optimized_pool diff --git a/emrio/simulate_jobs.py b/emrio_lib/simulate_jobs.py similarity index 100% rename from emrio/simulate_jobs.py rename to emrio_lib/simulate_jobs.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..faa94e6 --- /dev/null +++ b/setup.py @@ -0,0 +1,37 @@ +import os +from setuptools import setup +setuptools_kwargs = { + 'install_requires': [ + 'boto>=2.2.0', + 'PyYAML', + 'simplejson>=2.0.9', + ], + 'provides': ['emrio'], + 'tests_require': ['unittest2'], + } + + +# Utility function to read the README file. +# Used for the long_description. It's nice, because now 1) we have a top level +# README file and 2) it's easier to type in the README file than to put a raw +# string in below ... 
+def read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() + +setup( + name="emrio", + version="0.0.1", + author="Sean Myers", + author_email="SeanMyers0608@gmail.com", + description=("EMR instance optimizer will take your past EMR history and" + "attempt to optimize the max reserved instances for it"), + license="Apache?", + keywords="EMRio EMR Instance Optimizer Reserved Instances", + url="http://github.com/Yelp/EMRio", + packages=['emrio_lib', 'tests'], + long_description=read('README.md'), + classifiers=[ + "Development Status :: 3 - Alpha", + "Topic :: Utilities", + ], +) diff --git a/tests/test_ec2_cost.py b/tests/test_ec2_cost.py index 51d31fb..2c41fc7 100644 --- a/tests/test_ec2_cost.py +++ b/tests/test_ec2_cost.py @@ -5,7 +5,7 @@ import unittest from collections import defaultdict -from emrio.ec2_cost import EC2Info +from emrio_lib.ec2_cost import EC2Info from test_prices import HEAVY_UTIL, MEDIUM_UTIL, LIGHT_UTIL, DEMAND from test_prices import COST, RESERVE_PRIORITIES diff --git a/tests/test_emrio.py b/tests/test_emrio.py index fedf2a8..e114d2b 100644 --- a/tests/test_emrio.py +++ b/tests/test_emrio.py @@ -1,7 +1,7 @@ """Tests for the main EMRio module are here.""" import unittest -from emrio.ec2_cost import EC2Info -from emrio.EMRio import read_optimal_instances +from emrio_lib.ec2_cost import EC2Info +from emrio_lib.EMRio import read_optimal_instances from test_prices import COST, RESERVE_PRIORITIES EC2 = EC2Info(COST, RESERVE_PRIORITIES) diff --git a/tests/test_instance_predictor.py b/tests/test_instance_predictor.py index 9a31d7e..35b37f1 100644 --- a/tests/test_instance_predictor.py +++ b/tests/test_instance_predictor.py @@ -3,8 +3,8 @@ import datetime from unittest import TestCase -from emrio.ec2_cost import EC2Info -from emrio.simulate_jobs import Simulator +from emrio_lib.ec2_cost import EC2Info +from emrio_lib.simulate_jobs import Simulator from test_prices import COST, HEAVY_UTIL, MEDIUM_UTIL, LIGHT_UTIL, 
RESERVE_PRIORITIES from test_prices import DEMAND diff --git a/tests/test_job_handler.py b/tests/test_job_handler.py index 44acc12..7dab64e 100644 --- a/tests/test_job_handler.py +++ b/tests/test_job_handler.py @@ -4,10 +4,10 @@ import pytz # Setup a mock EC2 since west coast can be changed in the future. -from emrio.job_handler import no_date_filter, range_date_filter -from emrio.ec2_cost import EC2Info +from emrio_lib.job_handler import no_date_filter, range_date_filter +from emrio_lib.ec2_cost import EC2Info from test_prices import COST, RESERVE_PRIORITIES -from emrio.config import TIMEZONE +from emrio_lib.config import TIMEZONE EC2 = EC2Info(COST, RESERVE_PRIORITIES) diff --git a/tests/test_optimize.py b/tests/test_optimize.py index 1830c9c..5e09a76 100644 --- a/tests/test_optimize.py +++ b/tests/test_optimize.py @@ -4,8 +4,8 @@ import copy from math import ceil -from emrio.optimizer import Optimizer, convert_to_yearly_estimated_hours -from emrio import ec2_cost +from emrio_lib.optimizer import Optimizer, convert_to_yearly_estimated_hours +from emrio_lib import ec2_cost from test_prices import * EC2 = ec2_cost.EC2Info(COST, RESERVE_PRIORITIES)