From 2a2f43fb735561d1bd9d9864c427935d305ae389 Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Wed, 11 Jun 2014 17:12:34 +0530 Subject: [PATCH 1/2] [SPARK-2014] Make PySpark store RDDs in MEMORY_ONLY_SER with compression by default --- python/pyspark/conf.py | 6 ++++++ python/pyspark/context.py | 2 +- python/pyspark/rdd.py | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py index 8eff4a242a529..17d2c08af7fd8 100644 --- a/python/pyspark/conf.py +++ b/python/pyspark/conf.py @@ -99,6 +99,12 @@ def set(self, key, value): self._jconf.set(key, unicode(value)) return self + def setIfMissing(self, key, value): + """Set a configuration property, if not already set.""" + if self.get(key) == None: + self.set(key, value) + return self + def setMaster(self, value): """Set master URL to connect to.""" self._jconf.setMaster(value) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 062bec2381a8f..29d37fc15483a 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -102,7 +102,7 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, else: self.serializer = BatchedSerializer(self._unbatched_serializer, batchSize) - + self._conf.setIfMissing("spark.rdd.compress", "true") # Set any parameters passed directly to us on the conf if master: self._conf.setMaster(master) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 9c69c79236edc..b9414b2ca5c20 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -194,10 +194,10 @@ def context(self): def cache(self): """ - Persist this RDD with the default storage level (C{MEMORY_ONLY}). + Persist this RDD with the default storage level (C{MEMORY_ONLY_SER}). """ self.is_cached = True - self._jrdd.cache() + self.persist(StorageLevel.MEMORY_ONLY_SER) return self def persist(self, storageLevel): From f192df718b1f2cd05f03fbe0d5ca63b38ad144ce Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Thu, 24 Jul 2014 13:05:45 +0530 Subject: [PATCH 2/2] Code Review --- python/pyspark/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py index 17d2c08af7fd8..ea20c49c114da 100644 --- a/python/pyspark/conf.py +++ b/python/pyspark/conf.py @@ -101,7 +101,7 @@ def set(self, key, value): def setIfMissing(self, key, value): """Set a configuration property, if not already set.""" - if self.get(key) == None: + if self.get(key) is None: self.set(key, value) return self