MAPREDUCE-6622. Add capability to set JHS job cache to a task-based limit (rchiang via rkanter)
apache · Feb 27, 2016 · 0f72da7 · 0f72da7
1 parent d1d4e16
commit 0f72da7
Show file tree

Hide file tree

Showing 5 changed files with 381 additions and 58 deletions.
diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt
@@ -322,6 +322,9 @@ Release 2.9.0 - UNRELEASED
     MAPREDUCE-6640. mapred job -history command should be able to take
     Job ID (rkanter)
 
+    MAPREDUCE-6622. Add capability to set JHS job cache to a
+    task-based limit (rchiang via rkanter)
+
   OPTIMIZATIONS
 
   BUG FIXES

diff --git a/...-client-common/src/main/java/org/apache/hadoop/mapreduce/v2/jobhistory/JHAdminConfig.java b/...-client-common/src/main/java/org/apache/hadoop/mapreduce/v2/jobhistory/JHAdminConfig.java
@@ -98,15 +98,19 @@ public class JHAdminConfig {
   public static final String MR_HISTORY_JOBLIST_CACHE_SIZE =
     MR_HISTORY_PREFIX + "joblist.cache.size";
   public static final int DEFAULT_MR_HISTORY_JOBLIST_CACHE_SIZE = 20000;
-  
+
   /** The location of the Kerberos keytab file.*/
   public static final String MR_HISTORY_KEYTAB = MR_HISTORY_PREFIX + "keytab";
 
   /** Size of the loaded job cache.*/
   public static final String MR_HISTORY_LOADED_JOB_CACHE_SIZE = 
     MR_HISTORY_PREFIX + "loadedjobs.cache.size";
   public static final int DEFAULT_MR_HISTORY_LOADED_JOB_CACHE_SIZE = 5;
-
+
+  /** Size of the loaded job cache (in tasks).*/
+  public static final String MR_HISTORY_LOADED_TASKS_CACHE_SIZE =
+      MR_HISTORY_PREFIX + "loadedtasks.cache.size";
+
   /**
    * The maximum age of a job history file before it is deleted from the history
    * server.

diff --git a/...adoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml b/...adoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml
@@ -1656,7 +1656,32 @@
 <property>
   <name>mapreduce.jobhistory.loadedjobs.cache.size</name>
   <value>5</value>
-  <description>Size of the loaded job cache</description>
+  <description>Size of the loaded job cache.  This property is ignored if
+  the property mapreduce.jobhistory.loadedtasks.cache.size is set to a
+  positive value.
+  </description>
+</property>
+
+<property>
+  <name>mapreduce.jobhistory.loadedtasks.cache.size</name>
+  <value></value>
+  <description>Change the job history cache limit to be set in terms
+  of total task count.  If the total number of tasks loaded exceeds
+  this value, then the job cache will be shrunk down until it is
+  under this limit (minimum 1 job in cache).  If this value is empty
+  then the cache reverts to mapreduce.jobhistory.loadedjobs.cache.size
+  as a job cache size.  A zero or negative value is treated as 1.
+
+  Two recommendations for the mapreduce.jobhistory.loadedtasks.cache.size
+  property:
+  1) For every 100k of cache size, set the heap size of the Job History
+     Server to 1.2GB. For example,
+     mapreduce.jobhistory.loadedtasks.cache.size=500000, heap size=6GB.
+  2) Make sure that the cache size is larger than the number of tasks
+     required for the largest job run on the cluster. It might be a good
+     idea to set the value slightly higher (say, 20%) in order to allow
+     for job size growth.
+  </description>
 </property>
 
 <property>

diff --git a/...educe-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/CachedHistoryStorage.java b/...educe-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/CachedHistoryStorage.java
@@ -20,12 +20,16 @@
 
 import java.io.IOException;
 import java.util.Collection;
-import java.util.Collections;
-import java.util.LinkedHashMap;
 import java.util.Map;
 import java.util.SortedMap;
 import java.util.TreeMap;
 
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
+import com.google.common.cache.CacheLoader;
+import com.google.common.cache.LoadingCache;
+import com.google.common.cache.Weigher;
+import com.google.common.util.concurrent.UncheckedExecutionException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -49,9 +53,10 @@ public class CachedHistoryStorage extends AbstractService implements
     HistoryStorage {
   private static final Log LOG = LogFactory.getLog(CachedHistoryStorage.class);
 
-  private Map<JobId, Job> loadedJobCache = null;
-  // The number of loaded jobs.
+  private LoadingCache<JobId, Job> loadedJobCache = null;
   private int loadedJobCacheSize;
+  private int loadedTasksCacheSize;
+  private boolean useLoadedTasksCache;
 
   private HistoryFileManager hsManager;
 
@@ -70,17 +75,70 @@ public void serviceInit(Configuration conf) throws Exception {
 
   @SuppressWarnings("serial")
   private void createLoadedJobCache(Configuration conf) {
+    // Set property for old "loaded jobs" cache
     loadedJobCacheSize = conf.getInt(
         JHAdminConfig.MR_HISTORY_LOADED_JOB_CACHE_SIZE,
         JHAdminConfig.DEFAULT_MR_HISTORY_LOADED_JOB_CACHE_SIZE);
 
-    loadedJobCache = Collections.synchronizedMap(new LinkedHashMap<JobId, Job>(
-        loadedJobCacheSize + 1, 0.75f, true) {
+    // Check property for new "loaded tasks" cache and sanity-check it
+    useLoadedTasksCache = false;
+    try {
+      String taskSizeString = conf
+          .get(JHAdminConfig.MR_HISTORY_LOADED_TASKS_CACHE_SIZE);
+      if (taskSizeString != null) {
+        loadedTasksCacheSize = Math.max(Integer.parseInt(taskSizeString), 1);
+        useLoadedTasksCache = true;
+      }
+    } catch (NumberFormatException nfe) {
+      LOG.error("The property " +
+          JHAdminConfig.MR_HISTORY_LOADED_TASKS_CACHE_SIZE +
+          " is not an integer value.  Please set it to a positive" +
+          " integer value.");
+    }
+
+    CacheLoader<JobId, Job> loader;
+    loader = new CacheLoader<JobId, Job>() {
       @Override
-      public boolean removeEldestEntry(final Map.Entry<JobId, Job> eldest) {
-        return super.size() > loadedJobCacheSize;
+      public Job load(JobId key) throws Exception {
+        return loadJob(key);
       }
-    });
+    };
+
+    if (!useLoadedTasksCache) {
+      loadedJobCache = CacheBuilder.newBuilder()
+          .maximumSize(loadedJobCacheSize)
+          .initialCapacity(loadedJobCacheSize)
+          .concurrencyLevel(1)
+          .build(loader);
+    } else {
+      Weigher<JobId, Job> weightByTasks;
+      weightByTasks = new Weigher<JobId, Job>() {
+        /**
+         * Method for calculating Job weight by total task count.  If
+         * the total task count is greater than the size of the tasks
+         * cache, then cap it at the cache size.  This allows the cache
+         * to always hold one large job.
+         * @param key JobId object
+         * @param value Job object
+         * @return Weight of the job as calculated by total task count
+         */
+        @Override
+        public int weigh(JobId key, Job value) {
+          int taskCount = Math.min(loadedTasksCacheSize,
+              value.getTotalMaps() + value.getTotalReduces());
+          return taskCount;
+        }
+      };
+      // Keep concurrencyLevel at 1.  Otherwise, two problems:
+      // 1) The largest job that can be initially loaded is
+      //    cache size / 4.
+      // 2) Unit tests are not deterministic.
+      loadedJobCache = CacheBuilder.newBuilder()
+          .maximumWeight(loadedTasksCacheSize)
+          .weigher(weightByTasks)
+          .concurrencyLevel(1)
+          .build(loader);
+    }
   }
 
   public void refreshLoadedJobCache() {
@@ -100,52 +158,48 @@ Configuration createConf() {
   public CachedHistoryStorage() {
     super(CachedHistoryStorage.class.getName());
   }
+
+  private static class HSFileRuntimeException extends RuntimeException {
+    public HSFileRuntimeException(String message) {
+      super(message);
+    }
+  }
 
-  private Job loadJob(HistoryFileInfo fileInfo) {
-    try {
-      Job job = fileInfo.loadJob();
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("Adding " + job.getID() + " to loaded job cache");
-      }
-      // We can clobber results here, but that should be OK, because it only
-      // means that we may have two identical copies of the same job floating
-      // around for a while.
-      loadedJobCache.put(job.getID(), job);
-      return job;
-    } catch (IOException e) {
-      throw new YarnRuntimeException(
-          "Could not find/load job: " + fileInfo.getJobId(), e);
+  private Job loadJob(JobId jobId) throws RuntimeException, IOException {
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Looking for Job " + jobId);
+    }
+    HistoryFileInfo fileInfo;
+
+    fileInfo = hsManager.getFileInfo(jobId);
+    if (fileInfo == null) {
+      throw new HSFileRuntimeException("Unable to find job " + jobId);
+    } else if (fileInfo.isDeleted()) {
+      throw new HSFileRuntimeException("Cannot load deleted job " + jobId);
+    } else {
+      return fileInfo.loadJob();
     }
   }
 
   @VisibleForTesting
-  Map<JobId, Job> getLoadedJobCache() {
+  Cache<JobId, Job> getLoadedJobCache() {
     return loadedJobCache;
   }
 
   @Override
   public Job getFullJob(JobId jobId) {
-    if (LOG.isDebugEnabled()) {
-      LOG.debug("Looking for Job " + jobId);
-    }
+    Job retVal = null;
     try {
-      HistoryFileInfo fileInfo = hsManager.getFileInfo(jobId);
-      Job result = null;
-      if (fileInfo != null) {
-        result = loadedJobCache.get(jobId);
-        if (result == null) {
-          result = loadJob(fileInfo);
-        } else if(fileInfo.isDeleted()) {
-          loadedJobCache.remove(jobId);
-          result = null;
-        }
+      retVal = loadedJobCache.getUnchecked(jobId);
+    } catch (UncheckedExecutionException e) {
+      if (e.getCause() instanceof HSFileRuntimeException) {
+        LOG.error(e.getCause().getMessage());
+        return null;
       } else {
-        loadedJobCache.remove(jobId);
+        throw new YarnRuntimeException(e.getCause());
       }
-      return result;
-    } catch (IOException e) {
-      throw new YarnRuntimeException(e);
     }
+    return retVal;
   }
 
   @Override
@@ -243,4 +297,14 @@ public static JobsInfo getPartialJobs(Collection<Job> jobs, Long offset,
     }
     return allJobs;
   }
+
+  @VisibleForTesting
+  public boolean getUseLoadedTasksCache() {
+    return useLoadedTasksCache;
+  }
+
+  @VisibleForTesting
+  public int getLoadedTasksCacheSize() {
+    return loadedTasksCacheSize;
+  }
 }