Skip to content

Commit

Permalink
DRILL-3874: flattening large JSON objects uses too much direct memory…
Browse files Browse the repository at this point in the history
… - add getBufferSizeFor() to ValueVector interface - add implementations of getBufferSizeFor() for all ValueVector derivatives - add adaptive algorithm for adjusting batch size to flatten operator
  • Loading branch information
cwestin authored and parthchandra committed Oct 2, 2015
1 parent af0aff8 commit a3b27c8
Show file tree
Hide file tree
Showing 16 changed files with 311 additions and 66 deletions.
Expand Up @@ -59,6 +59,14 @@ public FieldReader getReader(){
return reader; return reader;
} }


@Override
// Returns the number of bytes of buffer space needed to hold valueCount
// values in this fixed-width vector. This is a FreeMarker template:
// ${type.width} expands to the per-value byte width of the generated type.
public int getBufferSizeFor(final int valueCount) {
if (valueCount == 0) {
// No values require no backing buffer.
return 0;
}
// Fixed-width storage: every value occupies exactly ${type.width} bytes.
return valueCount * ${type.width};
}

@Override @Override
public int getValueCapacity(){ public int getValueCapacity(){
return (int) (data.capacity() *1.0 / ${type.width}); return (int) (data.capacity() *1.0 / ${type.width});
Expand Down
Expand Up @@ -102,6 +102,16 @@ public int getBufferSize(){
return values.getBufferSize() + bits.getBufferSize(); return values.getBufferSize() + bits.getBufferSize();
} }


@Override
public int getBufferSizeFor(final int valueCount) {
// An empty vector needs no backing storage at all.
if (valueCount == 0) {
return 0;
}

// A nullable vector is backed by two buffers: the values buffer and
// the validity ("bits") buffer; the total is the sum of both.
final int valuesBytes = values.getBufferSizeFor(valueCount);
final int validityBytes = bits.getBufferSizeFor(valueCount);
return valuesBytes + validityBytes;
}

@Override @Override
public DrillBuf getBuffer() { public DrillBuf getBuffer() {
return values.getBuffer(); return values.getBuffer();
Expand Down
Expand Up @@ -88,6 +88,16 @@ public int getBufferSize(){
return offsetVector.getBufferSize() + data.writerIndex(); return offsetVector.getBufferSize() + data.writerIndex();
} }


@Override
public int getBufferSizeFor(final int valueCount) {
// An empty vector needs no backing storage at all.
if (valueCount == 0) {
return 0;
}

// The offset entry at index valueCount marks the end of the last value's
// data, i.e. the total number of data bytes used by valueCount values.
final int dataBytes = offsetVector.getAccessor().get(valueCount);
// valueCount variable-width values require valueCount + 1 offset entries.
final int offsetBytes = offsetVector.getBufferSizeFor(valueCount + 1);
return offsetBytes + dataBytes;
}

@Override @Override
public int getValueCapacity(){ public int getValueCapacity(){
return Math.max(offsetVector.getValueCapacity() - 1, 0); return Math.max(offsetVector.getValueCapacity() - 1, 0);
Expand Down Expand Up @@ -302,6 +312,7 @@ public boolean allocateNewSafe() {
try { try {
final int requestedSize = (int)curAllocationSize; final int requestedSize = (int)curAllocationSize;
data = allocator.buffer(requestedSize); data = allocator.buffer(requestedSize);
allocationSizeInBytes = requestedSize;
offsetVector.allocateNew(); offsetVector.allocateNew();
} catch (OutOfMemoryRuntimeException e) { } catch (OutOfMemoryRuntimeException e) {
clear(); clear();
Expand Down
Expand Up @@ -51,18 +51,6 @@ public class TopLevelAllocator implements BufferAllocator {
private final DrillBuf empty; private final DrillBuf empty;
private final DrillConfig config; private final DrillConfig config;


/* TODO(cwestin) remove
@Deprecated
TopLevelAllocator() {
this(DrillConfig.getMaxDirectMemory());
}
@Deprecated
TopLevelAllocator(long maximumAllocation) {
this(null, maximumAllocation, true);
}
*/

private TopLevelAllocator(DrillConfig config, long maximumAllocation, boolean errorOnLeak){ private TopLevelAllocator(DrillConfig config, long maximumAllocation, boolean errorOnLeak){
MAXIMUM_DIRECT_MEMORY = maximumAllocation; MAXIMUM_DIRECT_MEMORY = maximumAllocation;
this.config=(config!=null) ? config : DrillConfig.create(); this.config=(config!=null) ? config : DrillConfig.create();
Expand Down
Expand Up @@ -21,17 +21,16 @@
import java.util.List; import java.util.List;


import com.carrotsearch.hppc.IntOpenHashSet; import com.carrotsearch.hppc.IntOpenHashSet;

import org.apache.drill.common.exceptions.UserException; import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.expression.ErrorCollector; import org.apache.drill.common.expression.ErrorCollector;
import org.apache.drill.common.expression.ErrorCollectorImpl; import org.apache.drill.common.expression.ErrorCollectorImpl;
import org.apache.drill.common.expression.FieldReference; import org.apache.drill.common.expression.FieldReference;
import org.apache.drill.common.expression.LogicalExpression; import org.apache.drill.common.expression.LogicalExpression;
import org.apache.drill.common.expression.PathSegment;
import org.apache.drill.common.logical.data.NamedExpression; import org.apache.drill.common.logical.data.NamedExpression;
import org.apache.drill.exec.exception.ClassTransformationException; import org.apache.drill.exec.exception.ClassTransformationException;
import org.apache.drill.exec.exception.SchemaChangeException; import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.expr.ClassGenerator; import org.apache.drill.exec.expr.ClassGenerator;
import org.apache.drill.exec.expr.ClassGenerator.HoldingContainer;
import org.apache.drill.exec.expr.CodeGenerator; import org.apache.drill.exec.expr.CodeGenerator;
import org.apache.drill.exec.expr.DrillFuncHolderExpr; import org.apache.drill.exec.expr.DrillFuncHolderExpr;
import org.apache.drill.exec.expr.ExpressionTreeMaterializer; import org.apache.drill.exec.expr.ExpressionTreeMaterializer;
Expand Down Expand Up @@ -61,29 +60,32 @@
// TODO - handle the case where a user tries to flatten a scalar, should just act as a project all of the columns exactly // TODO - handle the case where a user tries to flatten a scalar, should just act as a project all of the columns exactly
// as they come in // as they come in
public class FlattenRecordBatch extends AbstractSingleRecordBatch<FlattenPOP> { public class FlattenRecordBatch extends AbstractSingleRecordBatch<FlattenPOP> {
static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(FlattenRecordBatch.class); private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(FlattenRecordBatch.class);


private Flattener flattener; private Flattener flattener;
private List<ValueVector> allocationVectors; private List<ValueVector> allocationVectors;
private List<ComplexWriter> complexWriters; private List<ComplexWriter> complexWriters;
private boolean hasRemainder = false; private boolean hasRemainder = false;
private int remainderIndex = 0; private int remainderIndex = 0;
private int recordCount; private int recordCount;
// the buildSchema method is always called first by a short circuit path to return schema information to the client
// this information is not entirely accurate as Drill determines schema on the fly, so here it needs to have modified private final Flattener.Monitor monitor = new Flattener.Monitor() {
// behavior for that call to setup the schema for the flatten operation @Override
private boolean fastSchemaCalled; public int getBufferSizeFor(int recordCount) {
int bufferSize = 0;
for(final ValueVector vv : allocationVectors) {
bufferSize += vv.getBufferSizeFor(recordCount);
}
return bufferSize;
}
};


private static final String EMPTY_STRING = ""; private static final String EMPTY_STRING = "";


private class ClassifierResult { private class ClassifierResult {
public boolean isStar = false;
public List<String> outputNames; public List<String> outputNames;
public String prefix = "";


private void clear() { private void clear() {
isStar = false;
prefix = "";
if (outputNames != null) { if (outputNames != null) {
outputNames.clear(); outputNames.clear();
} }
Expand All @@ -94,7 +96,6 @@ private void clear() {


public FlattenRecordBatch(FlattenPOP pop, RecordBatch incoming, FragmentContext context) throws OutOfMemoryException { public FlattenRecordBatch(FlattenPOP pop, RecordBatch incoming, FragmentContext context) throws OutOfMemoryException {
super(pop, context, incoming); super(pop, context, incoming);
fastSchemaCalled = false;
} }


@Override @Override
Expand Down Expand Up @@ -150,7 +151,7 @@ protected IterOutcome doWork() {
setFlattenVector(); setFlattenVector();


int childCount = incomingRecordCount == 0 ? 0 : flattener.getFlattenField().getAccessor().getInnerValueCount(); int childCount = incomingRecordCount == 0 ? 0 : flattener.getFlattenField().getAccessor().getInnerValueCount();
int outputRecords = flattener.flattenRecords(incomingRecordCount, 0); int outputRecords = flattener.flattenRecords(incomingRecordCount, 0, monitor);
// TODO - change this to be based on the repeated vector length // TODO - change this to be based on the repeated vector length
if (outputRecords < childCount) { if (outputRecords < childCount) {
setValueCount(outputRecords); setValueCount(outputRecords);
Expand Down Expand Up @@ -181,7 +182,7 @@ private void handleRemainder() {
return; return;
} }


int projRecords = flattener.flattenRecords(remainingRecordCount, 0); int projRecords = flattener.flattenRecords(remainingRecordCount, 0, monitor);
if (projRecords < remainingRecordCount) { if (projRecords < remainingRecordCount) {
setValueCount(projRecords); setValueCount(projRecords);
this.recordCount = projRecords; this.recordCount = projRecords;
Expand Down Expand Up @@ -243,9 +244,7 @@ private void setValueCount(int count) {
} }


private FieldReference getRef(NamedExpression e) { private FieldReference getRef(NamedExpression e) {
FieldReference ref = e.getRef(); final FieldReference ref = e.getRef();
PathSegment seg = ref.getRootSegment();

return ref; return ref;
} }


Expand All @@ -261,7 +260,7 @@ private FieldReference getRef(NamedExpression e) {
*/ */
private TransferPair getFlattenFieldTransferPair(FieldReference reference) { private TransferPair getFlattenFieldTransferPair(FieldReference reference) {
final TypedFieldId fieldId = incoming.getValueVectorId(popConfig.getColumn()); final TypedFieldId fieldId = incoming.getValueVectorId(popConfig.getColumn());
final Class vectorClass = incoming.getSchema().getColumn(fieldId.getFieldIds()[0]).getValueClass(); final Class<?> vectorClass = incoming.getSchema().getColumn(fieldId.getFieldIds()[0]).getValueClass();
final ValueVector flattenField = incoming.getValueAccessorById(vectorClass, fieldId.getFieldIds()).getValueVector(); final ValueVector flattenField = incoming.getValueAccessorById(vectorClass, fieldId.getFieldIds()).getValueVector();


TransferPair tp = null; TransferPair tp = null;
Expand Down Expand Up @@ -338,7 +337,7 @@ protected boolean setupNewSchema() throws SchemaChangeException {
allocationVectors.add(vector); allocationVectors.add(vector);
TypedFieldId fid = container.add(vector); TypedFieldId fid = container.add(vector);
ValueVectorWriteExpression write = new ValueVectorWriteExpression(fid, expr, true); ValueVectorWriteExpression write = new ValueVectorWriteExpression(fid, expr, true);
HoldingContainer hc = cg.addExpr(write); cg.addExpr(write);


logger.debug("Added eval for project expression."); logger.debug("Added eval for project expression.");
} }
Expand Down Expand Up @@ -369,5 +368,4 @@ private List<NamedExpression> getExpressionList() {
} }
return exprs; return exprs;
} }

} }

0 comments on commit a3b27c8

Please sign in to comment.